2375 lines
68 KiB
JSON
2375 lines
68 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 6.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 234,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 1.8042294681072235,
|
||
|
|
"epoch": 0.025806451612903226,
|
||
|
|
"grad_norm": 1.166382074356079,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 2.5975,
|
||
|
|
"mean_token_accuracy": 0.4834420457482338,
|
||
|
|
"num_tokens": 1533.0,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.8224012553691864,
|
||
|
|
"epoch": 0.05161290322580645,
|
||
|
|
"grad_norm": 1.568097472190857,
|
||
|
|
"learning_rate": 8.333333333333334e-06,
|
||
|
|
"loss": 2.6194,
|
||
|
|
"mean_token_accuracy": 0.5228946506977081,
|
||
|
|
"num_tokens": 2447.0,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.1347350478172302,
|
||
|
|
"epoch": 0.07741935483870968,
|
||
|
|
"grad_norm": 1.6636226177215576,
|
||
|
|
"learning_rate": 1.6666666666666667e-05,
|
||
|
|
"loss": 3.1216,
|
||
|
|
"mean_token_accuracy": 0.4500608742237091,
|
||
|
|
"num_tokens": 3252.0,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.042035460472107,
|
||
|
|
"epoch": 0.1032258064516129,
|
||
|
|
"grad_norm": 1.8585174083709717,
|
||
|
|
"learning_rate": 2.5e-05,
|
||
|
|
"loss": 3.0927,
|
||
|
|
"mean_token_accuracy": 0.434286504983902,
|
||
|
|
"num_tokens": 3990.0,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.0793383419513702,
|
||
|
|
"epoch": 0.12903225806451613,
|
||
|
|
"grad_norm": 2.271517753601074,
|
||
|
|
"learning_rate": 3.3333333333333335e-05,
|
||
|
|
"loss": 3.1323,
|
||
|
|
"mean_token_accuracy": 0.44490282237529755,
|
||
|
|
"num_tokens": 4623.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.078058958053589,
|
||
|
|
"epoch": 0.15483870967741936,
|
||
|
|
"grad_norm": 2.0911874771118164,
|
||
|
|
"learning_rate": 4.166666666666667e-05,
|
||
|
|
"loss": 3.0791,
|
||
|
|
"mean_token_accuracy": 0.4434494748711586,
|
||
|
|
"num_tokens": 5202.0,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9296036958694458,
|
||
|
|
"epoch": 0.18064516129032257,
|
||
|
|
"grad_norm": 2.447918176651001,
|
||
|
|
"learning_rate": 5e-05,
|
||
|
|
"loss": 2.9283,
|
||
|
|
"mean_token_accuracy": 0.5010824277997017,
|
||
|
|
"num_tokens": 5738.0,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.1553411781787872,
|
||
|
|
"epoch": 0.2064516129032258,
|
||
|
|
"grad_norm": 2.70611572265625,
|
||
|
|
"learning_rate": 5.833333333333334e-05,
|
||
|
|
"loss": 2.8435,
|
||
|
|
"mean_token_accuracy": 0.498832605779171,
|
||
|
|
"num_tokens": 6235.0,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.148306369781494,
|
||
|
|
"epoch": 0.23225806451612904,
|
||
|
|
"grad_norm": 2.3149070739746094,
|
||
|
|
"learning_rate": 6.666666666666667e-05,
|
||
|
|
"loss": 2.8677,
|
||
|
|
"mean_token_accuracy": 0.46573129296302795,
|
||
|
|
"num_tokens": 6703.0,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9346267580986023,
|
||
|
|
"epoch": 0.25806451612903225,
|
||
|
|
"grad_norm": 1.3574178218841553,
|
||
|
|
"learning_rate": 7.500000000000001e-05,
|
||
|
|
"loss": 2.4543,
|
||
|
|
"mean_token_accuracy": 0.5017582848668098,
|
||
|
|
"num_tokens": 8003.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.2560064792633057,
|
||
|
|
"epoch": 0.2838709677419355,
|
||
|
|
"grad_norm": 1.4286997318267822,
|
||
|
|
"learning_rate": 8.333333333333334e-05,
|
||
|
|
"loss": 2.4076,
|
||
|
|
"mean_token_accuracy": 0.516123816370964,
|
||
|
|
"num_tokens": 8830.0,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.271284520626068,
|
||
|
|
"epoch": 0.3096774193548387,
|
||
|
|
"grad_norm": 1.289847493171692,
|
||
|
|
"learning_rate": 9.166666666666667e-05,
|
||
|
|
"loss": 2.2502,
|
||
|
|
"mean_token_accuracy": 0.581367239356041,
|
||
|
|
"num_tokens": 9586.0,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.506469488143921,
|
||
|
|
"epoch": 0.33548387096774196,
|
||
|
|
"grad_norm": 1.698026418685913,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 2.5559,
|
||
|
|
"mean_token_accuracy": 0.5279825925827026,
|
||
|
|
"num_tokens": 10255.0,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.488889992237091,
|
||
|
|
"epoch": 0.36129032258064514,
|
||
|
|
"grad_norm": 2.1104917526245117,
|
||
|
|
"learning_rate": 9.999827315381885e-05,
|
||
|
|
"loss": 2.3051,
|
||
|
|
"mean_token_accuracy": 0.5456234812736511,
|
||
|
|
"num_tokens": 10842.0,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.494838774204254,
|
||
|
|
"epoch": 0.3870967741935484,
|
||
|
|
"grad_norm": 1.7446825504302979,
|
||
|
|
"learning_rate": 9.999309273455528e-05,
|
||
|
|
"loss": 2.1948,
|
||
|
|
"mean_token_accuracy": 0.5685414522886276,
|
||
|
|
"num_tokens": 11363.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.623446822166443,
|
||
|
|
"epoch": 0.4129032258064516,
|
||
|
|
"grad_norm": 1.934134840965271,
|
||
|
|
"learning_rate": 9.998445910004082e-05,
|
||
|
|
"loss": 2.2624,
|
||
|
|
"mean_token_accuracy": 0.5481147766113281,
|
||
|
|
"num_tokens": 11819.0,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.3205150961875916,
|
||
|
|
"epoch": 0.43870967741935485,
|
||
|
|
"grad_norm": 1.6750158071517944,
|
||
|
|
"learning_rate": 9.997237284663379e-05,
|
||
|
|
"loss": 1.8547,
|
||
|
|
"mean_token_accuracy": 0.6086297482252121,
|
||
|
|
"num_tokens": 12247.0,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.435093104839325,
|
||
|
|
"epoch": 0.4645161290322581,
|
||
|
|
"grad_norm": 1.8602609634399414,
|
||
|
|
"learning_rate": 9.995683480917821e-05,
|
||
|
|
"loss": 2.1032,
|
||
|
|
"mean_token_accuracy": 0.5650125294923782,
|
||
|
|
"num_tokens": 12646.0,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.1141549050807953,
|
||
|
|
"epoch": 0.49032258064516127,
|
||
|
|
"grad_norm": 0.9358610510826111,
|
||
|
|
"learning_rate": 9.993784606094612e-05,
|
||
|
|
"loss": 1.9903,
|
||
|
|
"mean_token_accuracy": 0.5407712012529373,
|
||
|
|
"num_tokens": 14509.0,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.083885967731476,
|
||
|
|
"epoch": 0.5161290322580645,
|
||
|
|
"grad_norm": 1.1308526992797852,
|
||
|
|
"learning_rate": 9.991540791356342e-05,
|
||
|
|
"loss": 1.8726,
|
||
|
|
"mean_token_accuracy": 0.5599013864994049,
|
||
|
|
"num_tokens": 15617.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.3853049874305725,
|
||
|
|
"epoch": 0.5419354838709678,
|
||
|
|
"grad_norm": 1.350138545036316,
|
||
|
|
"learning_rate": 9.988952191691925e-05,
|
||
|
|
"loss": 2.251,
|
||
|
|
"mean_token_accuracy": 0.5332682132720947,
|
||
|
|
"num_tokens": 16449.0,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.1798684000968933,
|
||
|
|
"epoch": 0.567741935483871,
|
||
|
|
"grad_norm": 1.3853743076324463,
|
||
|
|
"learning_rate": 9.986018985905901e-05,
|
||
|
|
"loss": 1.9656,
|
||
|
|
"mean_token_accuracy": 0.5732992142438889,
|
||
|
|
"num_tokens": 17216.0,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.2904029488563538,
|
||
|
|
"epoch": 0.5935483870967742,
|
||
|
|
"grad_norm": 2.5513713359832764,
|
||
|
|
"learning_rate": 9.982741376606078e-05,
|
||
|
|
"loss": 2.1948,
|
||
|
|
"mean_token_accuracy": 0.5600379034876823,
|
||
|
|
"num_tokens": 17868.0,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.961841881275177,
|
||
|
|
"epoch": 0.6193548387096774,
|
||
|
|
"grad_norm": 1.9767720699310303,
|
||
|
|
"learning_rate": 9.97911959018954e-05,
|
||
|
|
"loss": 1.9528,
|
||
|
|
"mean_token_accuracy": 0.5889081507921219,
|
||
|
|
"num_tokens": 18439.0,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.061126083135605,
|
||
|
|
"epoch": 0.6451612903225806,
|
||
|
|
"grad_norm": 1.8903456926345825,
|
||
|
|
"learning_rate": 9.975153876827008e-05,
|
||
|
|
"loss": 1.9973,
|
||
|
|
"mean_token_accuracy": 0.5782413184642792,
|
||
|
|
"num_tokens": 18947.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.953830897808075,
|
||
|
|
"epoch": 0.6709677419354839,
|
||
|
|
"grad_norm": 2.247823715209961,
|
||
|
|
"learning_rate": 9.97084451044556e-05,
|
||
|
|
"loss": 1.8999,
|
||
|
|
"mean_token_accuracy": 0.5786410048604012,
|
||
|
|
"num_tokens": 19410.0,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.8129592537879944,
|
||
|
|
"epoch": 0.6967741935483871,
|
||
|
|
"grad_norm": 2.3078598976135254,
|
||
|
|
"learning_rate": 9.966191788709716e-05,
|
||
|
|
"loss": 1.6035,
|
||
|
|
"mean_token_accuracy": 0.6230615079402924,
|
||
|
|
"num_tokens": 19831.0,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9399387836456299,
|
||
|
|
"epoch": 0.7225806451612903,
|
||
|
|
"grad_norm": 1.3792117834091187,
|
||
|
|
"learning_rate": 9.961196033000861e-05,
|
||
|
|
"loss": 1.9753,
|
||
|
|
"mean_token_accuracy": 0.5892214328050613,
|
||
|
|
"num_tokens": 20970.0,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.8130147755146027,
|
||
|
|
"epoch": 0.7483870967741936,
|
||
|
|
"grad_norm": 1.5490132570266724,
|
||
|
|
"learning_rate": 9.955857588395065e-05,
|
||
|
|
"loss": 1.7023,
|
||
|
|
"mean_token_accuracy": 0.6110316589474678,
|
||
|
|
"num_tokens": 21755.0,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.077410489320755,
|
||
|
|
"epoch": 0.7741935483870968,
|
||
|
|
"grad_norm": 1.8052752017974854,
|
||
|
|
"learning_rate": 9.950176823639233e-05,
|
||
|
|
"loss": 1.9752,
|
||
|
|
"mean_token_accuracy": 0.6064967960119247,
|
||
|
|
"num_tokens": 22504.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9274516999721527,
|
||
|
|
"epoch": 0.8,
|
||
|
|
"grad_norm": 1.9139018058776855,
|
||
|
|
"learning_rate": 9.944154131125642e-05,
|
||
|
|
"loss": 2.0548,
|
||
|
|
"mean_token_accuracy": 0.5636427998542786,
|
||
|
|
"num_tokens": 23183.0,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9550862610340118,
|
||
|
|
"epoch": 0.8258064516129032,
|
||
|
|
"grad_norm": 1.9849357604980469,
|
||
|
|
"learning_rate": 9.937789926864838e-05,
|
||
|
|
"loss": 1.8553,
|
||
|
|
"mean_token_accuracy": 0.5807601362466812,
|
||
|
|
"num_tokens": 23774.0,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.8781771957874298,
|
||
|
|
"epoch": 0.8516129032258064,
|
||
|
|
"grad_norm": 2.0134923458099365,
|
||
|
|
"learning_rate": 9.931084650456892e-05,
|
||
|
|
"loss": 1.7917,
|
||
|
|
"mean_token_accuracy": 0.6070037335157394,
|
||
|
|
"num_tokens": 24313.0,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.897193729877472,
|
||
|
|
"epoch": 0.8774193548387097,
|
||
|
|
"grad_norm": 2.607464551925659,
|
||
|
|
"learning_rate": 9.924038765061042e-05,
|
||
|
|
"loss": 1.7723,
|
||
|
|
"mean_token_accuracy": 0.6191761344671249,
|
||
|
|
"num_tokens": 24779.0,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.7519680559635162,
|
||
|
|
"epoch": 0.9032258064516129,
|
||
|
|
"grad_norm": 2.4835267066955566,
|
||
|
|
"learning_rate": 9.916652757363698e-05,
|
||
|
|
"loss": 1.5883,
|
||
|
|
"mean_token_accuracy": 0.6609883904457092,
|
||
|
|
"num_tokens": 25211.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9292193055152893,
|
||
|
|
"epoch": 0.9290322580645162,
|
||
|
|
"grad_norm": 2.3735604286193848,
|
||
|
|
"learning_rate": 9.90892713754483e-05,
|
||
|
|
"loss": 1.8049,
|
||
|
|
"mean_token_accuracy": 0.5980570763349533,
|
||
|
|
"num_tokens": 25599.0,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.881245195865631,
|
||
|
|
"epoch": 0.9548387096774194,
|
||
|
|
"grad_norm": 1.9849742650985718,
|
||
|
|
"learning_rate": 9.900862439242719e-05,
|
||
|
|
"loss": 1.7902,
|
||
|
|
"mean_token_accuracy": 0.5820632129907608,
|
||
|
|
"num_tokens": 26408.0,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 2.113930821418762,
|
||
|
|
"epoch": 0.9806451612903225,
|
||
|
|
"grad_norm": 3.527271270751953,
|
||
|
|
"learning_rate": 9.892459219517108e-05,
|
||
|
|
"loss": 2.2025,
|
||
|
|
"mean_token_accuracy": 0.5260728523135185,
|
||
|
|
"num_tokens": 27021.0,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.7831549247105916,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 2.5327165126800537,
|
||
|
|
"learning_rate": 9.883718058810707e-05,
|
||
|
|
"loss": 1.4478,
|
||
|
|
"mean_token_accuracy": 0.6935366789499918,
|
||
|
|
"num_tokens": 27353.0,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.797234058380127,
|
||
|
|
"epoch": 1.0258064516129033,
|
||
|
|
"grad_norm": 1.3197723627090454,
|
||
|
|
"learning_rate": 9.874639560909117e-05,
|
||
|
|
"loss": 1.8934,
|
||
|
|
"mean_token_accuracy": 0.5857948064804077,
|
||
|
|
"num_tokens": 28829.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.9161739647388458,
|
||
|
|
"epoch": 1.0516129032258064,
|
||
|
|
"grad_norm": 1.5616050958633423,
|
||
|
|
"learning_rate": 9.865224352899119e-05,
|
||
|
|
"loss": 1.7257,
|
||
|
|
"mean_token_accuracy": 0.6109496206045151,
|
||
|
|
"num_tokens": 29650.0,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.8019072711467743,
|
||
|
|
"epoch": 1.0774193548387097,
|
||
|
|
"grad_norm": 1.8876160383224487,
|
||
|
|
"learning_rate": 9.85547308512535e-05,
|
||
|
|
"loss": 1.8085,
|
||
|
|
"mean_token_accuracy": 0.5969990640878677,
|
||
|
|
"num_tokens": 30359.0,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.7833741307258606,
|
||
|
|
"epoch": 1.103225806451613,
|
||
|
|
"grad_norm": 2.0070252418518066,
|
||
|
|
"learning_rate": 9.84538643114539e-05,
|
||
|
|
"loss": 1.6704,
|
||
|
|
"mean_token_accuracy": 0.5969647467136383,
|
||
|
|
"num_tokens": 30961.0,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.7372365295886993,
|
||
|
|
"epoch": 1.129032258064516,
|
||
|
|
"grad_norm": 1.8577375411987305,
|
||
|
|
"learning_rate": 9.834965087683236e-05,
|
||
|
|
"loss": 1.6159,
|
||
|
|
"mean_token_accuracy": 0.6475881487131119,
|
||
|
|
"num_tokens": 31527.0,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.636292964220047,
|
||
|
|
"epoch": 1.1548387096774193,
|
||
|
|
"grad_norm": 1.8432772159576416,
|
||
|
|
"learning_rate": 9.824209774581174e-05,
|
||
|
|
"loss": 1.5197,
|
||
|
|
"mean_token_accuracy": 0.6530560553073883,
|
||
|
|
"num_tokens": 32050.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6075344681739807,
|
||
|
|
"epoch": 1.1806451612903226,
|
||
|
|
"grad_norm": 1.869754672050476,
|
||
|
|
"learning_rate": 9.81312123475006e-05,
|
||
|
|
"loss": 1.3557,
|
||
|
|
"mean_token_accuracy": 0.6470372080802917,
|
||
|
|
"num_tokens": 32532.0,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6146334111690521,
|
||
|
|
"epoch": 1.206451612903226,
|
||
|
|
"grad_norm": 2.099989175796509,
|
||
|
|
"learning_rate": 9.801700234117999e-05,
|
||
|
|
"loss": 1.2998,
|
||
|
|
"mean_token_accuracy": 0.6936827301979065,
|
||
|
|
"num_tokens": 32967.0,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.7050741314888,
|
||
|
|
"epoch": 1.232258064516129,
|
||
|
|
"grad_norm": 2.504159688949585,
|
||
|
|
"learning_rate": 9.789947561577445e-05,
|
||
|
|
"loss": 1.5017,
|
||
|
|
"mean_token_accuracy": 0.622559979557991,
|
||
|
|
"num_tokens": 33363.0,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6869353950023651,
|
||
|
|
"epoch": 1.2580645161290323,
|
||
|
|
"grad_norm": 1.2886877059936523,
|
||
|
|
"learning_rate": 9.777864028930705e-05,
|
||
|
|
"loss": 1.6731,
|
||
|
|
"mean_token_accuracy": 0.6039082556962967,
|
||
|
|
"num_tokens": 35015.0,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6093480288982391,
|
||
|
|
"epoch": 1.2838709677419355,
|
||
|
|
"grad_norm": 1.6378092765808105,
|
||
|
|
"learning_rate": 9.765450470833865e-05,
|
||
|
|
"loss": 1.4894,
|
||
|
|
"mean_token_accuracy": 0.6367563456296921,
|
||
|
|
"num_tokens": 35999.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6687067151069641,
|
||
|
|
"epoch": 1.3096774193548386,
|
||
|
|
"grad_norm": 1.8195027112960815,
|
||
|
|
"learning_rate": 9.752707744739145e-05,
|
||
|
|
"loss": 1.5385,
|
||
|
|
"mean_token_accuracy": 0.6437539905309677,
|
||
|
|
"num_tokens": 36850.0,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4987359642982483,
|
||
|
|
"epoch": 1.335483870967742,
|
||
|
|
"grad_norm": 1.8060271739959717,
|
||
|
|
"learning_rate": 9.73963673083566e-05,
|
||
|
|
"loss": 1.3978,
|
||
|
|
"mean_token_accuracy": 0.661731407046318,
|
||
|
|
"num_tokens": 37604.0,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5831853449344635,
|
||
|
|
"epoch": 1.3612903225806452,
|
||
|
|
"grad_norm": 2.213078260421753,
|
||
|
|
"learning_rate": 9.726238331988624e-05,
|
||
|
|
"loss": 1.7863,
|
||
|
|
"mean_token_accuracy": 0.6147271245718002,
|
||
|
|
"num_tokens": 38314.0,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5708496272563934,
|
||
|
|
"epoch": 1.3870967741935485,
|
||
|
|
"grad_norm": 3.098945140838623,
|
||
|
|
"learning_rate": 9.712513473676996e-05,
|
||
|
|
"loss": 1.6752,
|
||
|
|
"mean_token_accuracy": 0.6371889561414719,
|
||
|
|
"num_tokens": 38941.0,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4293319284915924,
|
||
|
|
"epoch": 1.4129032258064516,
|
||
|
|
"grad_norm": 2.6225318908691406,
|
||
|
|
"learning_rate": 9.698463103929542e-05,
|
||
|
|
"loss": 1.5132,
|
||
|
|
"mean_token_accuracy": 0.6733423620462418,
|
||
|
|
"num_tokens": 39485.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4221723973751068,
|
||
|
|
"epoch": 1.4387096774193548,
|
||
|
|
"grad_norm": 2.834839105606079,
|
||
|
|
"learning_rate": 9.684088193259355e-05,
|
||
|
|
"loss": 1.4956,
|
||
|
|
"mean_token_accuracy": 0.6675658673048019,
|
||
|
|
"num_tokens": 39954.0,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.3391860723495483,
|
||
|
|
"epoch": 1.4645161290322581,
|
||
|
|
"grad_norm": 2.185546398162842,
|
||
|
|
"learning_rate": 9.669389734596819e-05,
|
||
|
|
"loss": 1.1981,
|
||
|
|
"mean_token_accuracy": 0.7050470858812332,
|
||
|
|
"num_tokens": 40374.0,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6070669293403625,
|
||
|
|
"epoch": 1.4903225806451612,
|
||
|
|
"grad_norm": 1.3461191654205322,
|
||
|
|
"learning_rate": 9.654368743221022e-05,
|
||
|
|
"loss": 1.6617,
|
||
|
|
"mean_token_accuracy": 0.5980251729488373,
|
||
|
|
"num_tokens": 42027.0,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6520465910434723,
|
||
|
|
"epoch": 1.5161290322580645,
|
||
|
|
"grad_norm": 1.6961472034454346,
|
||
|
|
"learning_rate": 9.639026256689628e-05,
|
||
|
|
"loss": 1.577,
|
||
|
|
"mean_token_accuracy": 0.6316726058721542,
|
||
|
|
"num_tokens": 42916.0,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.7670880556106567,
|
||
|
|
"epoch": 1.5419354838709678,
|
||
|
|
"grad_norm": 2.0527658462524414,
|
||
|
|
"learning_rate": 9.623363334767208e-05,
|
||
|
|
"loss": 1.7517,
|
||
|
|
"mean_token_accuracy": 0.6005731225013733,
|
||
|
|
"num_tokens": 43719.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5744120478630066,
|
||
|
|
"epoch": 1.567741935483871,
|
||
|
|
"grad_norm": 2.1162519454956055,
|
||
|
|
"learning_rate": 9.607381059352038e-05,
|
||
|
|
"loss": 1.5544,
|
||
|
|
"mean_token_accuracy": 0.6523573398590088,
|
||
|
|
"num_tokens": 44493.0,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.728984385728836,
|
||
|
|
"epoch": 1.5935483870967742,
|
||
|
|
"grad_norm": 2.0401268005371094,
|
||
|
|
"learning_rate": 9.591080534401371e-05,
|
||
|
|
"loss": 1.699,
|
||
|
|
"mean_token_accuracy": 0.6030448973178864,
|
||
|
|
"num_tokens": 45170.0,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5222464203834534,
|
||
|
|
"epoch": 1.6193548387096774,
|
||
|
|
"grad_norm": 2.430859327316284,
|
||
|
|
"learning_rate": 9.574462885855174e-05,
|
||
|
|
"loss": 1.2944,
|
||
|
|
"mean_token_accuracy": 0.6946325898170471,
|
||
|
|
"num_tokens": 45755.0,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.528793841600418,
|
||
|
|
"epoch": 1.6451612903225805,
|
||
|
|
"grad_norm": 2.3277854919433594,
|
||
|
|
"learning_rate": 9.557529261558367e-05,
|
||
|
|
"loss": 1.3969,
|
||
|
|
"mean_token_accuracy": 0.6722464263439178,
|
||
|
|
"num_tokens": 46268.0,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.6062091886997223,
|
||
|
|
"epoch": 1.6709677419354838,
|
||
|
|
"grad_norm": 2.8640811443328857,
|
||
|
|
"learning_rate": 9.540280831181525e-05,
|
||
|
|
"loss": 1.3636,
|
||
|
|
"mean_token_accuracy": 0.6864263862371445,
|
||
|
|
"num_tokens": 46737.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.336740493774414,
|
||
|
|
"epoch": 1.696774193548387,
|
||
|
|
"grad_norm": 2.5550613403320312,
|
||
|
|
"learning_rate": 9.522718786140097e-05,
|
||
|
|
"loss": 1.0106,
|
||
|
|
"mean_token_accuracy": 0.7365925908088684,
|
||
|
|
"num_tokens": 47163.0,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.789841502904892,
|
||
|
|
"epoch": 1.7225806451612904,
|
||
|
|
"grad_norm": 1.9967743158340454,
|
||
|
|
"learning_rate": 9.504844339512095e-05,
|
||
|
|
"loss": 1.715,
|
||
|
|
"mean_token_accuracy": 0.614040270447731,
|
||
|
|
"num_tokens": 48108.0,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5481957495212555,
|
||
|
|
"epoch": 1.7483870967741937,
|
||
|
|
"grad_norm": 1.912815809249878,
|
||
|
|
"learning_rate": 9.486658725954321e-05,
|
||
|
|
"loss": 1.3063,
|
||
|
|
"mean_token_accuracy": 0.6685247123241425,
|
||
|
|
"num_tokens": 48901.0,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.618812471628189,
|
||
|
|
"epoch": 1.7741935483870968,
|
||
|
|
"grad_norm": 2.1326448917388916,
|
||
|
|
"learning_rate": 9.468163201617062e-05,
|
||
|
|
"loss": 1.4826,
|
||
|
|
"mean_token_accuracy": 0.6648016273975372,
|
||
|
|
"num_tokens": 49668.0,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4738461375236511,
|
||
|
|
"epoch": 1.8,
|
||
|
|
"grad_norm": 2.2856757640838623,
|
||
|
|
"learning_rate": 9.449359044057345e-05,
|
||
|
|
"loss": 1.5099,
|
||
|
|
"mean_token_accuracy": 0.6307590007781982,
|
||
|
|
"num_tokens": 50353.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.42239710688591,
|
||
|
|
"epoch": 1.8258064516129031,
|
||
|
|
"grad_norm": 2.272261381149292,
|
||
|
|
"learning_rate": 9.430247552150673e-05,
|
||
|
|
"loss": 1.4451,
|
||
|
|
"mean_token_accuracy": 0.6698804646730423,
|
||
|
|
"num_tokens": 50954.0,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5603100061416626,
|
||
|
|
"epoch": 1.8516129032258064,
|
||
|
|
"grad_norm": 2.444957971572876,
|
||
|
|
"learning_rate": 9.410830046001321e-05,
|
||
|
|
"loss": 1.5631,
|
||
|
|
"mean_token_accuracy": 0.6537315994501114,
|
||
|
|
"num_tokens": 51493.0,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.421448290348053,
|
||
|
|
"epoch": 1.8774193548387097,
|
||
|
|
"grad_norm": 2.62430477142334,
|
||
|
|
"learning_rate": 9.391107866851143e-05,
|
||
|
|
"loss": 1.442,
|
||
|
|
"mean_token_accuracy": 0.6888918429613113,
|
||
|
|
"num_tokens": 51976.0,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.3042734861373901,
|
||
|
|
"epoch": 1.903225806451613,
|
||
|
|
"grad_norm": 2.522318124771118,
|
||
|
|
"learning_rate": 9.371082376986928e-05,
|
||
|
|
"loss": 1.2438,
|
||
|
|
"mean_token_accuracy": 0.6721822023391724,
|
||
|
|
"num_tokens": 52413.0,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0973184555768967,
|
||
|
|
"epoch": 1.9290322580645163,
|
||
|
|
"grad_norm": 2.2152483463287354,
|
||
|
|
"learning_rate": 9.350754959646306e-05,
|
||
|
|
"loss": 0.9649,
|
||
|
|
"mean_token_accuracy": 0.7464027404785156,
|
||
|
|
"num_tokens": 52812.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4785442054271698,
|
||
|
|
"epoch": 1.9548387096774194,
|
||
|
|
"grad_norm": 1.778226613998413,
|
||
|
|
"learning_rate": 9.330127018922194e-05,
|
||
|
|
"loss": 1.5472,
|
||
|
|
"mean_token_accuracy": 0.6413073837757111,
|
||
|
|
"num_tokens": 53810.0,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4850931763648987,
|
||
|
|
"epoch": 1.9806451612903224,
|
||
|
|
"grad_norm": 2.324070453643799,
|
||
|
|
"learning_rate": 9.30919997966582e-05,
|
||
|
|
"loss": 1.4766,
|
||
|
|
"mean_token_accuracy": 0.6507462114095688,
|
||
|
|
"num_tokens": 54370.0,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.5041760206222534,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 2.711214542388916,
|
||
|
|
"learning_rate": 9.287975287388298e-05,
|
||
|
|
"loss": 1.3224,
|
||
|
|
"mean_token_accuracy": 0.6853142380714417,
|
||
|
|
"num_tokens": 54706.0,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.561076819896698,
|
||
|
|
"epoch": 2.0258064516129033,
|
||
|
|
"grad_norm": 1.4298901557922363,
|
||
|
|
"learning_rate": 9.266454408160779e-05,
|
||
|
|
"loss": 1.5017,
|
||
|
|
"mean_token_accuracy": 0.6616432368755341,
|
||
|
|
"num_tokens": 56147.0,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.4342933893203735,
|
||
|
|
"epoch": 2.0516129032258066,
|
||
|
|
"grad_norm": 1.9477201700210571,
|
||
|
|
"learning_rate": 9.244638828513187e-05,
|
||
|
|
"loss": 1.0989,
|
||
|
|
"mean_token_accuracy": 0.7380426079034805,
|
||
|
|
"num_tokens": 56998.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.3799369037151337,
|
||
|
|
"epoch": 2.07741935483871,
|
||
|
|
"grad_norm": 1.899839162826538,
|
||
|
|
"learning_rate": 9.22253005533154e-05,
|
||
|
|
"loss": 1.0685,
|
||
|
|
"mean_token_accuracy": 0.7503155916929245,
|
||
|
|
"num_tokens": 57799.0,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.2785212695598602,
|
||
|
|
"epoch": 2.1032258064516127,
|
||
|
|
"grad_norm": 2.1526200771331787,
|
||
|
|
"learning_rate": 9.200129615753859e-05,
|
||
|
|
"loss": 1.0346,
|
||
|
|
"mean_token_accuracy": 0.7295394539833069,
|
||
|
|
"num_tokens": 58548.0,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.1957830488681793,
|
||
|
|
"epoch": 2.129032258064516,
|
||
|
|
"grad_norm": 2.5215909481048584,
|
||
|
|
"learning_rate": 9.177439057064683e-05,
|
||
|
|
"loss": 1.0066,
|
||
|
|
"mean_token_accuracy": 0.7433657646179199,
|
||
|
|
"num_tokens": 59174.0,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.3421072363853455,
|
||
|
|
"epoch": 2.1548387096774193,
|
||
|
|
"grad_norm": 2.606336832046509,
|
||
|
|
"learning_rate": 9.154459946588198e-05,
|
||
|
|
"loss": 1.1666,
|
||
|
|
"mean_token_accuracy": 0.7091180384159088,
|
||
|
|
"num_tokens": 59769.0,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.032430723309517,
|
||
|
|
"epoch": 2.1806451612903226,
|
||
|
|
"grad_norm": 2.835961103439331,
|
||
|
|
"learning_rate": 9.131193871579975e-05,
|
||
|
|
"loss": 0.9103,
|
||
|
|
"mean_token_accuracy": 0.7784561067819595,
|
||
|
|
"num_tokens": 60295.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0069421231746674,
|
||
|
|
"epoch": 2.206451612903226,
|
||
|
|
"grad_norm": 3.632134437561035,
|
||
|
|
"learning_rate": 9.107642439117321e-05,
|
||
|
|
"loss": 0.7677,
|
||
|
|
"mean_token_accuracy": 0.7896548062562943,
|
||
|
|
"num_tokens": 60744.0,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.784252293407917,
|
||
|
|
"epoch": 2.232258064516129,
|
||
|
|
"grad_norm": 3.14766526222229,
|
||
|
|
"learning_rate": 9.083807275988284e-05,
|
||
|
|
"loss": 0.6092,
|
||
|
|
"mean_token_accuracy": 0.8186918497085571,
|
||
|
|
"num_tokens": 61151.0,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.1425200402736664,
|
||
|
|
"epoch": 2.258064516129032,
|
||
|
|
"grad_norm": 2.9548776149749756,
|
||
|
|
"learning_rate": 9.059690028579283e-05,
|
||
|
|
"loss": 1.2423,
|
||
|
|
"mean_token_accuracy": 0.67966029047966,
|
||
|
|
"num_tokens": 62417.0,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.075703114271164,
|
||
|
|
"epoch": 2.2838709677419353,
|
||
|
|
"grad_norm": 2.6472651958465576,
|
||
|
|
"learning_rate": 9.035292362761381e-05,
|
||
|
|
"loss": 1.1406,
|
||
|
|
"mean_token_accuracy": 0.7184228450059891,
|
||
|
|
"num_tokens": 63270.0,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9899384379386902,
|
||
|
|
"epoch": 2.3096774193548386,
|
||
|
|
"grad_norm": 2.6800777912139893,
|
||
|
|
"learning_rate": 9.01061596377522e-05,
|
||
|
|
"loss": 0.9555,
|
||
|
|
"mean_token_accuracy": 0.759021058678627,
|
||
|
|
"num_tokens": 64027.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.2101148664951324,
|
||
|
|
"epoch": 2.335483870967742,
|
||
|
|
"grad_norm": 3.1797468662261963,
|
||
|
|
"learning_rate": 8.985662536114613e-05,
|
||
|
|
"loss": 1.2574,
|
||
|
|
"mean_token_accuracy": 0.707681193947792,
|
||
|
|
"num_tokens": 64701.0,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9667136818170547,
|
||
|
|
"epoch": 2.361290322580645,
|
||
|
|
"grad_norm": 2.6233391761779785,
|
||
|
|
"learning_rate": 8.960433803408813e-05,
|
||
|
|
"loss": 0.7913,
|
||
|
|
"mean_token_accuracy": 0.7882635146379471,
|
||
|
|
"num_tokens": 65308.0,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9306632727384567,
|
||
|
|
"epoch": 2.3870967741935485,
|
||
|
|
"grad_norm": 2.395880699157715,
|
||
|
|
"learning_rate": 8.934931508303445e-05,
|
||
|
|
"loss": 0.7301,
|
||
|
|
"mean_token_accuracy": 0.7955707758665085,
|
||
|
|
"num_tokens": 65878.0,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0530627965927124,
|
||
|
|
"epoch": 2.412903225806452,
|
||
|
|
"grad_norm": 2.9347379207611084,
|
||
|
|
"learning_rate": 8.90915741234015e-05,
|
||
|
|
"loss": 0.8363,
|
||
|
|
"mean_token_accuracy": 0.775736004114151,
|
||
|
|
"num_tokens": 66364.0,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0531336814165115,
|
||
|
|
"epoch": 2.4387096774193546,
|
||
|
|
"grad_norm": 3.1018309593200684,
|
||
|
|
"learning_rate": 8.883113295834892e-05,
|
||
|
|
"loss": 0.8268,
|
||
|
|
"mean_token_accuracy": 0.7704032361507416,
|
||
|
|
"num_tokens": 66820.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0696537494659424,
|
||
|
|
"epoch": 2.464516129032258,
|
||
|
|
"grad_norm": 3.423306941986084,
|
||
|
|
"learning_rate": 8.856800957755e-05,
|
||
|
|
"loss": 0.7847,
|
||
|
|
"mean_token_accuracy": 0.7773692905902863,
|
||
|
|
"num_tokens": 67214.0,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.306801289319992,
|
||
|
|
"epoch": 2.490322580645161,
|
||
|
|
"grad_norm": 1.6437768936157227,
|
||
|
|
"learning_rate": 8.83022221559489e-05,
|
||
|
|
"loss": 1.2357,
|
||
|
|
"mean_token_accuracy": 0.669854074716568,
|
||
|
|
"num_tokens": 68733.0,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.1772551238536835,
|
||
|
|
"epoch": 2.5161290322580645,
|
||
|
|
"grad_norm": 2.4962806701660156,
|
||
|
|
"learning_rate": 8.803378905250544e-05,
|
||
|
|
"loss": 1.0752,
|
||
|
|
"mean_token_accuracy": 0.711113303899765,
|
||
|
|
"num_tokens": 69580.0,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.166929692029953,
|
||
|
|
"epoch": 2.541935483870968,
|
||
|
|
"grad_norm": 2.8279449939727783,
|
||
|
|
"learning_rate": 8.776272880892675e-05,
|
||
|
|
"loss": 1.0135,
|
||
|
|
"mean_token_accuracy": 0.7302903383970261,
|
||
|
|
"num_tokens": 70359.0,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.2368881702423096,
|
||
|
|
"epoch": 2.567741935483871,
|
||
|
|
"grad_norm": 2.812784194946289,
|
||
|
|
"learning_rate": 8.748906014838672e-05,
|
||
|
|
"loss": 1.0997,
|
||
|
|
"mean_token_accuracy": 0.7428575754165649,
|
||
|
|
"num_tokens": 71051.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0734427571296692,
|
||
|
|
"epoch": 2.5935483870967744,
|
||
|
|
"grad_norm": 3.168055772781372,
|
||
|
|
"learning_rate": 8.721280197423258e-05,
|
||
|
|
"loss": 0.9557,
|
||
|
|
"mean_token_accuracy": 0.7500255256891251,
|
||
|
|
"num_tokens": 71653.0,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0182117372751236,
|
||
|
|
"epoch": 2.6193548387096772,
|
||
|
|
"grad_norm": 2.928173065185547,
|
||
|
|
"learning_rate": 8.69339733686793e-05,
|
||
|
|
"loss": 0.7934,
|
||
|
|
"mean_token_accuracy": 0.7967472970485687,
|
||
|
|
"num_tokens": 72206.0,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9825232028961182,
|
||
|
|
"epoch": 2.6451612903225805,
|
||
|
|
"grad_norm": 3.5911121368408203,
|
||
|
|
"learning_rate": 8.665259359149132e-05,
|
||
|
|
"loss": 0.7435,
|
||
|
|
"mean_token_accuracy": 0.7856406420469284,
|
||
|
|
"num_tokens": 72709.0,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.8393460661172867,
|
||
|
|
"epoch": 2.670967741935484,
|
||
|
|
"grad_norm": 3.1751551628112793,
|
||
|
|
"learning_rate": 8.636868207865244e-05,
|
||
|
|
"loss": 0.5727,
|
||
|
|
"mean_token_accuracy": 0.8536647707223892,
|
||
|
|
"num_tokens": 73172.0,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.733843207359314,
|
||
|
|
"epoch": 2.696774193548387,
|
||
|
|
"grad_norm": 3.002105951309204,
|
||
|
|
"learning_rate": 8.60822584410231e-05,
|
||
|
|
"loss": 0.4306,
|
||
|
|
"mean_token_accuracy": 0.9011064171791077,
|
||
|
|
"num_tokens": 73601.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.09127739071846,
|
||
|
|
"epoch": 2.7225806451612904,
|
||
|
|
"grad_norm": 2.7801899909973145,
|
||
|
|
"learning_rate": 8.579334246298593e-05,
|
||
|
|
"loss": 1.3229,
|
||
|
|
"mean_token_accuracy": 0.6847837716341019,
|
||
|
|
"num_tokens": 75066.0,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.1003702282905579,
|
||
|
|
"epoch": 2.7483870967741937,
|
||
|
|
"grad_norm": 2.8465728759765625,
|
||
|
|
"learning_rate": 8.550195410107902e-05,
|
||
|
|
"loss": 1.026,
|
||
|
|
"mean_token_accuracy": 0.7287466824054718,
|
||
|
|
"num_tokens": 75935.0,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0054174661636353,
|
||
|
|
"epoch": 2.774193548387097,
|
||
|
|
"grad_norm": 2.6831374168395996,
|
||
|
|
"learning_rate": 8.520811348261759e-05,
|
||
|
|
"loss": 0.8887,
|
||
|
|
"mean_token_accuracy": 0.7784150391817093,
|
||
|
|
"num_tokens": 76730.0,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.1102914214134216,
|
||
|
|
"epoch": 2.8,
|
||
|
|
"grad_norm": 3.408310651779175,
|
||
|
|
"learning_rate": 8.491184090430364e-05,
|
||
|
|
"loss": 1.0831,
|
||
|
|
"mean_token_accuracy": 0.7278113067150116,
|
||
|
|
"num_tokens": 77474.0,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.999423012137413,
|
||
|
|
"epoch": 2.825806451612903,
|
||
|
|
"grad_norm": 3.7338831424713135,
|
||
|
|
"learning_rate": 8.461315683082399e-05,
|
||
|
|
"loss": 1.0257,
|
||
|
|
"mean_token_accuracy": 0.7361829876899719,
|
||
|
|
"num_tokens": 78068.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9764816612005234,
|
||
|
|
"epoch": 2.8516129032258064,
|
||
|
|
"grad_norm": 3.499826192855835,
|
||
|
|
"learning_rate": 8.43120818934367e-05,
|
||
|
|
"loss": 0.8335,
|
||
|
|
"mean_token_accuracy": 0.764112114906311,
|
||
|
|
"num_tokens": 78589.0,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9866785109043121,
|
||
|
|
"epoch": 2.8774193548387097,
|
||
|
|
"grad_norm": 3.31439471244812,
|
||
|
|
"learning_rate": 8.400863688854597e-05,
|
||
|
|
"loss": 0.9472,
|
||
|
|
"mean_token_accuracy": 0.7592662870883942,
|
||
|
|
"num_tokens": 79080.0,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.8102796524763107,
|
||
|
|
"epoch": 2.903225806451613,
|
||
|
|
"grad_norm": 3.768465757369995,
|
||
|
|
"learning_rate": 8.370284277626577e-05,
|
||
|
|
"loss": 0.6879,
|
||
|
|
"mean_token_accuracy": 0.7918446511030197,
|
||
|
|
"num_tokens": 79518.0,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.7523371577262878,
|
||
|
|
"epoch": 2.9290322580645163,
|
||
|
|
"grad_norm": 3.107103109359741,
|
||
|
|
"learning_rate": 8.339472067897187e-05,
|
||
|
|
"loss": 0.5142,
|
||
|
|
"mean_token_accuracy": 0.8337104171514511,
|
||
|
|
"num_tokens": 79925.0,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.2405670583248138,
|
||
|
|
"epoch": 2.9548387096774196,
|
||
|
|
"grad_norm": 2.0415544509887695,
|
||
|
|
"learning_rate": 8.308429187984297e-05,
|
||
|
|
"loss": 1.2469,
|
||
|
|
"mean_token_accuracy": 0.6947166323661804,
|
||
|
|
"num_tokens": 81111.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0534760355949402,
|
||
|
|
"epoch": 2.9806451612903224,
|
||
|
|
"grad_norm": 3.243969440460205,
|
||
|
|
"learning_rate": 8.27715778213905e-05,
|
||
|
|
"loss": 1.0014,
|
||
|
|
"mean_token_accuracy": 0.752901017665863,
|
||
|
|
"num_tokens": 81717.0,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.8150668541590372,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 3.7620151042938232,
|
||
|
|
"learning_rate": 8.24566001039776e-05,
|
||
|
|
"loss": 0.6544,
|
||
|
|
"mean_token_accuracy": 0.8201234340667725,
|
||
|
|
"num_tokens": 82059.0,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.2218182981014252,
|
||
|
|
"epoch": 3.0258064516129033,
|
||
|
|
"grad_norm": 2.0384461879730225,
|
||
|
|
"learning_rate": 8.213938048432697e-05,
|
||
|
|
"loss": 0.9903,
|
||
|
|
"mean_token_accuracy": 0.7458517551422119,
|
||
|
|
"num_tokens": 83704.0,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0654624998569489,
|
||
|
|
"epoch": 3.0516129032258066,
|
||
|
|
"grad_norm": 2.7097387313842773,
|
||
|
|
"learning_rate": 8.181994087401819e-05,
|
||
|
|
"loss": 0.6589,
|
||
|
|
"mean_token_accuracy": 0.8282175809144974,
|
||
|
|
"num_tokens": 84564.0,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.9389624744653702,
|
||
|
|
"epoch": 3.07741935483871,
|
||
|
|
"grad_norm": 3.422351360321045,
|
||
|
|
"learning_rate": 8.149830333797407e-05,
|
||
|
|
"loss": 0.6736,
|
||
|
|
"mean_token_accuracy": 0.8170457482337952,
|
||
|
|
"num_tokens": 85305.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.891632542014122,
|
||
|
|
"epoch": 3.1032258064516127,
|
||
|
|
"grad_norm": 2.9999988079071045,
|
||
|
|
"learning_rate": 8.117449009293668e-05,
|
||
|
|
"loss": 0.5435,
|
||
|
|
"mean_token_accuracy": 0.8579341620206833,
|
||
|
|
"num_tokens": 85927.0,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.7080177962779999,
|
||
|
|
"epoch": 3.129032258064516,
|
||
|
|
"grad_norm": 2.7167727947235107,
|
||
|
|
"learning_rate": 8.084852350593264e-05,
|
||
|
|
"loss": 0.386,
|
||
|
|
"mean_token_accuracy": 0.9050543904304504,
|
||
|
|
"num_tokens": 86500.0,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5361127704381943,
|
||
|
|
"epoch": 3.1548387096774193,
|
||
|
|
"grad_norm": 3.051241874694824,
|
||
|
|
"learning_rate": 8.052042609272817e-05,
|
||
|
|
"loss": 0.314,
|
||
|
|
"mean_token_accuracy": 0.9146886169910431,
|
||
|
|
"num_tokens": 87009.0,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5324621573090553,
|
||
|
|
"epoch": 3.1806451612903226,
|
||
|
|
"grad_norm": 3.0022952556610107,
|
||
|
|
"learning_rate": 8.019022051627388e-05,
|
||
|
|
"loss": 0.3141,
|
||
|
|
"mean_token_accuracy": 0.9247495979070663,
|
||
|
|
"num_tokens": 87467.0,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40624529123306274,
|
||
|
|
"epoch": 3.206451612903226,
|
||
|
|
"grad_norm": 3.094412326812744,
|
||
|
|
"learning_rate": 7.985792958513931e-05,
|
||
|
|
"loss": 0.26,
|
||
|
|
"mean_token_accuracy": 0.9299735277891159,
|
||
|
|
"num_tokens": 87885.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3672215938568115,
|
||
|
|
"epoch": 3.232258064516129,
|
||
|
|
"grad_norm": 3.4929354190826416,
|
||
|
|
"learning_rate": 7.952357625193749e-05,
|
||
|
|
"loss": 0.2392,
|
||
|
|
"mean_token_accuracy": 0.9306517392396927,
|
||
|
|
"num_tokens": 88260.0,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.8298548460006714,
|
||
|
|
"epoch": 3.258064516129032,
|
||
|
|
"grad_norm": 2.836134672164917,
|
||
|
|
"learning_rate": 7.91871836117395e-05,
|
||
|
|
"loss": 0.7053,
|
||
|
|
"mean_token_accuracy": 0.8246497809886932,
|
||
|
|
"num_tokens": 89262.0,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5190232917666435,
|
||
|
|
"epoch": 3.2838709677419353,
|
||
|
|
"grad_norm": 5.216272830963135,
|
||
|
|
"learning_rate": 7.884877490047915e-05,
|
||
|
|
"loss": 0.565,
|
||
|
|
"mean_token_accuracy": 0.8471736311912537,
|
||
|
|
"num_tokens": 90062.0,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4947461038827896,
|
||
|
|
"epoch": 3.3096774193548386,
|
||
|
|
"grad_norm": 4.143370628356934,
|
||
|
|
"learning_rate": 7.85083734933481e-05,
|
||
|
|
"loss": 0.5013,
|
||
|
|
"mean_token_accuracy": 0.8697308301925659,
|
||
|
|
"num_tokens": 90841.0,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5702934339642525,
|
||
|
|
"epoch": 3.335483870967742,
|
||
|
|
"grad_norm": 5.3610520362854,
|
||
|
|
"learning_rate": 7.81660029031811e-05,
|
||
|
|
"loss": 0.657,
|
||
|
|
"mean_token_accuracy": 0.8270199149847031,
|
||
|
|
"num_tokens": 91591.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5612503439188004,
|
||
|
|
"epoch": 3.361290322580645,
|
||
|
|
"grad_norm": 4.896009922027588,
|
||
|
|
"learning_rate": 7.782168677883206e-05,
|
||
|
|
"loss": 0.638,
|
||
|
|
"mean_token_accuracy": 0.8336956202983856,
|
||
|
|
"num_tokens": 92304.0,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47641437500715256,
|
||
|
|
"epoch": 3.3870967741935485,
|
||
|
|
"grad_norm": 5.059084415435791,
|
||
|
|
"learning_rate": 7.74754489035403e-05,
|
||
|
|
"loss": 0.516,
|
||
|
|
"mean_token_accuracy": 0.8493129163980484,
|
||
|
|
"num_tokens": 92920.0,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5311232656240463,
|
||
|
|
"epoch": 3.412903225806452,
|
||
|
|
"grad_norm": 3.7369489669799805,
|
||
|
|
"learning_rate": 7.712731319328798e-05,
|
||
|
|
"loss": 0.4084,
|
||
|
|
"mean_token_accuracy": 0.8949003219604492,
|
||
|
|
"num_tokens": 93468.0,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4599653482437134,
|
||
|
|
"epoch": 3.4387096774193546,
|
||
|
|
"grad_norm": 4.457752704620361,
|
||
|
|
"learning_rate": 7.677730369514793e-05,
|
||
|
|
"loss": 0.4303,
|
||
|
|
"mean_token_accuracy": 0.8998099863529205,
|
||
|
|
"num_tokens": 93952.0,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3341464288532734,
|
||
|
|
"epoch": 3.464516129032258,
|
||
|
|
"grad_norm": 2.74814772605896,
|
||
|
|
"learning_rate": 7.642544458562278e-05,
|
||
|
|
"loss": 0.2045,
|
||
|
|
"mean_token_accuracy": 0.9389902055263519,
|
||
|
|
"num_tokens": 94378.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.7704500108957291,
|
||
|
|
"epoch": 3.490322580645161,
|
||
|
|
"grad_norm": 2.1899735927581787,
|
||
|
|
"learning_rate": 7.60717601689749e-05,
|
||
|
|
"loss": 0.7928,
|
||
|
|
"mean_token_accuracy": 0.7940146774053574,
|
||
|
|
"num_tokens": 96188.0,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.8460464626550674,
|
||
|
|
"epoch": 3.5161290322580645,
|
||
|
|
"grad_norm": 2.439542531967163,
|
||
|
|
"learning_rate": 7.571627487554769e-05,
|
||
|
|
"loss": 0.7167,
|
||
|
|
"mean_token_accuracy": 0.7986479252576828,
|
||
|
|
"num_tokens": 97250.0,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.683267816901207,
|
||
|
|
"epoch": 3.541935483870968,
|
||
|
|
"grad_norm": 3.4693028926849365,
|
||
|
|
"learning_rate": 7.535901326007795e-05,
|
||
|
|
"loss": 0.5391,
|
||
|
|
"mean_token_accuracy": 0.8488983660936356,
|
||
|
|
"num_tokens": 98028.0,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6665534228086472,
|
||
|
|
"epoch": 3.567741935483871,
|
||
|
|
"grad_norm": 3.313450336456299,
|
||
|
|
"learning_rate": 7.500000000000001e-05,
|
||
|
|
"loss": 0.4977,
|
||
|
|
"mean_token_accuracy": 0.8638099581003189,
|
||
|
|
"num_tokens": 98727.0,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6375805735588074,
|
||
|
|
"epoch": 3.5935483870967744,
|
||
|
|
"grad_norm": 3.621342897415161,
|
||
|
|
"learning_rate": 7.463925989374089e-05,
|
||
|
|
"loss": 0.521,
|
||
|
|
"mean_token_accuracy": 0.8624279350042343,
|
||
|
|
"num_tokens": 99329.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5712595283985138,
|
||
|
|
"epoch": 3.6193548387096772,
|
||
|
|
"grad_norm": 3.667834520339966,
|
||
|
|
"learning_rate": 7.427681785900761e-05,
|
||
|
|
"loss": 0.4579,
|
||
|
|
"mean_token_accuracy": 0.8609372973442078,
|
||
|
|
"num_tokens": 99866.0,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5664890855550766,
|
||
|
|
"epoch": 3.6451612903225805,
|
||
|
|
"grad_norm": 3.193061113357544,
|
||
|
|
"learning_rate": 7.391269893106592e-05,
|
||
|
|
"loss": 0.3498,
|
||
|
|
"mean_token_accuracy": 0.9016094356775284,
|
||
|
|
"num_tokens": 100358.0,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4809069186449051,
|
||
|
|
"epoch": 3.670967741935484,
|
||
|
|
"grad_norm": 2.9797909259796143,
|
||
|
|
"learning_rate": 7.354692826101102e-05,
|
||
|
|
"loss": 0.239,
|
||
|
|
"mean_token_accuracy": 0.937361553311348,
|
||
|
|
"num_tokens": 100810.0,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3825264722108841,
|
||
|
|
"epoch": 3.696774193548387,
|
||
|
|
"grad_norm": 2.5916123390197754,
|
||
|
|
"learning_rate": 7.317953111403029e-05,
|
||
|
|
"loss": 0.2293,
|
||
|
|
"mean_token_accuracy": 0.959057167172432,
|
||
|
|
"num_tokens": 101224.0,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 1.0315645188093185,
|
||
|
|
"epoch": 3.7225806451612904,
|
||
|
|
"grad_norm": 2.4332456588745117,
|
||
|
|
"learning_rate": 7.281053286765815e-05,
|
||
|
|
"loss": 0.9734,
|
||
|
|
"mean_token_accuracy": 0.7563262432813644,
|
||
|
|
"num_tokens": 102666.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.7324022054672241,
|
||
|
|
"epoch": 3.7483870967741937,
|
||
|
|
"grad_norm": 3.319155693054199,
|
||
|
|
"learning_rate": 7.243995901002312e-05,
|
||
|
|
"loss": 0.526,
|
||
|
|
"mean_token_accuracy": 0.862901970744133,
|
||
|
|
"num_tokens": 103560.0,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.7977930456399918,
|
||
|
|
"epoch": 3.774193548387097,
|
||
|
|
"grad_norm": 3.708766460418701,
|
||
|
|
"learning_rate": 7.20678351380872e-05,
|
||
|
|
"loss": 0.5996,
|
||
|
|
"mean_token_accuracy": 0.8376729637384415,
|
||
|
|
"num_tokens": 104386.0,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.67112597823143,
|
||
|
|
"epoch": 3.8,
|
||
|
|
"grad_norm": 3.474480152130127,
|
||
|
|
"learning_rate": 7.169418695587791e-05,
|
||
|
|
"loss": 0.5283,
|
||
|
|
"mean_token_accuracy": 0.8518707603216171,
|
||
|
|
"num_tokens": 105173.0,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6674353927373886,
|
||
|
|
"epoch": 3.825806451612903,
|
||
|
|
"grad_norm": 4.0479736328125,
|
||
|
|
"learning_rate": 7.13190402727127e-05,
|
||
|
|
"loss": 0.5836,
|
||
|
|
"mean_token_accuracy": 0.8252883553504944,
|
||
|
|
"num_tokens": 105827.0,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6601278185844421,
|
||
|
|
"epoch": 3.8516129032258064,
|
||
|
|
"grad_norm": 3.1081454753875732,
|
||
|
|
"learning_rate": 7.094242100141625e-05,
|
||
|
|
"loss": 0.4519,
|
||
|
|
"mean_token_accuracy": 0.8595046997070312,
|
||
|
|
"num_tokens": 106405.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397666834294796,
|
||
|
|
"epoch": 3.8774193548387097,
|
||
|
|
"grad_norm": 2.5936572551727295,
|
||
|
|
"learning_rate": 7.056435515653059e-05,
|
||
|
|
"loss": 0.2092,
|
||
|
|
"mean_token_accuracy": 0.9478294253349304,
|
||
|
|
"num_tokens": 106926.0,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5597369372844696,
|
||
|
|
"epoch": 3.903225806451613,
|
||
|
|
"grad_norm": 4.103569984436035,
|
||
|
|
"learning_rate": 7.018486885251812e-05,
|
||
|
|
"loss": 0.4531,
|
||
|
|
"mean_token_accuracy": 0.8746808618307114,
|
||
|
|
"num_tokens": 107392.0,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38467343896627426,
|
||
|
|
"epoch": 3.9290322580645163,
|
||
|
|
"grad_norm": 3.1950509548187256,
|
||
|
|
"learning_rate": 6.980398830195785e-05,
|
||
|
|
"loss": 0.212,
|
||
|
|
"mean_token_accuracy": 0.9444408565759659,
|
||
|
|
"num_tokens": 107827.0,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5698762461543083,
|
||
|
|
"epoch": 3.9548387096774196,
|
||
|
|
"grad_norm": 3.3116562366485596,
|
||
|
|
"learning_rate": 6.942173981373474e-05,
|
||
|
|
"loss": 0.4076,
|
||
|
|
"mean_token_accuracy": 0.8756328076124191,
|
||
|
|
"num_tokens": 108519.0,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5269991233944893,
|
||
|
|
"epoch": 3.9806451612903224,
|
||
|
|
"grad_norm": 3.074373483657837,
|
||
|
|
"learning_rate": 6.903814979122249e-05,
|
||
|
|
"loss": 0.3577,
|
||
|
|
"mean_token_accuracy": 0.9049306809902191,
|
||
|
|
"num_tokens": 109080.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.411786029736201,
|
||
|
|
"epoch": 4.0,
|
||
|
|
"grad_norm": 3.1152896881103516,
|
||
|
|
"learning_rate": 6.86532447304597e-05,
|
||
|
|
"loss": 0.2401,
|
||
|
|
"mean_token_accuracy": 0.9342868526776632,
|
||
|
|
"num_tokens": 109412.0,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.672158882021904,
|
||
|
|
"epoch": 4.025806451612903,
|
||
|
|
"grad_norm": 2.576361894607544,
|
||
|
|
"learning_rate": 6.826705121831976e-05,
|
||
|
|
"loss": 0.5307,
|
||
|
|
"mean_token_accuracy": 0.8603871315717697,
|
||
|
|
"num_tokens": 110911.0,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5033400803804398,
|
||
|
|
"epoch": 4.051612903225807,
|
||
|
|
"grad_norm": 2.3417139053344727,
|
||
|
|
"learning_rate": 6.78795959306743e-05,
|
||
|
|
"loss": 0.2862,
|
||
|
|
"mean_token_accuracy": 0.9291664808988571,
|
||
|
|
"num_tokens": 111773.0,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3818225935101509,
|
||
|
|
"epoch": 4.077419354838709,
|
||
|
|
"grad_norm": 2.526963233947754,
|
||
|
|
"learning_rate": 6.749090563055076e-05,
|
||
|
|
"loss": 0.204,
|
||
|
|
"mean_token_accuracy": 0.9366898983716965,
|
||
|
|
"num_tokens": 112552.0,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5210211500525475,
|
||
|
|
"epoch": 4.103225806451613,
|
||
|
|
"grad_norm": 3.331657648086548,
|
||
|
|
"learning_rate": 6.710100716628344e-05,
|
||
|
|
"loss": 0.3463,
|
||
|
|
"mean_token_accuracy": 0.9079622030258179,
|
||
|
|
"num_tokens": 113279.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4078049287199974,
|
||
|
|
"epoch": 4.129032258064516,
|
||
|
|
"grad_norm": 2.643353223800659,
|
||
|
|
"learning_rate": 6.670992746965938e-05,
|
||
|
|
"loss": 0.2458,
|
||
|
|
"mean_token_accuracy": 0.9378542304039001,
|
||
|
|
"num_tokens": 113927.0,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2958051636815071,
|
||
|
|
"epoch": 4.15483870967742,
|
||
|
|
"grad_norm": 2.6562397480010986,
|
||
|
|
"learning_rate": 6.63176935540578e-05,
|
||
|
|
"loss": 0.2228,
|
||
|
|
"mean_token_accuracy": 0.9389047920703888,
|
||
|
|
"num_tokens": 114535.0,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2642120160162449,
|
||
|
|
"epoch": 4.180645161290323,
|
||
|
|
"grad_norm": 3.720411539077759,
|
||
|
|
"learning_rate": 6.592433251258423e-05,
|
||
|
|
"loss": 0.1609,
|
||
|
|
"mean_token_accuracy": 0.9546155333518982,
|
||
|
|
"num_tokens": 115092.0,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2038814201951027,
|
||
|
|
"epoch": 4.2064516129032254,
|
||
|
|
"grad_norm": 3.742655038833618,
|
||
|
|
"learning_rate": 6.552987151619919e-05,
|
||
|
|
"loss": 0.1438,
|
||
|
|
"mean_token_accuracy": 0.9577045887708664,
|
||
|
|
"num_tokens": 115572.0,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.21509704366326332,
|
||
|
|
"epoch": 4.232258064516129,
|
||
|
|
"grad_norm": 4.123962879180908,
|
||
|
|
"learning_rate": 6.51343378118413e-05,
|
||
|
|
"loss": 0.1326,
|
||
|
|
"mean_token_accuracy": 0.955599308013916,
|
||
|
|
"num_tokens": 116004.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5882035046815872,
|
||
|
|
"epoch": 4.258064516129032,
|
||
|
|
"grad_norm": 2.629396438598633,
|
||
|
|
"learning_rate": 6.473775872054521e-05,
|
||
|
|
"loss": 0.5174,
|
||
|
|
"mean_token_accuracy": 0.855495274066925,
|
||
|
|
"num_tokens": 117713.0,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4447134956717491,
|
||
|
|
"epoch": 4.283870967741936,
|
||
|
|
"grad_norm": 5.003028869628906,
|
||
|
|
"learning_rate": 6.434016163555452e-05,
|
||
|
|
"loss": 0.4682,
|
||
|
|
"mean_token_accuracy": 0.8714989423751831,
|
||
|
|
"num_tokens": 118624.0,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3722687065601349,
|
||
|
|
"epoch": 4.309677419354839,
|
||
|
|
"grad_norm": 3.819241762161255,
|
||
|
|
"learning_rate": 6.394157402042951e-05,
|
||
|
|
"loss": 0.3207,
|
||
|
|
"mean_token_accuracy": 0.9076657742261887,
|
||
|
|
"num_tokens": 119441.0,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2616325728595257,
|
||
|
|
"epoch": 4.335483870967742,
|
||
|
|
"grad_norm": 3.4206392765045166,
|
||
|
|
"learning_rate": 6.354202340715026e-05,
|
||
|
|
"loss": 0.205,
|
||
|
|
"mean_token_accuracy": 0.9454829543828964,
|
||
|
|
"num_tokens": 120187.0,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3457096070051193,
|
||
|
|
"epoch": 4.361290322580645,
|
||
|
|
"grad_norm": 3.556037425994873,
|
||
|
|
"learning_rate": 6.314153739421476e-05,
|
||
|
|
"loss": 0.2697,
|
||
|
|
"mean_token_accuracy": 0.9172067493200302,
|
||
|
|
"num_tokens": 120838.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2511453256011009,
|
||
|
|
"epoch": 4.387096774193548,
|
||
|
|
"grad_norm": 2.943145751953125,
|
||
|
|
"learning_rate": 6.274014364473274e-05,
|
||
|
|
"loss": 0.1491,
|
||
|
|
"mean_token_accuracy": 0.9682914614677429,
|
||
|
|
"num_tokens": 121408.0,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.23977105692029,
|
||
|
|
"epoch": 4.412903225806452,
|
||
|
|
"grad_norm": 3.426252603530884,
|
||
|
|
"learning_rate": 6.233786988451468e-05,
|
||
|
|
"loss": 0.1645,
|
||
|
|
"mean_token_accuracy": 0.9556652754545212,
|
||
|
|
"num_tokens": 121915.0,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.19089676067233086,
|
||
|
|
"epoch": 4.438709677419355,
|
||
|
|
"grad_norm": 2.1618521213531494,
|
||
|
|
"learning_rate": 6.19347439001569e-05,
|
||
|
|
"loss": 0.1059,
|
||
|
|
"mean_token_accuracy": 0.97336345911026,
|
||
|
|
"num_tokens": 122368.0,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.19364609941840172,
|
||
|
|
"epoch": 4.464516129032258,
|
||
|
|
"grad_norm": 3.3634703159332275,
|
||
|
|
"learning_rate": 6.153079353712201e-05,
|
||
|
|
"loss": 0.1285,
|
||
|
|
"mean_token_accuracy": 0.9543762654066086,
|
||
|
|
"num_tokens": 122767.0,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6687990427017212,
|
||
|
|
"epoch": 4.490322580645161,
|
||
|
|
"grad_norm": 2.883437395095825,
|
||
|
|
"learning_rate": 6.112604669781572e-05,
|
||
|
|
"loss": 0.5348,
|
||
|
|
"mean_token_accuracy": 0.8638840764760971,
|
||
|
|
"num_tokens": 124288.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.472368985414505,
|
||
|
|
"epoch": 4.516129032258064,
|
||
|
|
"grad_norm": 2.9869871139526367,
|
||
|
|
"learning_rate": 6.072053133965938e-05,
|
||
|
|
"loss": 0.2776,
|
||
|
|
"mean_token_accuracy": 0.9314542561769485,
|
||
|
|
"num_tokens": 125161.0,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4055846929550171,
|
||
|
|
"epoch": 4.541935483870968,
|
||
|
|
"grad_norm": 3.554269552230835,
|
||
|
|
"learning_rate": 6.031427547315889e-05,
|
||
|
|
"loss": 0.3152,
|
||
|
|
"mean_token_accuracy": 0.9113509654998779,
|
||
|
|
"num_tokens": 125955.0,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3913852721452713,
|
||
|
|
"epoch": 4.567741935483871,
|
||
|
|
"grad_norm": 3.3943800926208496,
|
||
|
|
"learning_rate": 5.9907307159969884e-05,
|
||
|
|
"loss": 0.2882,
|
||
|
|
"mean_token_accuracy": 0.9336675554513931,
|
||
|
|
"num_tokens": 126654.0,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2266981489956379,
|
||
|
|
"epoch": 4.593548387096774,
|
||
|
|
"grad_norm": 2.6177566051483154,
|
||
|
|
"learning_rate": 5.949965451095951e-05,
|
||
|
|
"loss": 0.1521,
|
||
|
|
"mean_token_accuracy": 0.9607619494199753,
|
||
|
|
"num_tokens": 127200.0,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2510114349424839,
|
||
|
|
"epoch": 4.619354838709677,
|
||
|
|
"grad_norm": 2.9274792671203613,
|
||
|
|
"learning_rate": 5.9091345684264546e-05,
|
||
|
|
"loss": 0.1527,
|
||
|
|
"mean_token_accuracy": 0.9545964151620865,
|
||
|
|
"num_tokens": 127710.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.27408041059970856,
|
||
|
|
"epoch": 4.645161290322581,
|
||
|
|
"grad_norm": 3.970353841781616,
|
||
|
|
"learning_rate": 5.868240888334653e-05,
|
||
|
|
"loss": 0.2088,
|
||
|
|
"mean_token_accuracy": 0.9431939721107483,
|
||
|
|
"num_tokens": 128171.0,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.21555104106664658,
|
||
|
|
"epoch": 4.670967741935484,
|
||
|
|
"grad_norm": 2.1485326290130615,
|
||
|
|
"learning_rate": 5.827287235504356e-05,
|
||
|
|
"loss": 0.1231,
|
||
|
|
"mean_token_accuracy": 0.9743186682462692,
|
||
|
|
"num_tokens": 128603.0,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.1890631914138794,
|
||
|
|
"epoch": 4.6967741935483875,
|
||
|
|
"grad_norm": 3.0446012020111084,
|
||
|
|
"learning_rate": 5.786276438761927e-05,
|
||
|
|
"loss": 0.166,
|
||
|
|
"mean_token_accuracy": 0.9585428386926651,
|
||
|
|
"num_tokens": 129018.0,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4911561757326126,
|
||
|
|
"epoch": 4.72258064516129,
|
||
|
|
"grad_norm": 2.324612617492676,
|
||
|
|
"learning_rate": 5.745211330880872e-05,
|
||
|
|
"loss": 0.3596,
|
||
|
|
"mean_token_accuracy": 0.9241899400949478,
|
||
|
|
"num_tokens": 130189.0,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3451598323881626,
|
||
|
|
"epoch": 4.748387096774193,
|
||
|
|
"grad_norm": 3.1134896278381348,
|
||
|
|
"learning_rate": 5.704094748386184e-05,
|
||
|
|
"loss": 0.2163,
|
||
|
|
"mean_token_accuracy": 0.9265208840370178,
|
||
|
|
"num_tokens": 130996.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39382658153772354,
|
||
|
|
"epoch": 4.774193548387097,
|
||
|
|
"grad_norm": 3.3759310245513916,
|
||
|
|
"learning_rate": 5.6629295313583974e-05,
|
||
|
|
"loss": 0.266,
|
||
|
|
"mean_token_accuracy": 0.923931747674942,
|
||
|
|
"num_tokens": 131734.0,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.362373985350132,
|
||
|
|
"epoch": 4.8,
|
||
|
|
"grad_norm": 3.549544095993042,
|
||
|
|
"learning_rate": 5.621718523237427e-05,
|
||
|
|
"loss": 0.2415,
|
||
|
|
"mean_token_accuracy": 0.9290976673364639,
|
||
|
|
"num_tokens": 132406.0,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.33830052614212036,
|
||
|
|
"epoch": 4.825806451612904,
|
||
|
|
"grad_norm": 2.8866331577301025,
|
||
|
|
"learning_rate": 5.5804645706261514e-05,
|
||
|
|
"loss": 0.2333,
|
||
|
|
"mean_token_accuracy": 0.93567855656147,
|
||
|
|
"num_tokens": 133001.0,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2700263783335686,
|
||
|
|
"epoch": 4.851612903225806,
|
||
|
|
"grad_norm": 2.9685375690460205,
|
||
|
|
"learning_rate": 5.539170523093794e-05,
|
||
|
|
"loss": 0.1737,
|
||
|
|
"mean_token_accuracy": 0.9484844356775284,
|
||
|
|
"num_tokens": 133568.0,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2686317004263401,
|
||
|
|
"epoch": 4.877419354838709,
|
||
|
|
"grad_norm": 2.7458479404449463,
|
||
|
|
"learning_rate": 5.497839232979084e-05,
|
||
|
|
"loss": 0.1727,
|
||
|
|
"mean_token_accuracy": 0.9658856242895126,
|
||
|
|
"num_tokens": 134062.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2341674156486988,
|
||
|
|
"epoch": 4.903225806451613,
|
||
|
|
"grad_norm": 2.944103956222534,
|
||
|
|
"learning_rate": 5.456473555193242e-05,
|
||
|
|
"loss": 0.1788,
|
||
|
|
"mean_token_accuracy": 0.9528596550226212,
|
||
|
|
"num_tokens": 134514.0,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.22099602594971657,
|
||
|
|
"epoch": 4.929032258064516,
|
||
|
|
"grad_norm": 3.862736940383911,
|
||
|
|
"learning_rate": 5.415076347022776e-05,
|
||
|
|
"loss": 0.1657,
|
||
|
|
"mean_token_accuracy": 0.9679511785507202,
|
||
|
|
"num_tokens": 134923.0,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5313196182250977,
|
||
|
|
"epoch": 4.95483870967742,
|
||
|
|
"grad_norm": 3.1668918132781982,
|
||
|
|
"learning_rate": 5.373650467932122e-05,
|
||
|
|
"loss": 0.5281,
|
||
|
|
"mean_token_accuracy": 0.8866761773824692,
|
||
|
|
"num_tokens": 135869.0,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2688843570649624,
|
||
|
|
"epoch": 4.980645161290322,
|
||
|
|
"grad_norm": 2.9400172233581543,
|
||
|
|
"learning_rate": 5.332198779366122e-05,
|
||
|
|
"loss": 0.1822,
|
||
|
|
"mean_token_accuracy": 0.9536565244197845,
|
||
|
|
"num_tokens": 136435.0,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.34634942809740704,
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 4.880941867828369,
|
||
|
|
"learning_rate": 5.290724144552379e-05,
|
||
|
|
"loss": 0.2718,
|
||
|
|
"mean_token_accuracy": 0.9203394254048666,
|
||
|
|
"num_tokens": 136765.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5787394121289253,
|
||
|
|
"epoch": 5.025806451612903,
|
||
|
|
"grad_norm": 2.429058313369751,
|
||
|
|
"learning_rate": 5.249229428303486e-05,
|
||
|
|
"loss": 0.3105,
|
||
|
|
"mean_token_accuracy": 0.9199163019657135,
|
||
|
|
"num_tokens": 138102.0,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3213765248656273,
|
||
|
|
"epoch": 5.051612903225807,
|
||
|
|
"grad_norm": 2.9777679443359375,
|
||
|
|
"learning_rate": 5.2077174968191346e-05,
|
||
|
|
"loss": 0.1813,
|
||
|
|
"mean_token_accuracy": 0.9481654316186905,
|
||
|
|
"num_tokens": 138950.0,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.2601848617196083,
|
||
|
|
"epoch": 5.077419354838709,
|
||
|
|
"grad_norm": 2.173152446746826,
|
||
|
|
"learning_rate": 5.166191217488133e-05,
|
||
|
|
"loss": 0.1352,
|
||
|
|
"mean_token_accuracy": 0.9740329831838608,
|
||
|
|
"num_tokens": 139722.0,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.27228355780243874,
|
||
|
|
"epoch": 5.103225806451613,
|
||
|
|
"grad_norm": 2.206040859222412,
|
||
|
|
"learning_rate": 5.124653458690365e-05,
|
||
|
|
"loss": 0.1203,
|
||
|
|
"mean_token_accuracy": 0.9656965136528015,
|
||
|
|
"num_tokens": 140396.0,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.17130273580551147,
|
||
|
|
"epoch": 5.129032258064516,
|
||
|
|
"grad_norm": 2.000005006790161,
|
||
|
|
"learning_rate": 5.083107089598632e-05,
|
||
|
|
"loss": 0.0938,
|
||
|
|
"mean_token_accuracy": 0.9830586761236191,
|
||
|
|
"num_tokens": 140987.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.19337046518921852,
|
||
|
|
"epoch": 5.15483870967742,
|
||
|
|
"grad_norm": 2.180755376815796,
|
||
|
|
"learning_rate": 5.041554979980486e-05,
|
||
|
|
"loss": 0.092,
|
||
|
|
"mean_token_accuracy": 0.9733314365148544,
|
||
|
|
"num_tokens": 141517.0,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.16925612837076187,
|
||
|
|
"epoch": 5.180645161290323,
|
||
|
|
"grad_norm": 1.6496930122375488,
|
||
|
|
"learning_rate": 5e-05,
|
||
|
|
"loss": 0.0819,
|
||
|
|
"mean_token_accuracy": 0.9781141579151154,
|
||
|
|
"num_tokens": 142025.0,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.20112577825784683,
|
||
|
|
"epoch": 5.2064516129032254,
|
||
|
|
"grad_norm": 2.5295193195343018,
|
||
|
|
"learning_rate": 4.9584450200195156e-05,
|
||
|
|
"loss": 0.1113,
|
||
|
|
"mean_token_accuracy": 0.972536712884903,
|
||
|
|
"num_tokens": 142501.0,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.12446376867592335,
|
||
|
|
"epoch": 5.232258064516129,
|
||
|
|
"grad_norm": 1.8126459121704102,
|
||
|
|
"learning_rate": 4.9168929104013697e-05,
|
||
|
|
"loss": 0.1119,
|
||
|
|
"mean_token_accuracy": 0.9784018099308014,
|
||
|
|
"num_tokens": 142930.0,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3357328027486801,
|
||
|
|
"epoch": 5.258064516129032,
|
||
|
|
"grad_norm": 2.69579815864563,
|
||
|
|
"learning_rate": 4.875346541309637e-05,
|
||
|
|
"loss": 0.2933,
|
||
|
|
"mean_token_accuracy": 0.9279916733503342,
|
||
|
|
"num_tokens": 144619.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.27347391098737717,
|
||
|
|
"epoch": 5.283870967741936,
|
||
|
|
"grad_norm": 3.0113985538482666,
|
||
|
|
"learning_rate": 4.8338087825118675e-05,
|
||
|
|
"loss": 0.2147,
|
||
|
|
"mean_token_accuracy": 0.9462355375289917,
|
||
|
|
"num_tokens": 145485.0,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.18706193938851357,
|
||
|
|
"epoch": 5.309677419354839,
|
||
|
|
"grad_norm": 2.3350462913513184,
|
||
|
|
"learning_rate": 4.792282503180867e-05,
|
||
|
|
"loss": 0.1089,
|
||
|
|
"mean_token_accuracy": 0.9645346254110336,
|
||
|
|
"num_tokens": 146253.0,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.23134352639317513,
|
||
|
|
"epoch": 5.335483870967742,
|
||
|
|
"grad_norm": 2.53825306892395,
|
||
|
|
"learning_rate": 4.750770571696514e-05,
|
||
|
|
"loss": 0.139,
|
||
|
|
"mean_token_accuracy": 0.9644808024168015,
|
||
|
|
"num_tokens": 146961.0,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.18409648537635803,
|
||
|
|
"epoch": 5.361290322580645,
|
||
|
|
"grad_norm": 3.6751139163970947,
|
||
|
|
"learning_rate": 4.709275855447621e-05,
|
||
|
|
"loss": 0.1271,
|
||
|
|
"mean_token_accuracy": 0.9647018611431122,
|
||
|
|
"num_tokens": 147585.0,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.13805431686341763,
|
||
|
|
"epoch": 5.387096774193548,
|
||
|
|
"grad_norm": 2.252584218978882,
|
||
|
|
"learning_rate": 4.6678012206338793e-05,
|
||
|
|
"loss": 0.11,
|
||
|
|
"mean_token_accuracy": 0.9786661118268967,
|
||
|
|
"num_tokens": 148137.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.1293979026377201,
|
||
|
|
"epoch": 5.412903225806452,
|
||
|
|
"grad_norm": 3.228670358657837,
|
||
|
|
"learning_rate": 4.626349532067879e-05,
|
||
|
|
"loss": 0.1009,
|
||
|
|
"mean_token_accuracy": 0.9756647497415543,
|
||
|
|
"num_tokens": 148635.0,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.15355101972818375,
|
||
|
|
"epoch": 5.438709677419355,
|
||
|
|
"grad_norm": 2.5168986320495605,
|
||
|
|
"learning_rate": 4.584923652977224e-05,
|
||
|
|
"loss": 0.0966,
|
||
|
|
"mean_token_accuracy": 0.9696203321218491,
|
||
|
|
"num_tokens": 149098.0,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.12985192984342575,
|
||
|
|
"epoch": 5.464516129032258,
|
||
|
|
"grad_norm": 1.9614430665969849,
|
||
|
|
"learning_rate": 4.543526444806759e-05,
|
||
|
|
"loss": 0.0876,
|
||
|
|
"mean_token_accuracy": 0.9787871986627579,
|
||
|
|
"num_tokens": 149525.0,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41858533024787903,
|
||
|
|
"epoch": 5.490322580645161,
|
||
|
|
"grad_norm": 2.3210058212280273,
|
||
|
|
"learning_rate": 4.502160767020918e-05,
|
||
|
|
"loss": 0.3106,
|
||
|
|
"mean_token_accuracy": 0.9150111377239227,
|
||
|
|
"num_tokens": 151159.0,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.23978786170482635,
|
||
|
|
"epoch": 5.516129032258064,
|
||
|
|
"grad_norm": 2.6100656986236572,
|
||
|
|
"learning_rate": 4.4608294769062075e-05,
|
||
|
|
"loss": 0.131,
|
||
|
|
"mean_token_accuracy": 0.969085082411766,
|
||
|
|
"num_tokens": 151972.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.20062651857733727,
|
||
|
|
"epoch": 5.541935483870968,
|
||
|
|
"grad_norm": 2.6525464057922363,
|
||
|
|
"learning_rate": 4.4195354293738484e-05,
|
||
|
|
"loss": 0.1297,
|
||
|
|
"mean_token_accuracy": 0.9647854268550873,
|
||
|
|
"num_tokens": 152742.0,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.18283047527074814,
|
||
|
|
"epoch": 5.567741935483871,
|
||
|
|
"grad_norm": 1.9218651056289673,
|
||
|
|
"learning_rate": 4.378281476762576e-05,
|
||
|
|
"loss": 0.1113,
|
||
|
|
"mean_token_accuracy": 0.9758298695087433,
|
||
|
|
"num_tokens": 153456.0,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.17359177768230438,
|
||
|
|
"epoch": 5.593548387096774,
|
||
|
|
"grad_norm": 2.074409008026123,
|
||
|
|
"learning_rate": 4.337070468641604e-05,
|
||
|
|
"loss": 0.1127,
|
||
|
|
"mean_token_accuracy": 0.9679757952690125,
|
||
|
|
"num_tokens": 154114.0,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.15452994219958782,
|
||
|
|
"epoch": 5.619354838709677,
|
||
|
|
"grad_norm": 1.4686728715896606,
|
||
|
|
"learning_rate": 4.295905251613817e-05,
|
||
|
|
"loss": 0.083,
|
||
|
|
"mean_token_accuracy": 0.9716224670410156,
|
||
|
|
"num_tokens": 154710.0,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.1461981236934662,
|
||
|
|
"epoch": 5.645161290322581,
|
||
|
|
"grad_norm": 2.090766191482544,
|
||
|
|
"learning_rate": 4.254788669119127e-05,
|
||
|
|
"loss": 0.0915,
|
||
|
|
"mean_token_accuracy": 0.9731487780809402,
|
||
|
|
"num_tokens": 155272.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.14948130398988724,
|
||
|
|
"epoch": 5.670967741935484,
|
||
|
|
"grad_norm": 2.874465227127075,
|
||
|
|
"learning_rate": 4.213723561238074e-05,
|
||
|
|
"loss": 0.1213,
|
||
|
|
"mean_token_accuracy": 0.9657130539417267,
|
||
|
|
"num_tokens": 155765.0,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.147341663017869,
|
||
|
|
"epoch": 5.6967741935483875,
|
||
|
|
"grad_norm": 2.8784825801849365,
|
||
|
|
"learning_rate": 4.172712764495644e-05,
|
||
|
|
"loss": 0.1131,
|
||
|
|
"mean_token_accuracy": 0.9677340090274811,
|
||
|
|
"num_tokens": 156170.0,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.37899941951036453,
|
||
|
|
"epoch": 5.72258064516129,
|
||
|
|
"grad_norm": 2.1102116107940674,
|
||
|
|
"learning_rate": 4.131759111665349e-05,
|
||
|
|
"loss": 0.2919,
|
||
|
|
"mean_token_accuracy": 0.9289288818836212,
|
||
|
|
"num_tokens": 157544.0,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.1955309621989727,
|
||
|
|
"epoch": 5.748387096774193,
|
||
|
|
"grad_norm": 2.2968599796295166,
|
||
|
|
"learning_rate": 4.0908654315735466e-05,
|
||
|
|
"loss": 0.1214,
|
||
|
|
"mean_token_accuracy": 0.9681131392717361,
|
||
|
|
"num_tokens": 158450.0,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.19111444801092148,
|
||
|
|
"epoch": 5.774193548387097,
|
||
|
|
"grad_norm": 2.6387436389923096,
|
||
|
|
"learning_rate": 4.0500345489040515e-05,
|
||
|
|
"loss": 0.1412,
|
||
|
|
"mean_token_accuracy": 0.9579745233058929,
|
||
|
|
"num_tokens": 159264.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.1776861809194088,
|
||
|
|
"epoch": 5.8,
|
||
|
|
"grad_norm": 2.6175966262817383,
|
||
|
|
"learning_rate": 4.0092692840030134e-05,
|
||
|
|
"loss": 0.1223,
|
||
|
|
"mean_token_accuracy": 0.9692755341529846,
|
||
|
|
"num_tokens": 159933.0,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.15603690408170223,
|
||
|
|
"epoch": 5.825806451612904,
|
||
|
|
"grad_norm": 2.4090588092803955,
|
||
|
|
"learning_rate": 3.968572452684113e-05,
|
||
|
|
"loss": 0.1004,
|
||
|
|
"mean_token_accuracy": 0.9694436490535736,
|
||
|
|
"num_tokens": 160526.0,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.14687431044876575,
|
||
|
|
"epoch": 5.851612903225806,
|
||
|
|
"grad_norm": 2.5552449226379395,
|
||
|
|
"learning_rate": 3.9279468660340626e-05,
|
||
|
|
"loss": 0.1015,
|
||
|
|
"mean_token_accuracy": 0.9700941145420074,
|
||
|
|
"num_tokens": 161001.0,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.14584726840257645,
|
||
|
|
"epoch": 5.877419354838709,
|
||
|
|
"grad_norm": 2.417149782180786,
|
||
|
|
"learning_rate": 3.887395330218429e-05,
|
||
|
|
"loss": 0.1257,
|
||
|
|
"mean_token_accuracy": 0.969669446349144,
|
||
|
|
"num_tokens": 161434.0,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.12778180465102196,
|
||
|
|
"epoch": 5.903225806451613,
|
||
|
|
"grad_norm": 1.2179059982299805,
|
||
|
|
"learning_rate": 3.846920646287799e-05,
|
||
|
|
"loss": 0.0758,
|
||
|
|
"mean_token_accuracy": 0.9738518297672272,
|
||
|
|
"num_tokens": 161858.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.1522289477288723,
|
||
|
|
"epoch": 5.929032258064516,
|
||
|
|
"grad_norm": 2.0130059719085693,
|
||
|
|
"learning_rate": 3.806525609984312e-05,
|
||
|
|
"loss": 0.1062,
|
||
|
|
"mean_token_accuracy": 0.9636791348457336,
|
||
|
|
"num_tokens": 162250.0,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.23178323358297348,
|
||
|
|
"epoch": 5.95483870967742,
|
||
|
|
"grad_norm": 2.4759209156036377,
|
||
|
|
"learning_rate": 3.7662130115485314e-05,
|
||
|
|
"loss": 0.1228,
|
||
|
|
"mean_token_accuracy": 0.9636365175247192,
|
||
|
|
"num_tokens": 163108.0,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.16584154963493347,
|
||
|
|
"epoch": 5.980645161290322,
|
||
|
|
"grad_norm": 2.447923421859741,
|
||
|
|
"learning_rate": 3.7259856355267273e-05,
|
||
|
|
"loss": 0.1304,
|
||
|
|
"mean_token_accuracy": 0.9603947103023529,
|
||
|
|
"num_tokens": 163768.0,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.12077461183071136,
|
||
|
|
"epoch": 6.0,
|
||
|
|
"grad_norm": 3.7384884357452393,
|
||
|
|
"learning_rate": 3.685846260578524e-05,
|
||
|
|
"loss": 0.0966,
|
||
|
|
"mean_token_accuracy": 0.9680581092834473,
|
||
|
|
"num_tokens": 164118.0,
|
||
|
|
"step": 234
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 390,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 7434558634475520.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|