6046 lines
164 KiB
JSON
6046 lines
164 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.2520478890989288,
|
||
|
|
"eval_steps": 3000,
|
||
|
|
"global_step": 3000,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 10.69201192855835,
|
||
|
|
"epoch": 0.0004200798151648813,
|
||
|
|
"grad_norm": 13.375,
|
||
|
|
"learning_rate": 2e-06,
|
||
|
|
"loss": 10.8001,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 8348.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.691978454589844,
|
||
|
|
"epoch": 0.0008401596303297626,
|
||
|
|
"grad_norm": 12.5,
|
||
|
|
"learning_rate": 4.5e-06,
|
||
|
|
"loss": 10.7548,
|
||
|
|
"mean_token_accuracy": 0.00010881392518058419,
|
||
|
|
"num_tokens": 17465.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.691164684295654,
|
||
|
|
"epoch": 0.001260239445494644,
|
||
|
|
"grad_norm": 9.9375,
|
||
|
|
"learning_rate": 7e-06,
|
||
|
|
"loss": 10.5365,
|
||
|
|
"mean_token_accuracy": 0.021085147676058114,
|
||
|
|
"num_tokens": 26627.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.678658771514893,
|
||
|
|
"epoch": 0.0016803192606595252,
|
||
|
|
"grad_norm": 6.46875,
|
||
|
|
"learning_rate": 9.5e-06,
|
||
|
|
"loss": 10.2026,
|
||
|
|
"mean_token_accuracy": 0.046403773874044416,
|
||
|
|
"num_tokens": 36069.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.598964595794678,
|
||
|
|
"epoch": 0.002100399075824407,
|
||
|
|
"grad_norm": 4.46875,
|
||
|
|
"learning_rate": 1.2e-05,
|
||
|
|
"loss": 9.8984,
|
||
|
|
"mean_token_accuracy": 0.04546841159462929,
|
||
|
|
"num_tokens": 44967.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.592682838439941,
|
||
|
|
"epoch": 0.002520478890989288,
|
||
|
|
"grad_norm": 3.25,
|
||
|
|
"learning_rate": 1.4500000000000002e-05,
|
||
|
|
"loss": 9.8253,
|
||
|
|
"mean_token_accuracy": 0.04163686409592628,
|
||
|
|
"num_tokens": 55132.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.616032028198243,
|
||
|
|
"epoch": 0.0029405587061541692,
|
||
|
|
"grad_norm": 2.734375,
|
||
|
|
"learning_rate": 1.7000000000000003e-05,
|
||
|
|
"loss": 9.6909,
|
||
|
|
"mean_token_accuracy": 0.04541983306407928,
|
||
|
|
"num_tokens": 65141.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.587666893005371,
|
||
|
|
"epoch": 0.0033606385213190504,
|
||
|
|
"grad_norm": 2.453125,
|
||
|
|
"learning_rate": 1.95e-05,
|
||
|
|
"loss": 9.6967,
|
||
|
|
"mean_token_accuracy": 0.040509892627596855,
|
||
|
|
"num_tokens": 74007.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.587863063812256,
|
||
|
|
"epoch": 0.003780718336483932,
|
||
|
|
"grad_norm": 2.453125,
|
||
|
|
"learning_rate": 2.2e-05,
|
||
|
|
"loss": 9.6278,
|
||
|
|
"mean_token_accuracy": 0.04380051270127296,
|
||
|
|
"num_tokens": 83736.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.581284713745116,
|
||
|
|
"epoch": 0.004200798151648814,
|
||
|
|
"grad_norm": 2.359375,
|
||
|
|
"learning_rate": 2.4500000000000003e-05,
|
||
|
|
"loss": 9.5554,
|
||
|
|
"mean_token_accuracy": 0.04462047629058361,
|
||
|
|
"num_tokens": 92525.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.579821586608887,
|
||
|
|
"epoch": 0.004620877966813695,
|
||
|
|
"grad_norm": 2.515625,
|
||
|
|
"learning_rate": 2.7e-05,
|
||
|
|
"loss": 9.5042,
|
||
|
|
"mean_token_accuracy": 0.0499776991084218,
|
||
|
|
"num_tokens": 102015.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.527470588684082,
|
||
|
|
"epoch": 0.005040957781978576,
|
||
|
|
"grad_norm": 2.203125,
|
||
|
|
"learning_rate": 2.95e-05,
|
||
|
|
"loss": 9.4648,
|
||
|
|
"mean_token_accuracy": 0.05102687180042267,
|
||
|
|
"num_tokens": 110887.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.398450374603271,
|
||
|
|
"epoch": 0.005461037597143457,
|
||
|
|
"grad_norm": 2.265625,
|
||
|
|
"learning_rate": 3.2e-05,
|
||
|
|
"loss": 9.3768,
|
||
|
|
"mean_token_accuracy": 0.05401572398841381,
|
||
|
|
"num_tokens": 120442.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.466637897491456,
|
||
|
|
"epoch": 0.0058811174123083385,
|
||
|
|
"grad_norm": 2.34375,
|
||
|
|
"learning_rate": 3.4500000000000005e-05,
|
||
|
|
"loss": 9.2516,
|
||
|
|
"mean_token_accuracy": 0.05276094898581505,
|
||
|
|
"num_tokens": 129297.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.477723217010498,
|
||
|
|
"epoch": 0.00630119722747322,
|
||
|
|
"grad_norm": 2.1875,
|
||
|
|
"learning_rate": 3.7e-05,
|
||
|
|
"loss": 9.1585,
|
||
|
|
"mean_token_accuracy": 0.05686353407800197,
|
||
|
|
"num_tokens": 138305.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.401033782958985,
|
||
|
|
"epoch": 0.006721277042638101,
|
||
|
|
"grad_norm": 2.3125,
|
||
|
|
"learning_rate": 3.95e-05,
|
||
|
|
"loss": 9.0976,
|
||
|
|
"mean_token_accuracy": 0.055690228939056396,
|
||
|
|
"num_tokens": 147640.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.44783878326416,
|
||
|
|
"epoch": 0.007141356857802983,
|
||
|
|
"grad_norm": 2.1875,
|
||
|
|
"learning_rate": 4.2000000000000004e-05,
|
||
|
|
"loss": 8.9803,
|
||
|
|
"mean_token_accuracy": 0.05669833719730377,
|
||
|
|
"num_tokens": 157633.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.396310806274414,
|
||
|
|
"epoch": 0.007561436672967864,
|
||
|
|
"grad_norm": 1.921875,
|
||
|
|
"learning_rate": 4.45e-05,
|
||
|
|
"loss": 8.9499,
|
||
|
|
"mean_token_accuracy": 0.05056734494864941,
|
||
|
|
"num_tokens": 167984.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.333494663238525,
|
||
|
|
"epoch": 0.007981516488132745,
|
||
|
|
"grad_norm": 1.90625,
|
||
|
|
"learning_rate": 4.7000000000000004e-05,
|
||
|
|
"loss": 8.8301,
|
||
|
|
"mean_token_accuracy": 0.06639725379645825,
|
||
|
|
"num_tokens": 176984.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.28737268447876,
|
||
|
|
"epoch": 0.008401596303297627,
|
||
|
|
"grad_norm": 2.171875,
|
||
|
|
"learning_rate": 4.9500000000000004e-05,
|
||
|
|
"loss": 8.654,
|
||
|
|
"mean_token_accuracy": 0.06538619883358479,
|
||
|
|
"num_tokens": 185931.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.208460235595703,
|
||
|
|
"epoch": 0.008821676118462508,
|
||
|
|
"grad_norm": 2.921875,
|
||
|
|
"learning_rate": 5.2e-05,
|
||
|
|
"loss": 8.6478,
|
||
|
|
"mean_token_accuracy": 0.050938266515731814,
|
||
|
|
"num_tokens": 195065.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.092334175109864,
|
||
|
|
"epoch": 0.00924175593362739,
|
||
|
|
"grad_norm": 1.9453125,
|
||
|
|
"learning_rate": 5.45e-05,
|
||
|
|
"loss": 8.5099,
|
||
|
|
"mean_token_accuracy": 0.06477361544966698,
|
||
|
|
"num_tokens": 203687.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.105284690856934,
|
||
|
|
"epoch": 0.00966183574879227,
|
||
|
|
"grad_norm": 1.9296875,
|
||
|
|
"learning_rate": 5.7e-05,
|
||
|
|
"loss": 8.4081,
|
||
|
|
"mean_token_accuracy": 0.0666894868016243,
|
||
|
|
"num_tokens": 212847.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.957781219482422,
|
||
|
|
"epoch": 0.010081915563957152,
|
||
|
|
"grad_norm": 1.71875,
|
||
|
|
"learning_rate": 5.9499999999999996e-05,
|
||
|
|
"loss": 8.3004,
|
||
|
|
"mean_token_accuracy": 0.0674133587628603,
|
||
|
|
"num_tokens": 222593.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.889359092712402,
|
||
|
|
"epoch": 0.010501995379122032,
|
||
|
|
"grad_norm": 1.6953125,
|
||
|
|
"learning_rate": 6.2e-05,
|
||
|
|
"loss": 8.129,
|
||
|
|
"mean_token_accuracy": 0.07197456955909728,
|
||
|
|
"num_tokens": 231174.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.669556808471679,
|
||
|
|
"epoch": 0.010922075194286915,
|
||
|
|
"grad_norm": 1.703125,
|
||
|
|
"learning_rate": 6.450000000000001e-05,
|
||
|
|
"loss": 7.9843,
|
||
|
|
"mean_token_accuracy": 0.07425511926412583,
|
||
|
|
"num_tokens": 239833.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.519672775268555,
|
||
|
|
"epoch": 0.011342155009451797,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 6.7e-05,
|
||
|
|
"loss": 8.0143,
|
||
|
|
"mean_token_accuracy": 0.07254141308367253,
|
||
|
|
"num_tokens": 248794.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.303325176239014,
|
||
|
|
"epoch": 0.011762234824616677,
|
||
|
|
"grad_norm": 1.6953125,
|
||
|
|
"learning_rate": 6.950000000000001e-05,
|
||
|
|
"loss": 7.9537,
|
||
|
|
"mean_token_accuracy": 0.07010119631886483,
|
||
|
|
"num_tokens": 257123.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.143257808685302,
|
||
|
|
"epoch": 0.012182314639781559,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 7.2e-05,
|
||
|
|
"loss": 7.6458,
|
||
|
|
"mean_token_accuracy": 0.07959595024585724,
|
||
|
|
"num_tokens": 266088.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.888239574432372,
|
||
|
|
"epoch": 0.01260239445494644,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 7.45e-05,
|
||
|
|
"loss": 7.8236,
|
||
|
|
"mean_token_accuracy": 0.07102414257824421,
|
||
|
|
"num_tokens": 276074.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.727731895446777,
|
||
|
|
"epoch": 0.013022474270111321,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 7.7e-05,
|
||
|
|
"loss": 7.7082,
|
||
|
|
"mean_token_accuracy": 0.07570267021656037,
|
||
|
|
"num_tokens": 285280.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.563877964019776,
|
||
|
|
"epoch": 0.013442554085276202,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 7.950000000000001e-05,
|
||
|
|
"loss": 7.6962,
|
||
|
|
"mean_token_accuracy": 0.06895132511854171,
|
||
|
|
"num_tokens": 296115.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.412875747680664,
|
||
|
|
"epoch": 0.013862633900441084,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 8.2e-05,
|
||
|
|
"loss": 7.5497,
|
||
|
|
"mean_token_accuracy": 0.07601302340626717,
|
||
|
|
"num_tokens": 305483.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.340911769866944,
|
||
|
|
"epoch": 0.014282713715605966,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 8.450000000000001e-05,
|
||
|
|
"loss": 7.5593,
|
||
|
|
"mean_token_accuracy": 0.07040085420012474,
|
||
|
|
"num_tokens": 314000.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.245043659210205,
|
||
|
|
"epoch": 0.014702793530770846,
|
||
|
|
"grad_norm": 1.5234375,
|
||
|
|
"learning_rate": 8.7e-05,
|
||
|
|
"loss": 7.5541,
|
||
|
|
"mean_token_accuracy": 0.07777635231614113,
|
||
|
|
"num_tokens": 323667.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.15629415512085,
|
||
|
|
"epoch": 0.015122873345935728,
|
||
|
|
"grad_norm": 1.4296875,
|
||
|
|
"learning_rate": 8.95e-05,
|
||
|
|
"loss": 7.5554,
|
||
|
|
"mean_token_accuracy": 0.07515333034098148,
|
||
|
|
"num_tokens": 332695.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.065321111679078,
|
||
|
|
"epoch": 0.015542953161100609,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 9.2e-05,
|
||
|
|
"loss": 7.3947,
|
||
|
|
"mean_token_accuracy": 0.07709791958332061,
|
||
|
|
"num_tokens": 342428.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.054158020019532,
|
||
|
|
"epoch": 0.01596303297626549,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 9.45e-05,
|
||
|
|
"loss": 7.5079,
|
||
|
|
"mean_token_accuracy": 0.0735605925321579,
|
||
|
|
"num_tokens": 353587.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.988022661209106,
|
||
|
|
"epoch": 0.01638311279143037,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 9.7e-05,
|
||
|
|
"loss": 7.443,
|
||
|
|
"mean_token_accuracy": 0.07551693692803382,
|
||
|
|
"num_tokens": 362997.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.02585473060608,
|
||
|
|
"epoch": 0.016803192606595255,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 9.95e-05,
|
||
|
|
"loss": 7.4821,
|
||
|
|
"mean_token_accuracy": 0.07873391062021255,
|
||
|
|
"num_tokens": 372346.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.984146022796631,
|
||
|
|
"epoch": 0.017223272421760135,
|
||
|
|
"grad_norm": 1.65625,
|
||
|
|
"learning_rate": 0.000102,
|
||
|
|
"loss": 7.3473,
|
||
|
|
"mean_token_accuracy": 0.07624267861247062,
|
||
|
|
"num_tokens": 381575.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.912975454330445,
|
||
|
|
"epoch": 0.017643352236925015,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00010449999999999999,
|
||
|
|
"loss": 7.4236,
|
||
|
|
"mean_token_accuracy": 0.0766436841338873,
|
||
|
|
"num_tokens": 390706.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.888600492477417,
|
||
|
|
"epoch": 0.018063432052089896,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 0.000107,
|
||
|
|
"loss": 7.4209,
|
||
|
|
"mean_token_accuracy": 0.0734835498034954,
|
||
|
|
"num_tokens": 400000.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.803367996215821,
|
||
|
|
"epoch": 0.01848351186725478,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.0001095,
|
||
|
|
"loss": 7.3774,
|
||
|
|
"mean_token_accuracy": 0.08182684779167175,
|
||
|
|
"num_tokens": 409447.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.875886058807373,
|
||
|
|
"epoch": 0.01890359168241966,
|
||
|
|
"grad_norm": 1.4921875,
|
||
|
|
"learning_rate": 0.000112,
|
||
|
|
"loss": 7.3393,
|
||
|
|
"mean_token_accuracy": 0.08449244052171707,
|
||
|
|
"num_tokens": 418417.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.78724856376648,
|
||
|
|
"epoch": 0.01932367149758454,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 0.0001145,
|
||
|
|
"loss": 7.3048,
|
||
|
|
"mean_token_accuracy": 0.08006256446242332,
|
||
|
|
"num_tokens": 427619.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.736767053604126,
|
||
|
|
"epoch": 0.019743751312749424,
|
||
|
|
"grad_norm": 1.421875,
|
||
|
|
"learning_rate": 0.00011700000000000001,
|
||
|
|
"loss": 7.372,
|
||
|
|
"mean_token_accuracy": 0.07579129710793495,
|
||
|
|
"num_tokens": 437931.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.841858673095703,
|
||
|
|
"epoch": 0.020163831127914304,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.00011949999999999999,
|
||
|
|
"loss": 7.4001,
|
||
|
|
"mean_token_accuracy": 0.08351109325885772,
|
||
|
|
"num_tokens": 447595.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.7983135223388675,
|
||
|
|
"epoch": 0.020583910943079185,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.000122,
|
||
|
|
"loss": 7.2633,
|
||
|
|
"mean_token_accuracy": 0.07488272562623025,
|
||
|
|
"num_tokens": 457062.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.813820743560791,
|
||
|
|
"epoch": 0.021003990758244065,
|
||
|
|
"grad_norm": 1.46875,
|
||
|
|
"learning_rate": 0.0001245,
|
||
|
|
"loss": 7.3567,
|
||
|
|
"mean_token_accuracy": 0.07759504988789559,
|
||
|
|
"num_tokens": 466191.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.757200431823731,
|
||
|
|
"epoch": 0.02142407057340895,
|
||
|
|
"grad_norm": 1.484375,
|
||
|
|
"learning_rate": 0.000127,
|
||
|
|
"loss": 7.3146,
|
||
|
|
"mean_token_accuracy": 0.08031945005059242,
|
||
|
|
"num_tokens": 475693.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.7279805660247805,
|
||
|
|
"epoch": 0.02184415038857383,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.0001295,
|
||
|
|
"loss": 7.3269,
|
||
|
|
"mean_token_accuracy": 0.08141026981174945,
|
||
|
|
"num_tokens": 485173.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.724671411514282,
|
||
|
|
"epoch": 0.02226423020373871,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.000132,
|
||
|
|
"loss": 7.2369,
|
||
|
|
"mean_token_accuracy": 0.083962532132864,
|
||
|
|
"num_tokens": 493985.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.6601485252380375,
|
||
|
|
"epoch": 0.022684310018903593,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.00013450000000000002,
|
||
|
|
"loss": 7.2687,
|
||
|
|
"mean_token_accuracy": 0.08190520852804184,
|
||
|
|
"num_tokens": 502837.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.751116943359375,
|
||
|
|
"epoch": 0.023104389834068473,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.00013700000000000002,
|
||
|
|
"loss": 7.2065,
|
||
|
|
"mean_token_accuracy": 0.0843705341219902,
|
||
|
|
"num_tokens": 511503.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.717013120651245,
|
||
|
|
"epoch": 0.023524469649233354,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.0001395,
|
||
|
|
"loss": 7.4058,
|
||
|
|
"mean_token_accuracy": 0.08034609854221345,
|
||
|
|
"num_tokens": 521499.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.592406368255615,
|
||
|
|
"epoch": 0.023944549464398234,
|
||
|
|
"grad_norm": 1.3984375,
|
||
|
|
"learning_rate": 0.00014199999999999998,
|
||
|
|
"loss": 7.166,
|
||
|
|
"mean_token_accuracy": 0.08277052193880081,
|
||
|
|
"num_tokens": 530067.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.6297852993011475,
|
||
|
|
"epoch": 0.024364629279563118,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0001445,
|
||
|
|
"loss": 7.1721,
|
||
|
|
"mean_token_accuracy": 0.08475914299488067,
|
||
|
|
"num_tokens": 538559.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.705462646484375,
|
||
|
|
"epoch": 0.024784709094728,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000147,
|
||
|
|
"loss": 7.3653,
|
||
|
|
"mean_token_accuracy": 0.07328721843659877,
|
||
|
|
"num_tokens": 547288.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.596541261672973,
|
||
|
|
"epoch": 0.02520478890989288,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0001495,
|
||
|
|
"loss": 7.2357,
|
||
|
|
"mean_token_accuracy": 0.07816045507788658,
|
||
|
|
"num_tokens": 557269.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.701767444610596,
|
||
|
|
"epoch": 0.025624868725057762,
|
||
|
|
"grad_norm": 1.6953125,
|
||
|
|
"learning_rate": 0.000152,
|
||
|
|
"loss": 7.2628,
|
||
|
|
"mean_token_accuracy": 0.07311495915055274,
|
||
|
|
"num_tokens": 567280.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.602482271194458,
|
||
|
|
"epoch": 0.026044948540222643,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00015450000000000001,
|
||
|
|
"loss": 7.0908,
|
||
|
|
"mean_token_accuracy": 0.08299101889133453,
|
||
|
|
"num_tokens": 576609.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.399111747741699,
|
||
|
|
"epoch": 0.026465028355387523,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000157,
|
||
|
|
"loss": 7.0032,
|
||
|
|
"mean_token_accuracy": 0.09095181971788406,
|
||
|
|
"num_tokens": 586053.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.507453203201294,
|
||
|
|
"epoch": 0.026885108170552403,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0001595,
|
||
|
|
"loss": 7.203,
|
||
|
|
"mean_token_accuracy": 0.08823259696364402,
|
||
|
|
"num_tokens": 594649.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.599713850021362,
|
||
|
|
"epoch": 0.027305187985717287,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 0.000162,
|
||
|
|
"loss": 7.1383,
|
||
|
|
"mean_token_accuracy": 0.08195743858814239,
|
||
|
|
"num_tokens": 603445.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.587759685516358,
|
||
|
|
"epoch": 0.027725267800882167,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.00016450000000000001,
|
||
|
|
"loss": 7.2543,
|
||
|
|
"mean_token_accuracy": 0.07800514288246632,
|
||
|
|
"num_tokens": 613611.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.745543384552002,
|
||
|
|
"epoch": 0.028145347616047048,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.00016700000000000002,
|
||
|
|
"loss": 7.429,
|
||
|
|
"mean_token_accuracy": 0.07839688062667846,
|
||
|
|
"num_tokens": 623024.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.4431709289550785,
|
||
|
|
"epoch": 0.02856542743121193,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.00016950000000000003,
|
||
|
|
"loss": 7.1028,
|
||
|
|
"mean_token_accuracy": 0.08672705665230751,
|
||
|
|
"num_tokens": 631624.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.574361371994018,
|
||
|
|
"epoch": 0.028985507246376812,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.00017199999999999998,
|
||
|
|
"loss": 7.0557,
|
||
|
|
"mean_token_accuracy": 0.08923942148685456,
|
||
|
|
"num_tokens": 640473.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.541849613189697,
|
||
|
|
"epoch": 0.029405587061541692,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.00017449999999999999,
|
||
|
|
"loss": 7.2383,
|
||
|
|
"mean_token_accuracy": 0.08173563033342361,
|
||
|
|
"num_tokens": 649692.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.571516275405884,
|
||
|
|
"epoch": 0.029825666876706573,
|
||
|
|
"grad_norm": 1.484375,
|
||
|
|
"learning_rate": 0.000177,
|
||
|
|
"loss": 7.1875,
|
||
|
|
"mean_token_accuracy": 0.08110572174191474,
|
||
|
|
"num_tokens": 658236.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.34685640335083,
|
||
|
|
"epoch": 0.030245746691871456,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.0001795,
|
||
|
|
"loss": 6.9645,
|
||
|
|
"mean_token_accuracy": 0.08569629490375519,
|
||
|
|
"num_tokens": 667175.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.556408214569092,
|
||
|
|
"epoch": 0.030665826507036337,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.000182,
|
||
|
|
"loss": 7.2834,
|
||
|
|
"mean_token_accuracy": 0.08148858584463596,
|
||
|
|
"num_tokens": 676456.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.606632947921753,
|
||
|
|
"epoch": 0.031085906322201217,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0001845,
|
||
|
|
"loss": 7.2448,
|
||
|
|
"mean_token_accuracy": 0.08052070513367653,
|
||
|
|
"num_tokens": 686881.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.371811389923096,
|
||
|
|
"epoch": 0.0315059861373661,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000187,
|
||
|
|
"loss": 7.0307,
|
||
|
|
"mean_token_accuracy": 0.08108055517077446,
|
||
|
|
"num_tokens": 696045.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.382633686065674,
|
||
|
|
"epoch": 0.03192606595253098,
|
||
|
|
"grad_norm": 1.359375,
|
||
|
|
"learning_rate": 0.0001895,
|
||
|
|
"loss": 7.003,
|
||
|
|
"mean_token_accuracy": 0.09089459106326103,
|
||
|
|
"num_tokens": 704729.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.353933048248291,
|
||
|
|
"epoch": 0.032346145767695865,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000192,
|
||
|
|
"loss": 7.0639,
|
||
|
|
"mean_token_accuracy": 0.08123919740319252,
|
||
|
|
"num_tokens": 714331.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.430750465393066,
|
||
|
|
"epoch": 0.03276622558286074,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.0001945,
|
||
|
|
"loss": 7.0163,
|
||
|
|
"mean_token_accuracy": 0.08898987770080566,
|
||
|
|
"num_tokens": 722788.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.388132476806641,
|
||
|
|
"epoch": 0.033186305398025626,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.00019700000000000002,
|
||
|
|
"loss": 7.0996,
|
||
|
|
"mean_token_accuracy": 0.0889863982796669,
|
||
|
|
"num_tokens": 731417.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.394377708435059,
|
||
|
|
"epoch": 0.03360638521319051,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00019950000000000002,
|
||
|
|
"loss": 7.0686,
|
||
|
|
"mean_token_accuracy": 0.0865507885813713,
|
||
|
|
"num_tokens": 741034.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.370957660675049,
|
||
|
|
"epoch": 0.034026465028355386,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000202,
|
||
|
|
"loss": 7.063,
|
||
|
|
"mean_token_accuracy": 0.08408316597342491,
|
||
|
|
"num_tokens": 749596.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.360737991333008,
|
||
|
|
"epoch": 0.03444654484352027,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00020449999999999998,
|
||
|
|
"loss": 7.0166,
|
||
|
|
"mean_token_accuracy": 0.08443826884031295,
|
||
|
|
"num_tokens": 758931.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.253893661499023,
|
||
|
|
"epoch": 0.03486662465868515,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000207,
|
||
|
|
"loss": 6.9221,
|
||
|
|
"mean_token_accuracy": 0.08874604031443596,
|
||
|
|
"num_tokens": 767534.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.336139726638794,
|
||
|
|
"epoch": 0.03528670447385003,
|
||
|
|
"grad_norm": 1.28125,
|
||
|
|
"learning_rate": 0.0002095,
|
||
|
|
"loss": 6.9742,
|
||
|
|
"mean_token_accuracy": 0.08901742175221443,
|
||
|
|
"num_tokens": 776456.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.32063570022583,
|
||
|
|
"epoch": 0.035706784289014915,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.000212,
|
||
|
|
"loss": 7.0512,
|
||
|
|
"mean_token_accuracy": 0.0825334556400776,
|
||
|
|
"num_tokens": 786172.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2836973667144775,
|
||
|
|
"epoch": 0.03612686410417979,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.0002145,
|
||
|
|
"loss": 6.9281,
|
||
|
|
"mean_token_accuracy": 0.09393875077366828,
|
||
|
|
"num_tokens": 795081.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.279390621185303,
|
||
|
|
"epoch": 0.036546943919344675,
|
||
|
|
"grad_norm": 1.3828125,
|
||
|
|
"learning_rate": 0.00021700000000000002,
|
||
|
|
"loss": 6.9729,
|
||
|
|
"mean_token_accuracy": 0.08336275964975357,
|
||
|
|
"num_tokens": 804259.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.3233130931854244,
|
||
|
|
"epoch": 0.03696702373450956,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0002195,
|
||
|
|
"loss": 6.9836,
|
||
|
|
"mean_token_accuracy": 0.08346287980675697,
|
||
|
|
"num_tokens": 813463.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.265643119812012,
|
||
|
|
"epoch": 0.037387103549674436,
|
||
|
|
"grad_norm": 1.3125,
|
||
|
|
"learning_rate": 0.000222,
|
||
|
|
"loss": 6.915,
|
||
|
|
"mean_token_accuracy": 0.09436434507369995,
|
||
|
|
"num_tokens": 823029.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2830162525177,
|
||
|
|
"epoch": 0.03780718336483932,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0002245,
|
||
|
|
"loss": 6.9822,
|
||
|
|
"mean_token_accuracy": 0.08020757511258125,
|
||
|
|
"num_tokens": 832902.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.172808027267456,
|
||
|
|
"epoch": 0.0382272631800042,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00022700000000000002,
|
||
|
|
"loss": 6.9269,
|
||
|
|
"mean_token_accuracy": 0.08937018439173698,
|
||
|
|
"num_tokens": 842162.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.261403322219849,
|
||
|
|
"epoch": 0.03864734299516908,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.00022950000000000002,
|
||
|
|
"loss": 6.9709,
|
||
|
|
"mean_token_accuracy": 0.09120814129710197,
|
||
|
|
"num_tokens": 852328.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.207744789123535,
|
||
|
|
"epoch": 0.039067422810333964,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00023200000000000003,
|
||
|
|
"loss": 6.9283,
|
||
|
|
"mean_token_accuracy": 0.08966456726193428,
|
||
|
|
"num_tokens": 860929.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.253277540206909,
|
||
|
|
"epoch": 0.03948750262549885,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.00023449999999999998,
|
||
|
|
"loss": 7.0043,
|
||
|
|
"mean_token_accuracy": 0.0854820430278778,
|
||
|
|
"num_tokens": 869144.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.303921031951904,
|
||
|
|
"epoch": 0.039907582440663725,
|
||
|
|
"grad_norm": 1.3671875,
|
||
|
|
"learning_rate": 0.000237,
|
||
|
|
"loss": 6.9451,
|
||
|
|
"mean_token_accuracy": 0.09673570543527603,
|
||
|
|
"num_tokens": 877447.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.20126519203186,
|
||
|
|
"epoch": 0.04032766225582861,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0002395,
|
||
|
|
"loss": 6.9017,
|
||
|
|
"mean_token_accuracy": 0.08463463708758354,
|
||
|
|
"num_tokens": 887020.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1618622779846195,
|
||
|
|
"epoch": 0.040747742070993485,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.000242,
|
||
|
|
"loss": 6.9503,
|
||
|
|
"mean_token_accuracy": 0.08903224021196365,
|
||
|
|
"num_tokens": 895937.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.172050189971924,
|
||
|
|
"epoch": 0.04116782188615837,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0002445,
|
||
|
|
"loss": 6.9573,
|
||
|
|
"mean_token_accuracy": 0.08436014279723167,
|
||
|
|
"num_tokens": 905446.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1261190414428714,
|
||
|
|
"epoch": 0.04158790170132325,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.000247,
|
||
|
|
"loss": 6.8507,
|
||
|
|
"mean_token_accuracy": 0.09782563373446465,
|
||
|
|
"num_tokens": 914547.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.219514274597168,
|
||
|
|
"epoch": 0.04200798151648813,
|
||
|
|
"grad_norm": 1.3515625,
|
||
|
|
"learning_rate": 0.0002495,
|
||
|
|
"loss": 6.8597,
|
||
|
|
"mean_token_accuracy": 0.09429225027561187,
|
||
|
|
"num_tokens": 922900.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.174054384231567,
|
||
|
|
"epoch": 0.042428061331653014,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.000252,
|
||
|
|
"loss": 6.9026,
|
||
|
|
"mean_token_accuracy": 0.09461246877908706,
|
||
|
|
"num_tokens": 930876.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.149679851531983,
|
||
|
|
"epoch": 0.0428481411468179,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0002545,
|
||
|
|
"loss": 6.9327,
|
||
|
|
"mean_token_accuracy": 0.09384474828839302,
|
||
|
|
"num_tokens": 939871.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1536510467529295,
|
||
|
|
"epoch": 0.043268220961982774,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.000257,
|
||
|
|
"loss": 6.9204,
|
||
|
|
"mean_token_accuracy": 0.08957441225647926,
|
||
|
|
"num_tokens": 948673.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.07887830734253,
|
||
|
|
"epoch": 0.04368830077714766,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0002595,
|
||
|
|
"loss": 6.8686,
|
||
|
|
"mean_token_accuracy": 0.08727961704134941,
|
||
|
|
"num_tokens": 957603.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.11884388923645,
|
||
|
|
"epoch": 0.04410838059231254,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000262,
|
||
|
|
"loss": 6.9378,
|
||
|
|
"mean_token_accuracy": 0.08589621968567371,
|
||
|
|
"num_tokens": 967731.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1688611030578615,
|
||
|
|
"epoch": 0.04452846040747742,
|
||
|
|
"grad_norm": 1.3828125,
|
||
|
|
"learning_rate": 0.00026450000000000003,
|
||
|
|
"loss": 6.9387,
|
||
|
|
"mean_token_accuracy": 0.09485394582152366,
|
||
|
|
"num_tokens": 977427.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.146421909332275,
|
||
|
|
"epoch": 0.0449485402226423,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.00026700000000000004,
|
||
|
|
"loss": 6.9243,
|
||
|
|
"mean_token_accuracy": 0.08625848963856697,
|
||
|
|
"num_tokens": 986758.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.25874433517456,
|
||
|
|
"epoch": 0.045368620037807186,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.00026950000000000005,
|
||
|
|
"loss": 6.92,
|
||
|
|
"mean_token_accuracy": 0.09832347258925438,
|
||
|
|
"num_tokens": 996377.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.057836389541626,
|
||
|
|
"epoch": 0.04578869985297206,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.00027200000000000005,
|
||
|
|
"loss": 6.9742,
|
||
|
|
"mean_token_accuracy": 0.08528567403554917,
|
||
|
|
"num_tokens": 1006483.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.995539855957031,
|
||
|
|
"epoch": 0.04620877966813695,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.0002745,
|
||
|
|
"loss": 6.8574,
|
||
|
|
"mean_token_accuracy": 0.08858747258782387,
|
||
|
|
"num_tokens": 1016132.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.106180238723755,
|
||
|
|
"epoch": 0.04662885948330183,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000277,
|
||
|
|
"loss": 6.7984,
|
||
|
|
"mean_token_accuracy": 0.09407598823308945,
|
||
|
|
"num_tokens": 1024970.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.142482328414917,
|
||
|
|
"epoch": 0.04704893929846671,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0002795,
|
||
|
|
"loss": 6.8936,
|
||
|
|
"mean_token_accuracy": 0.08978619575500488,
|
||
|
|
"num_tokens": 1034335.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.139913558959961,
|
||
|
|
"epoch": 0.04746901911363159,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00028199999999999997,
|
||
|
|
"loss": 6.9495,
|
||
|
|
"mean_token_accuracy": 0.0973325490951538,
|
||
|
|
"num_tokens": 1043954.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.08342981338501,
|
||
|
|
"epoch": 0.04788909892879647,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0002845,
|
||
|
|
"loss": 6.8806,
|
||
|
|
"mean_token_accuracy": 0.09276892617344856,
|
||
|
|
"num_tokens": 1053554.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.0591119766235355,
|
||
|
|
"epoch": 0.04830917874396135,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000287,
|
||
|
|
"loss": 6.8354,
|
||
|
|
"mean_token_accuracy": 0.09314879402518272,
|
||
|
|
"num_tokens": 1062008.0,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.029165410995484,
|
||
|
|
"epoch": 0.048729258559126236,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.0002895,
|
||
|
|
"loss": 6.9074,
|
||
|
|
"mean_token_accuracy": 0.09056607261300087,
|
||
|
|
"num_tokens": 1070740.0,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.027670526504517,
|
||
|
|
"epoch": 0.04914933837429111,
|
||
|
|
"grad_norm": 1.2890625,
|
||
|
|
"learning_rate": 0.000292,
|
||
|
|
"loss": 6.8895,
|
||
|
|
"mean_token_accuracy": 0.09351922869682312,
|
||
|
|
"num_tokens": 1079681.0,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.076567363739014,
|
||
|
|
"epoch": 0.049569418189456,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0002945,
|
||
|
|
"loss": 6.7669,
|
||
|
|
"mean_token_accuracy": 0.0963557355105877,
|
||
|
|
"num_tokens": 1088979.0,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.955168056488037,
|
||
|
|
"epoch": 0.04998949800462088,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.000297,
|
||
|
|
"loss": 6.7794,
|
||
|
|
"mean_token_accuracy": 0.09716788977384568,
|
||
|
|
"num_tokens": 1097870.0,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.0498795986175535,
|
||
|
|
"epoch": 0.05040957781978576,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.0002995,
|
||
|
|
"loss": 6.8985,
|
||
|
|
"mean_token_accuracy": 0.08934849128127098,
|
||
|
|
"num_tokens": 1107948.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.038954401016236,
|
||
|
|
"epoch": 0.05082965763495064,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000302,
|
||
|
|
"loss": 6.8034,
|
||
|
|
"mean_token_accuracy": 0.09711324200034141,
|
||
|
|
"num_tokens": 1117032.0,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.016556072235107,
|
||
|
|
"epoch": 0.051249737450115525,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0003045,
|
||
|
|
"loss": 6.7736,
|
||
|
|
"mean_token_accuracy": 0.10140406414866447,
|
||
|
|
"num_tokens": 1127834.0,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.053543567657471,
|
||
|
|
"epoch": 0.0516698172652804,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.000307,
|
||
|
|
"loss": 6.8664,
|
||
|
|
"mean_token_accuracy": 0.10583841800689697,
|
||
|
|
"num_tokens": 1137382.0,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.960672283172608,
|
||
|
|
"epoch": 0.052089897080445285,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0003095,
|
||
|
|
"loss": 6.7295,
|
||
|
|
"mean_token_accuracy": 0.09906250685453415,
|
||
|
|
"num_tokens": 1146095.0,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.916978216171264,
|
||
|
|
"epoch": 0.05250997689561017,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.000312,
|
||
|
|
"loss": 6.7648,
|
||
|
|
"mean_token_accuracy": 0.1004838652908802,
|
||
|
|
"num_tokens": 1154981.0,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.948708629608154,
|
||
|
|
"epoch": 0.052930056710775046,
|
||
|
|
"grad_norm": 1.5390625,
|
||
|
|
"learning_rate": 0.0003145,
|
||
|
|
"loss": 6.7765,
|
||
|
|
"mean_token_accuracy": 0.10312124192714692,
|
||
|
|
"num_tokens": 1164939.0,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.024917793273926,
|
||
|
|
"epoch": 0.05335013652593993,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.000317,
|
||
|
|
"loss": 6.8939,
|
||
|
|
"mean_token_accuracy": 0.09090543612837791,
|
||
|
|
"num_tokens": 1174991.0,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.0208131790161135,
|
||
|
|
"epoch": 0.05377021634110481,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0003195,
|
||
|
|
"loss": 6.9459,
|
||
|
|
"mean_token_accuracy": 0.08811391443014145,
|
||
|
|
"num_tokens": 1184885.0,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.984617424011231,
|
||
|
|
"epoch": 0.05419029615626969,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.000322,
|
||
|
|
"loss": 6.8348,
|
||
|
|
"mean_token_accuracy": 0.09274234399199485,
|
||
|
|
"num_tokens": 1193637.0,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.901879405975341,
|
||
|
|
"epoch": 0.054610375971434574,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.00032450000000000003,
|
||
|
|
"loss": 6.6237,
|
||
|
|
"mean_token_accuracy": 0.10028594210743905,
|
||
|
|
"num_tokens": 1202188.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.964693355560303,
|
||
|
|
"epoch": 0.05503045578659945,
|
||
|
|
"grad_norm": 1.25,
|
||
|
|
"learning_rate": 0.00032700000000000003,
|
||
|
|
"loss": 6.7513,
|
||
|
|
"mean_token_accuracy": 0.09297072812914849,
|
||
|
|
"num_tokens": 1210768.0,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.921257066726684,
|
||
|
|
"epoch": 0.055450535601764335,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.00032950000000000004,
|
||
|
|
"loss": 6.7581,
|
||
|
|
"mean_token_accuracy": 0.09513410851359368,
|
||
|
|
"num_tokens": 1219819.0,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.969961500167846,
|
||
|
|
"epoch": 0.05587061541692922,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00033200000000000005,
|
||
|
|
"loss": 6.8151,
|
||
|
|
"mean_token_accuracy": 0.08720013573765754,
|
||
|
|
"num_tokens": 1229703.0,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.008356428146362,
|
||
|
|
"epoch": 0.056290695232094096,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00033450000000000005,
|
||
|
|
"loss": 6.8385,
|
||
|
|
"mean_token_accuracy": 0.09394309446215629,
|
||
|
|
"num_tokens": 1238942.0,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.041683959960937,
|
||
|
|
"epoch": 0.05671077504725898,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000337,
|
||
|
|
"loss": 6.8901,
|
||
|
|
"mean_token_accuracy": 0.0907767005264759,
|
||
|
|
"num_tokens": 1248943.0,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.869440269470215,
|
||
|
|
"epoch": 0.05713085486242386,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0003395,
|
||
|
|
"loss": 6.7728,
|
||
|
|
"mean_token_accuracy": 0.09719423428177834,
|
||
|
|
"num_tokens": 1257761.0,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.80675859451294,
|
||
|
|
"epoch": 0.05755093467758874,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.000342,
|
||
|
|
"loss": 6.722,
|
||
|
|
"mean_token_accuracy": 0.09433782026171685,
|
||
|
|
"num_tokens": 1267216.0,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.962690448760986,
|
||
|
|
"epoch": 0.057971014492753624,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.00034449999999999997,
|
||
|
|
"loss": 6.8182,
|
||
|
|
"mean_token_accuracy": 0.09524153247475624,
|
||
|
|
"num_tokens": 1277210.0,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.910012054443359,
|
||
|
|
"epoch": 0.05839109430791851,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000347,
|
||
|
|
"loss": 6.7268,
|
||
|
|
"mean_token_accuracy": 0.09480128362774849,
|
||
|
|
"num_tokens": 1285310.0,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.9359142780303955,
|
||
|
|
"epoch": 0.058811174123083385,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0003495,
|
||
|
|
"loss": 6.7418,
|
||
|
|
"mean_token_accuracy": 0.09830545634031296,
|
||
|
|
"num_tokens": 1294421.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.773298215866089,
|
||
|
|
"epoch": 0.05923125393824827,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.000352,
|
||
|
|
"loss": 6.5648,
|
||
|
|
"mean_token_accuracy": 0.10509093776345253,
|
||
|
|
"num_tokens": 1303281.0,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.848818397521972,
|
||
|
|
"epoch": 0.059651333753413145,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.0003545,
|
||
|
|
"loss": 6.7413,
|
||
|
|
"mean_token_accuracy": 0.10247144997119903,
|
||
|
|
"num_tokens": 1312280.0,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.792526483535767,
|
||
|
|
"epoch": 0.06007141356857803,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000357,
|
||
|
|
"loss": 6.703,
|
||
|
|
"mean_token_accuracy": 0.09476525709033012,
|
||
|
|
"num_tokens": 1321243.0,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8667539119720455,
|
||
|
|
"epoch": 0.06049149338374291,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003595,
|
||
|
|
"loss": 6.8092,
|
||
|
|
"mean_token_accuracy": 0.10024766996502876,
|
||
|
|
"num_tokens": 1330324.0,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.874475002288818,
|
||
|
|
"epoch": 0.06091157319890779,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.000362,
|
||
|
|
"loss": 6.6476,
|
||
|
|
"mean_token_accuracy": 0.10230677276849746,
|
||
|
|
"num_tokens": 1339485.0,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.930787801742554,
|
||
|
|
"epoch": 0.06133165301407267,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.0003645,
|
||
|
|
"loss": 6.8065,
|
||
|
|
"mean_token_accuracy": 0.09302590638399125,
|
||
|
|
"num_tokens": 1348640.0,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.799437236785889,
|
||
|
|
"epoch": 0.06175173282923756,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.000367,
|
||
|
|
"loss": 6.6978,
|
||
|
|
"mean_token_accuracy": 0.09949951842427254,
|
||
|
|
"num_tokens": 1357581.0,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.888378238677978,
|
||
|
|
"epoch": 0.062171812644402434,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0003695,
|
||
|
|
"loss": 6.7652,
|
||
|
|
"mean_token_accuracy": 0.09876005351543427,
|
||
|
|
"num_tokens": 1367883.0,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.812366771697998,
|
||
|
|
"epoch": 0.06259189245956731,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000372,
|
||
|
|
"loss": 6.7175,
|
||
|
|
"mean_token_accuracy": 0.09678780436515808,
|
||
|
|
"num_tokens": 1376936.0,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.708990812301636,
|
||
|
|
"epoch": 0.0630119722747322,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0003745,
|
||
|
|
"loss": 6.6402,
|
||
|
|
"mean_token_accuracy": 0.09989499375224113,
|
||
|
|
"num_tokens": 1386359.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.86722469329834,
|
||
|
|
"epoch": 0.06343205208989708,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000377,
|
||
|
|
"loss": 6.6965,
|
||
|
|
"mean_token_accuracy": 0.10066593587398528,
|
||
|
|
"num_tokens": 1395223.0,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.944450616836548,
|
||
|
|
"epoch": 0.06385213190506196,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0003795,
|
||
|
|
"loss": 6.847,
|
||
|
|
"mean_token_accuracy": 0.09334802627563477,
|
||
|
|
"num_tokens": 1404917.0,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.823553276062012,
|
||
|
|
"epoch": 0.06427221172022685,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.000382,
|
||
|
|
"loss": 6.7474,
|
||
|
|
"mean_token_accuracy": 0.10658529698848725,
|
||
|
|
"num_tokens": 1413348.0,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7500804424285885,
|
||
|
|
"epoch": 0.06469229153539173,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0003845,
|
||
|
|
"loss": 6.7193,
|
||
|
|
"mean_token_accuracy": 0.09804128184914589,
|
||
|
|
"num_tokens": 1421726.0,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.822430419921875,
|
||
|
|
"epoch": 0.0651123713505566,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00038700000000000003,
|
||
|
|
"loss": 6.7314,
|
||
|
|
"mean_token_accuracy": 0.09830505326390267,
|
||
|
|
"num_tokens": 1430686.0,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.889693403244019,
|
||
|
|
"epoch": 0.06553245116572148,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.00038950000000000003,
|
||
|
|
"loss": 6.7193,
|
||
|
|
"mean_token_accuracy": 0.1001870684325695,
|
||
|
|
"num_tokens": 1439499.0,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.836849641799927,
|
||
|
|
"epoch": 0.06595253098088637,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.00039200000000000004,
|
||
|
|
"loss": 6.7144,
|
||
|
|
"mean_token_accuracy": 0.10016432479023933,
|
||
|
|
"num_tokens": 1448220.0,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.703166866302491,
|
||
|
|
"epoch": 0.06637261079605125,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.00039450000000000005,
|
||
|
|
"loss": 6.7252,
|
||
|
|
"mean_token_accuracy": 0.09049011170864105,
|
||
|
|
"num_tokens": 1458217.0,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.805354738235474,
|
||
|
|
"epoch": 0.06679269061121614,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00039700000000000005,
|
||
|
|
"loss": 6.6229,
|
||
|
|
"mean_token_accuracy": 0.0928824745118618,
|
||
|
|
"num_tokens": 1467422.0,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.788901376724243,
|
||
|
|
"epoch": 0.06721277042638102,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0003995,
|
||
|
|
"loss": 6.6204,
|
||
|
|
"mean_token_accuracy": 0.10320913046598434,
|
||
|
|
"num_tokens": 1476152.0,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.731419372558594,
|
||
|
|
"epoch": 0.06763285024154589,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000402,
|
||
|
|
"loss": 6.7128,
|
||
|
|
"mean_token_accuracy": 0.09539571255445481,
|
||
|
|
"num_tokens": 1485248.0,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7255181789398195,
|
||
|
|
"epoch": 0.06805293005671077,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004045,
|
||
|
|
"loss": 6.6711,
|
||
|
|
"mean_token_accuracy": 0.09965705946087837,
|
||
|
|
"num_tokens": 1494248.0,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.825131368637085,
|
||
|
|
"epoch": 0.06847300987187566,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.00040699999999999997,
|
||
|
|
"loss": 6.785,
|
||
|
|
"mean_token_accuracy": 0.09547284319996834,
|
||
|
|
"num_tokens": 1503565.0,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.932170867919922,
|
||
|
|
"epoch": 0.06889308968704054,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004095,
|
||
|
|
"loss": 6.8605,
|
||
|
|
"mean_token_accuracy": 0.09502148702740669,
|
||
|
|
"num_tokens": 1513227.0,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8283134460449215,
|
||
|
|
"epoch": 0.06931316950220542,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.000412,
|
||
|
|
"loss": 6.6616,
|
||
|
|
"mean_token_accuracy": 0.1039304107427597,
|
||
|
|
"num_tokens": 1522312.0,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6956737518310545,
|
||
|
|
"epoch": 0.0697332493173703,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004145,
|
||
|
|
"loss": 6.5989,
|
||
|
|
"mean_token_accuracy": 0.10552669763565063,
|
||
|
|
"num_tokens": 1531720.0,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.70291919708252,
|
||
|
|
"epoch": 0.07015332913253518,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.000417,
|
||
|
|
"loss": 6.7026,
|
||
|
|
"mean_token_accuracy": 0.09495449438691139,
|
||
|
|
"num_tokens": 1541238.0,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.867031812667847,
|
||
|
|
"epoch": 0.07057340894770006,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004195,
|
||
|
|
"loss": 6.7955,
|
||
|
|
"mean_token_accuracy": 0.09560235142707825,
|
||
|
|
"num_tokens": 1550875.0,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.679243516921997,
|
||
|
|
"epoch": 0.07099348876286495,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000422,
|
||
|
|
"loss": 6.7373,
|
||
|
|
"mean_token_accuracy": 0.10205229669809342,
|
||
|
|
"num_tokens": 1560287.0,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.812178373336792,
|
||
|
|
"epoch": 0.07141356857802983,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004245,
|
||
|
|
"loss": 6.6139,
|
||
|
|
"mean_token_accuracy": 0.10624400898814201,
|
||
|
|
"num_tokens": 1569043.0,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.66694450378418,
|
||
|
|
"epoch": 0.07183364839319471,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000427,
|
||
|
|
"loss": 6.6372,
|
||
|
|
"mean_token_accuracy": 0.10226837545633316,
|
||
|
|
"num_tokens": 1578112.0,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.592900228500366,
|
||
|
|
"epoch": 0.07225372820835958,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004295,
|
||
|
|
"loss": 6.5542,
|
||
|
|
"mean_token_accuracy": 0.10482543483376502,
|
||
|
|
"num_tokens": 1586587.0,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.831333017349243,
|
||
|
|
"epoch": 0.07267380802352447,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000432,
|
||
|
|
"loss": 6.7191,
|
||
|
|
"mean_token_accuracy": 0.0988001950085163,
|
||
|
|
"num_tokens": 1595585.0,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7406104564666744,
|
||
|
|
"epoch": 0.07309388783868935,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004345,
|
||
|
|
"loss": 6.6715,
|
||
|
|
"mean_token_accuracy": 0.1029144361615181,
|
||
|
|
"num_tokens": 1605355.0,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.673774909973145,
|
||
|
|
"epoch": 0.07351396765385423,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.000437,
|
||
|
|
"loss": 6.7087,
|
||
|
|
"mean_token_accuracy": 0.0972638413310051,
|
||
|
|
"num_tokens": 1613637.0,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.780192899703979,
|
||
|
|
"epoch": 0.07393404746901912,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004395,
|
||
|
|
"loss": 6.6547,
|
||
|
|
"mean_token_accuracy": 0.10374342575669289,
|
||
|
|
"num_tokens": 1622731.0,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.733386611938476,
|
||
|
|
"epoch": 0.074354127284184,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000442,
|
||
|
|
"loss": 6.6411,
|
||
|
|
"mean_token_accuracy": 0.09785914570093154,
|
||
|
|
"num_tokens": 1632098.0,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.656809377670288,
|
||
|
|
"epoch": 0.07477420709934887,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004445,
|
||
|
|
"loss": 6.6333,
|
||
|
|
"mean_token_accuracy": 0.09908856153488159,
|
||
|
|
"num_tokens": 1641259.0,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.787235689163208,
|
||
|
|
"epoch": 0.07519428691451376,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.000447,
|
||
|
|
"loss": 6.7023,
|
||
|
|
"mean_token_accuracy": 0.09753435328602791,
|
||
|
|
"num_tokens": 1651362.0,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.644986867904663,
|
||
|
|
"epoch": 0.07561436672967864,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00044950000000000003,
|
||
|
|
"loss": 6.6169,
|
||
|
|
"mean_token_accuracy": 0.09910911172628403,
|
||
|
|
"num_tokens": 1660190.0,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.722699403762817,
|
||
|
|
"epoch": 0.07603444654484352,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.00045200000000000004,
|
||
|
|
"loss": 6.659,
|
||
|
|
"mean_token_accuracy": 0.09519267976284027,
|
||
|
|
"num_tokens": 1669020.0,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.747388315200806,
|
||
|
|
"epoch": 0.0764545263600084,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00045450000000000004,
|
||
|
|
"loss": 6.6775,
|
||
|
|
"mean_token_accuracy": 0.10076266825199127,
|
||
|
|
"num_tokens": 1678158.0,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.702866649627685,
|
||
|
|
"epoch": 0.07687460617517328,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.00045700000000000005,
|
||
|
|
"loss": 6.6868,
|
||
|
|
"mean_token_accuracy": 0.09906790256500245,
|
||
|
|
"num_tokens": 1687481.0,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.647071504592896,
|
||
|
|
"epoch": 0.07729468599033816,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00045950000000000006,
|
||
|
|
"loss": 6.6511,
|
||
|
|
"mean_token_accuracy": 0.10402323752641678,
|
||
|
|
"num_tokens": 1696782.0,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6832818508148195,
|
||
|
|
"epoch": 0.07771476580550304,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000462,
|
||
|
|
"loss": 6.6575,
|
||
|
|
"mean_token_accuracy": 0.10666462555527687,
|
||
|
|
"num_tokens": 1706153.0,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.698217678070068,
|
||
|
|
"epoch": 0.07813484562066793,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004645,
|
||
|
|
"loss": 6.6895,
|
||
|
|
"mean_token_accuracy": 0.10017500966787338,
|
||
|
|
"num_tokens": 1715585.0,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.823991441726685,
|
||
|
|
"epoch": 0.07855492543583281,
|
||
|
|
"grad_norm": 1.4921875,
|
||
|
|
"learning_rate": 0.000467,
|
||
|
|
"loss": 6.8005,
|
||
|
|
"mean_token_accuracy": 0.09734346494078636,
|
||
|
|
"num_tokens": 1724857.0,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.700028705596924,
|
||
|
|
"epoch": 0.0789750052509977,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004695,
|
||
|
|
"loss": 6.6103,
|
||
|
|
"mean_token_accuracy": 0.10624456107616424,
|
||
|
|
"num_tokens": 1733528.0,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.742655563354492,
|
||
|
|
"epoch": 0.07939508506616257,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000472,
|
||
|
|
"loss": 6.7304,
|
||
|
|
"mean_token_accuracy": 0.10352228581905365,
|
||
|
|
"num_tokens": 1742953.0,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.669600582122802,
|
||
|
|
"epoch": 0.07981516488132745,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0004745,
|
||
|
|
"loss": 6.6746,
|
||
|
|
"mean_token_accuracy": 0.10271603912115097,
|
||
|
|
"num_tokens": 1752155.0,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.660818243026734,
|
||
|
|
"epoch": 0.08023524469649233,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.000477,
|
||
|
|
"loss": 6.5695,
|
||
|
|
"mean_token_accuracy": 0.10144439786672592,
|
||
|
|
"num_tokens": 1760562.0,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.623502588272094,
|
||
|
|
"epoch": 0.08065532451165722,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004795,
|
||
|
|
"loss": 6.5902,
|
||
|
|
"mean_token_accuracy": 0.1015326887369156,
|
||
|
|
"num_tokens": 1769631.0,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.647875261306763,
|
||
|
|
"epoch": 0.0810754043268221,
|
||
|
|
"grad_norm": 1.265625,
|
||
|
|
"learning_rate": 0.000482,
|
||
|
|
"loss": 6.624,
|
||
|
|
"mean_token_accuracy": 0.10202456414699554,
|
||
|
|
"num_tokens": 1779080.0,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.654635858535767,
|
||
|
|
"epoch": 0.08149548414198697,
|
||
|
|
"grad_norm": 1.375,
|
||
|
|
"learning_rate": 0.0004845,
|
||
|
|
"loss": 6.6146,
|
||
|
|
"mean_token_accuracy": 0.10121759623289109,
|
||
|
|
"num_tokens": 1787830.0,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.546731615066529,
|
||
|
|
"epoch": 0.08191556395715185,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000487,
|
||
|
|
"loss": 6.5331,
|
||
|
|
"mean_token_accuracy": 0.10186785906553268,
|
||
|
|
"num_tokens": 1796998.0,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6796527862548825,
|
||
|
|
"epoch": 0.08233564377231674,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004895,
|
||
|
|
"loss": 6.619,
|
||
|
|
"mean_token_accuracy": 0.10591355115175247,
|
||
|
|
"num_tokens": 1806194.0,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.40926570892334,
|
||
|
|
"epoch": 0.08275572358748162,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000492,
|
||
|
|
"loss": 6.514,
|
||
|
|
"mean_token_accuracy": 0.10517977550625801,
|
||
|
|
"num_tokens": 1815751.0,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.57440676689148,
|
||
|
|
"epoch": 0.0831758034026465,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004945,
|
||
|
|
"loss": 6.5942,
|
||
|
|
"mean_token_accuracy": 0.10343918055295945,
|
||
|
|
"num_tokens": 1825379.0,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.637695789337158,
|
||
|
|
"epoch": 0.08359588321781139,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.000497,
|
||
|
|
"loss": 6.5522,
|
||
|
|
"mean_token_accuracy": 0.10346684157848358,
|
||
|
|
"num_tokens": 1834158.0,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.537919807434082,
|
||
|
|
"epoch": 0.08401596303297626,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004995,
|
||
|
|
"loss": 6.5098,
|
||
|
|
"mean_token_accuracy": 0.10425886288285255,
|
||
|
|
"num_tokens": 1842724.0,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.62498288154602,
|
||
|
|
"epoch": 0.08443604284814114,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000499999998724557,
|
||
|
|
"loss": 6.5288,
|
||
|
|
"mean_token_accuracy": 0.10198150128126145,
|
||
|
|
"num_tokens": 1852485.0,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.57701358795166,
|
||
|
|
"epoch": 0.08485612266330603,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999999935430703,
|
||
|
|
"loss": 6.5545,
|
||
|
|
"mean_token_accuracy": 0.11041983366012573,
|
||
|
|
"num_tokens": 1861303.0,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.423639154434204,
|
||
|
|
"epoch": 0.08527620247847091,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0004999999843758243,
|
||
|
|
"loss": 6.5428,
|
||
|
|
"mean_token_accuracy": 0.11022127270698548,
|
||
|
|
"num_tokens": 1870859.0,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.760848808288574,
|
||
|
|
"epoch": 0.0856962822936358,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999999712228196,
|
||
|
|
"loss": 6.7105,
|
||
|
|
"mean_token_accuracy": 0.09618140533566474,
|
||
|
|
"num_tokens": 1880295.0,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.645368003845215,
|
||
|
|
"epoch": 0.08611636210880068,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999999540840562,
|
||
|
|
"loss": 6.6079,
|
||
|
|
"mean_token_accuracy": 0.1056639552116394,
|
||
|
|
"num_tokens": 1889193.0,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.568785905838013,
|
||
|
|
"epoch": 0.08653644192396555,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999999329595345,
|
||
|
|
"loss": 6.7096,
|
||
|
|
"mean_token_accuracy": 0.09398577436804771,
|
||
|
|
"num_tokens": 1899437.0,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.708119821548462,
|
||
|
|
"epoch": 0.08695652173913043,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999999078492548,
|
||
|
|
"loss": 6.5939,
|
||
|
|
"mean_token_accuracy": 0.1046712227165699,
|
||
|
|
"num_tokens": 1907882.0,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.493611288070679,
|
||
|
|
"epoch": 0.08737660155429532,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999998787532176,
|
||
|
|
"loss": 6.5021,
|
||
|
|
"mean_token_accuracy": 0.10290396809577942,
|
||
|
|
"num_tokens": 1916872.0,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.608988046646118,
|
||
|
|
"epoch": 0.0877966813694602,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999998456714234,
|
||
|
|
"loss": 6.675,
|
||
|
|
"mean_token_accuracy": 0.10352342054247857,
|
||
|
|
"num_tokens": 1926636.0,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.586896228790283,
|
||
|
|
"epoch": 0.08821676118462508,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0004999998086038729,
|
||
|
|
"loss": 6.5742,
|
||
|
|
"mean_token_accuracy": 0.10714709535241126,
|
||
|
|
"num_tokens": 1935962.0,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.579021549224853,
|
||
|
|
"epoch": 0.08863684099978995,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999997675505665,
|
||
|
|
"loss": 6.5514,
|
||
|
|
"mean_token_accuracy": 0.10487730801105499,
|
||
|
|
"num_tokens": 1944600.0,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.625632095336914,
|
||
|
|
"epoch": 0.08905692081495484,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999997225115052,
|
||
|
|
"loss": 6.7269,
|
||
|
|
"mean_token_accuracy": 0.10071012005209923,
|
||
|
|
"num_tokens": 1954234.0,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7796577453613285,
|
||
|
|
"epoch": 0.08947700063011972,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999996734866896,
|
||
|
|
"loss": 6.683,
|
||
|
|
"mean_token_accuracy": 0.09888390973210334,
|
||
|
|
"num_tokens": 1964499.0,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.377533006668091,
|
||
|
|
"epoch": 0.0898970804452846,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004999996204761206,
|
||
|
|
"loss": 6.3832,
|
||
|
|
"mean_token_accuracy": 0.11216704472899437,
|
||
|
|
"num_tokens": 1973635.0,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.54502387046814,
|
||
|
|
"epoch": 0.09031716026044949,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999995634797993,
|
||
|
|
"loss": 6.5308,
|
||
|
|
"mean_token_accuracy": 0.11021102443337441,
|
||
|
|
"num_tokens": 1983509.0,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.567485332489014,
|
||
|
|
"epoch": 0.09073724007561437,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999995024977265,
|
||
|
|
"loss": 6.5197,
|
||
|
|
"mean_token_accuracy": 0.11247633025050163,
|
||
|
|
"num_tokens": 1992336.0,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.545616102218628,
|
||
|
|
"epoch": 0.09115731989077924,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999994375299034,
|
||
|
|
"loss": 6.5532,
|
||
|
|
"mean_token_accuracy": 0.10819393768906593,
|
||
|
|
"num_tokens": 2001931.0,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.484406518936157,
|
||
|
|
"epoch": 0.09157739970594413,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000499999368576331,
|
||
|
|
"loss": 6.4218,
|
||
|
|
"mean_token_accuracy": 0.11132358983159066,
|
||
|
|
"num_tokens": 2010935.0,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.49219536781311,
|
||
|
|
"epoch": 0.09199747952110901,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999992956370109,
|
||
|
|
"loss": 6.4842,
|
||
|
|
"mean_token_accuracy": 0.10731736794114113,
|
||
|
|
"num_tokens": 2020587.0,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.410812473297119,
|
||
|
|
"epoch": 0.0924175593362739,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000499999218711944,
|
||
|
|
"loss": 6.5089,
|
||
|
|
"mean_token_accuracy": 0.11067400127649307,
|
||
|
|
"num_tokens": 2029743.0,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.581059837341309,
|
||
|
|
"epoch": 0.09283763915143878,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004999991378011317,
|
||
|
|
"loss": 6.5257,
|
||
|
|
"mean_token_accuracy": 0.10916591510176658,
|
||
|
|
"num_tokens": 2038468.0,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.456353855133057,
|
||
|
|
"epoch": 0.09325771896660366,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999990529045757,
|
||
|
|
"loss": 6.4482,
|
||
|
|
"mean_token_accuracy": 0.10893432199954986,
|
||
|
|
"num_tokens": 2047456.0,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.627411127090454,
|
||
|
|
"epoch": 0.09367779878176853,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999989640222771,
|
||
|
|
"loss": 6.7525,
|
||
|
|
"mean_token_accuracy": 0.09431043416261672,
|
||
|
|
"num_tokens": 2056691.0,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.684362411499023,
|
||
|
|
"epoch": 0.09409787859693342,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000499998871154238,
|
||
|
|
"loss": 6.5462,
|
||
|
|
"mean_token_accuracy": 0.10591837242245675,
|
||
|
|
"num_tokens": 2066068.0,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.578407287597656,
|
||
|
|
"epoch": 0.0945179584120983,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999987743004597,
|
||
|
|
"loss": 6.4733,
|
||
|
|
"mean_token_accuracy": 0.1102992869913578,
|
||
|
|
"num_tokens": 2075113.0,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.506056404113769,
|
||
|
|
"epoch": 0.09493803822726318,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999986734609438,
|
||
|
|
"loss": 6.6105,
|
||
|
|
"mean_token_accuracy": 0.10494827926158905,
|
||
|
|
"num_tokens": 2084557.0,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6157310009002686,
|
||
|
|
"epoch": 0.09535811804242807,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999985686356923,
|
||
|
|
"loss": 6.5139,
|
||
|
|
"mean_token_accuracy": 0.1062320664525032,
|
||
|
|
"num_tokens": 2093424.0,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.539625740051269,
|
||
|
|
"epoch": 0.09577819785759294,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000499998459824707,
|
||
|
|
"loss": 6.6346,
|
||
|
|
"mean_token_accuracy": 0.10304314494132996,
|
||
|
|
"num_tokens": 2103066.0,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.53157410621643,
|
||
|
|
"epoch": 0.09619827767275782,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.00049999834702799,
|
||
|
|
"loss": 6.5013,
|
||
|
|
"mean_token_accuracy": 0.10883507803082466,
|
||
|
|
"num_tokens": 2112447.0,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.507535743713379,
|
||
|
|
"epoch": 0.0966183574879227,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999982302455431,
|
||
|
|
"loss": 6.5269,
|
||
|
|
"mean_token_accuracy": 0.11191204637289047,
|
||
|
|
"num_tokens": 2121949.0,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.507864904403687,
|
||
|
|
"epoch": 0.09703843730308759,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999981094773683,
|
||
|
|
"loss": 6.4328,
|
||
|
|
"mean_token_accuracy": 0.11216317638754844,
|
||
|
|
"num_tokens": 2130464.0,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.520567464828491,
|
||
|
|
"epoch": 0.09745851711825247,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.000499997984723468,
|
||
|
|
"loss": 6.5942,
|
||
|
|
"mean_token_accuracy": 0.10294081419706344,
|
||
|
|
"num_tokens": 2139577.0,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.288797092437744,
|
||
|
|
"epoch": 0.09787859693341736,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004999978559838441,
|
||
|
|
"loss": 6.3204,
|
||
|
|
"mean_token_accuracy": 0.11208199337124825,
|
||
|
|
"num_tokens": 2147919.0,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.472030353546143,
|
||
|
|
"epoch": 0.09829867674858223,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999977232584991,
|
||
|
|
"loss": 6.4949,
|
||
|
|
"mean_token_accuracy": 0.10832359045743942,
|
||
|
|
"num_tokens": 2156936.0,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.558899450302124,
|
||
|
|
"epoch": 0.09871875656374711,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999975865474354,
|
||
|
|
"loss": 6.5512,
|
||
|
|
"mean_token_accuracy": 0.10766256302595138,
|
||
|
|
"num_tokens": 2165362.0,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.469175338745117,
|
||
|
|
"epoch": 0.099138836378912,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999974458506551,
|
||
|
|
"loss": 6.4643,
|
||
|
|
"mean_token_accuracy": 0.10836688205599784,
|
||
|
|
"num_tokens": 2173665.0,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.551422071456909,
|
||
|
|
"epoch": 0.09955891619407688,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000499997301168161,
|
||
|
|
"loss": 6.4532,
|
||
|
|
"mean_token_accuracy": 0.11138271391391755,
|
||
|
|
"num_tokens": 2182222.0,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.531885147094727,
|
||
|
|
"epoch": 0.09997899600924176,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999971524999556,
|
||
|
|
"loss": 6.5228,
|
||
|
|
"mean_token_accuracy": 0.11111016869544983,
|
||
|
|
"num_tokens": 2192358.0,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.534890985488891,
|
||
|
|
"epoch": 0.10039907582440663,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999969998460414,
|
||
|
|
"loss": 6.5355,
|
||
|
|
"mean_token_accuracy": 0.10454710125923157,
|
||
|
|
"num_tokens": 2201889.0,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.433488464355468,
|
||
|
|
"epoch": 0.10081915563957151,
|
||
|
|
"grad_norm": 1.328125,
|
||
|
|
"learning_rate": 0.0004999968432064213,
|
||
|
|
"loss": 6.5322,
|
||
|
|
"mean_token_accuracy": 0.1198379322886467,
|
||
|
|
"num_tokens": 2211810.0,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.474250078201294,
|
||
|
|
"epoch": 0.1012392354547364,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004999966825810979,
|
||
|
|
"loss": 6.4684,
|
||
|
|
"mean_token_accuracy": 0.10700508952140808,
|
||
|
|
"num_tokens": 2221123.0,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.384520959854126,
|
||
|
|
"epoch": 0.10165931526990128,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999965179700742,
|
||
|
|
"loss": 6.3986,
|
||
|
|
"mean_token_accuracy": 0.11781087368726731,
|
||
|
|
"num_tokens": 2230129.0,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4176534652709964,
|
||
|
|
"epoch": 0.10207939508506617,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000499996349373353,
|
||
|
|
"loss": 6.4609,
|
||
|
|
"mean_token_accuracy": 0.10817519575357437,
|
||
|
|
"num_tokens": 2239929.0,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5110820770263675,
|
||
|
|
"epoch": 0.10249947490023105,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999961767909374,
|
||
|
|
"loss": 6.4372,
|
||
|
|
"mean_token_accuracy": 0.1148509480059147,
|
||
|
|
"num_tokens": 2248078.0,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4125104427337645,
|
||
|
|
"epoch": 0.10291955471539592,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999960002228303,
|
||
|
|
"loss": 6.5274,
|
||
|
|
"mean_token_accuracy": 0.10999985039234161,
|
||
|
|
"num_tokens": 2256975.0,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.474673461914063,
|
||
|
|
"epoch": 0.1033396345305608,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999958196690349,
|
||
|
|
"loss": 6.3849,
|
||
|
|
"mean_token_accuracy": 0.11320202201604843,
|
||
|
|
"num_tokens": 2265797.0,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.479385900497436,
|
||
|
|
"epoch": 0.10375971434572569,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999956351295545,
|
||
|
|
"loss": 6.4946,
|
||
|
|
"mean_token_accuracy": 0.11450825035572051,
|
||
|
|
"num_tokens": 2274099.0,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3540520668029785,
|
||
|
|
"epoch": 0.10417979416089057,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999954466043922,
|
||
|
|
"loss": 6.3917,
|
||
|
|
"mean_token_accuracy": 0.11258968263864517,
|
||
|
|
"num_tokens": 2282360.0,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.481705999374389,
|
||
|
|
"epoch": 0.10459987397605545,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999952540935514,
|
||
|
|
"loss": 6.5009,
|
||
|
|
"mean_token_accuracy": 0.10285271480679511,
|
||
|
|
"num_tokens": 2292714.0,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.455303287506103,
|
||
|
|
"epoch": 0.10501995379122034,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999950575970356,
|
||
|
|
"loss": 6.426,
|
||
|
|
"mean_token_accuracy": 0.11442826837301254,
|
||
|
|
"num_tokens": 2301633.0,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.465747499465943,
|
||
|
|
"epoch": 0.10544003360638521,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999948571148482,
|
||
|
|
"loss": 6.4138,
|
||
|
|
"mean_token_accuracy": 0.11426257789134979,
|
||
|
|
"num_tokens": 2310067.0,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.466140460968018,
|
||
|
|
"epoch": 0.10586011342155009,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999946526469927,
|
||
|
|
"loss": 6.4932,
|
||
|
|
"mean_token_accuracy": 0.11244904398918151,
|
||
|
|
"num_tokens": 2320090.0,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.438083505630493,
|
||
|
|
"epoch": 0.10628019323671498,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999944441934728,
|
||
|
|
"loss": 6.4509,
|
||
|
|
"mean_token_accuracy": 0.11593573912978172,
|
||
|
|
"num_tokens": 2329255.0,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.467304992675781,
|
||
|
|
"epoch": 0.10670027305187986,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004999942317542922,
|
||
|
|
"loss": 6.5481,
|
||
|
|
"mean_token_accuracy": 0.10965899974107743,
|
||
|
|
"num_tokens": 2339535.0,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.434674501419067,
|
||
|
|
"epoch": 0.10712035286704474,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999940153294546,
|
||
|
|
"loss": 6.4448,
|
||
|
|
"mean_token_accuracy": 0.11061845496296882,
|
||
|
|
"num_tokens": 2348948.0,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.447847843170166,
|
||
|
|
"epoch": 0.10754043268220961,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000499993794918964,
|
||
|
|
"loss": 6.4628,
|
||
|
|
"mean_token_accuracy": 0.10641181394457817,
|
||
|
|
"num_tokens": 2359141.0,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.401166343688965,
|
||
|
|
"epoch": 0.1079605124973745,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004999935705228241,
|
||
|
|
"loss": 6.5084,
|
||
|
|
"mean_token_accuracy": 0.1094856470823288,
|
||
|
|
"num_tokens": 2368906.0,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.554097080230713,
|
||
|
|
"epoch": 0.10838059231253938,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999933421410389,
|
||
|
|
"loss": 6.4839,
|
||
|
|
"mean_token_accuracy": 0.11065066531300545,
|
||
|
|
"num_tokens": 2377029.0,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5027672290802006,
|
||
|
|
"epoch": 0.10880067212770426,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004999931097736125,
|
||
|
|
"loss": 6.5541,
|
||
|
|
"mean_token_accuracy": 0.10604767650365829,
|
||
|
|
"num_tokens": 2387088.0,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.470385646820068,
|
||
|
|
"epoch": 0.10922075194286915,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999928734205492,
|
||
|
|
"loss": 6.4468,
|
||
|
|
"mean_token_accuracy": 0.11056585833430291,
|
||
|
|
"num_tokens": 2395596.0,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.403819370269775,
|
||
|
|
"epoch": 0.10964083175803403,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999926330818528,
|
||
|
|
"loss": 6.4393,
|
||
|
|
"mean_token_accuracy": 0.11377019882202148,
|
||
|
|
"num_tokens": 2404506.0,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.469174242019653,
|
||
|
|
"epoch": 0.1100609115731989,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999923887575278,
|
||
|
|
"loss": 6.4777,
|
||
|
|
"mean_token_accuracy": 0.11094499379396439,
|
||
|
|
"num_tokens": 2414342.0,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.476234006881714,
|
||
|
|
"epoch": 0.11048099138836379,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999921404475785,
|
||
|
|
"loss": 6.4422,
|
||
|
|
"mean_token_accuracy": 0.11336205825209618,
|
||
|
|
"num_tokens": 2423076.0,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.415568065643311,
|
||
|
|
"epoch": 0.11090107120352867,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004999918881520093,
|
||
|
|
"loss": 6.391,
|
||
|
|
"mean_token_accuracy": 0.11621783077716827,
|
||
|
|
"num_tokens": 2432492.0,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.362053871154785,
|
||
|
|
"epoch": 0.11132115101869355,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999916318708246,
|
||
|
|
"loss": 6.354,
|
||
|
|
"mean_token_accuracy": 0.11400164812803268,
|
||
|
|
"num_tokens": 2441916.0,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.406490755081177,
|
||
|
|
"epoch": 0.11174123083385844,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004999913716040291,
|
||
|
|
"loss": 6.4072,
|
||
|
|
"mean_token_accuracy": 0.11762610748410225,
|
||
|
|
"num_tokens": 2450932.0,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.336502504348755,
|
||
|
|
"epoch": 0.11216131064902331,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004999911073516272,
|
||
|
|
"loss": 6.4319,
|
||
|
|
"mean_token_accuracy": 0.11254018545150757,
|
||
|
|
"num_tokens": 2460058.0,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.392711496353149,
|
||
|
|
"epoch": 0.11258139046418819,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999908391136237,
|
||
|
|
"loss": 6.3569,
|
||
|
|
"mean_token_accuracy": 0.11563631743192673,
|
||
|
|
"num_tokens": 2469607.0,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.441662883758545,
|
||
|
|
"epoch": 0.11300147027935308,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999905668900234,
|
||
|
|
"loss": 6.4002,
|
||
|
|
"mean_token_accuracy": 0.11395884156227112,
|
||
|
|
"num_tokens": 2478345.0,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.438292360305786,
|
||
|
|
"epoch": 0.11342155009451796,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000499990290680831,
|
||
|
|
"loss": 6.3261,
|
||
|
|
"mean_token_accuracy": 0.11877992302179337,
|
||
|
|
"num_tokens": 2486662.0,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.379430055618286,
|
||
|
|
"epoch": 0.11384162990968284,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999900104860516,
|
||
|
|
"loss": 6.472,
|
||
|
|
"mean_token_accuracy": 0.11443257331848145,
|
||
|
|
"num_tokens": 2495392.0,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.437303638458252,
|
||
|
|
"epoch": 0.11426170972484773,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999897263056898,
|
||
|
|
"loss": 6.4969,
|
||
|
|
"mean_token_accuracy": 0.10801200717687606,
|
||
|
|
"num_tokens": 2505254.0,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.457095766067505,
|
||
|
|
"epoch": 0.1146817895400126,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000499989438139751,
|
||
|
|
"loss": 6.3155,
|
||
|
|
"mean_token_accuracy": 0.11900854557752609,
|
||
|
|
"num_tokens": 2514096.0,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.339952230453491,
|
||
|
|
"epoch": 0.11510186935517748,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004999891459882401,
|
||
|
|
"loss": 6.3262,
|
||
|
|
"mean_token_accuracy": 0.1178194098174572,
|
||
|
|
"num_tokens": 2523635.0,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.318808507919312,
|
||
|
|
"epoch": 0.11552194917034236,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999888498511624,
|
||
|
|
"loss": 6.3954,
|
||
|
|
"mean_token_accuracy": 0.11501155719161034,
|
||
|
|
"num_tokens": 2532528.0,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.366592121124268,
|
||
|
|
"epoch": 0.11594202898550725,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999885497285229,
|
||
|
|
"loss": 6.307,
|
||
|
|
"mean_token_accuracy": 0.11583952903747559,
|
||
|
|
"num_tokens": 2541893.0,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.354608488082886,
|
||
|
|
"epoch": 0.11636210880067213,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999882456203273,
|
||
|
|
"loss": 6.3581,
|
||
|
|
"mean_token_accuracy": 0.11632645949721336,
|
||
|
|
"num_tokens": 2551551.0,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.349077987670898,
|
||
|
|
"epoch": 0.11678218861583702,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999879375265806,
|
||
|
|
"loss": 6.3146,
|
||
|
|
"mean_token_accuracy": 0.1158558964729309,
|
||
|
|
"num_tokens": 2560183.0,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.344199848175049,
|
||
|
|
"epoch": 0.11720226843100189,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999876254472886,
|
||
|
|
"loss": 6.1959,
|
||
|
|
"mean_token_accuracy": 0.12459081262350083,
|
||
|
|
"num_tokens": 2568697.0,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.348653078079224,
|
||
|
|
"epoch": 0.11762234824616677,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004999873093824565,
|
||
|
|
"loss": 6.4194,
|
||
|
|
"mean_token_accuracy": 0.11410524025559425,
|
||
|
|
"num_tokens": 2578151.0,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.50674262046814,
|
||
|
|
"epoch": 0.11804242806133165,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999869893320902,
|
||
|
|
"loss": 6.5289,
|
||
|
|
"mean_token_accuracy": 0.1147321492433548,
|
||
|
|
"num_tokens": 2585901.0,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.338491153717041,
|
||
|
|
"epoch": 0.11846250787649654,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999866652961952,
|
||
|
|
"loss": 6.3629,
|
||
|
|
"mean_token_accuracy": 0.11298267319798469,
|
||
|
|
"num_tokens": 2595655.0,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.389230489730835,
|
||
|
|
"epoch": 0.11888258769166142,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999863372747773,
|
||
|
|
"loss": 6.3335,
|
||
|
|
"mean_token_accuracy": 0.11225836053490638,
|
||
|
|
"num_tokens": 2604949.0,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.439256811141968,
|
||
|
|
"epoch": 0.11930266750682629,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999860052678423,
|
||
|
|
"loss": 6.3989,
|
||
|
|
"mean_token_accuracy": 0.11546840667724609,
|
||
|
|
"num_tokens": 2614260.0,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.299542999267578,
|
||
|
|
"epoch": 0.11972274732199117,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004999856692753959,
|
||
|
|
"loss": 6.3905,
|
||
|
|
"mean_token_accuracy": 0.11243033632636071,
|
||
|
|
"num_tokens": 2623740.0,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.37091474533081,
|
||
|
|
"epoch": 0.12014282713715606,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999853292974444,
|
||
|
|
"loss": 6.2964,
|
||
|
|
"mean_token_accuracy": 0.1178373210132122,
|
||
|
|
"num_tokens": 2631998.0,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.372178649902343,
|
||
|
|
"epoch": 0.12056290695232094,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004999849853339936,
|
||
|
|
"loss": 6.4358,
|
||
|
|
"mean_token_accuracy": 0.11526904925704003,
|
||
|
|
"num_tokens": 2641169.0,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.44800329208374,
|
||
|
|
"epoch": 0.12098298676748583,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004999846373850497,
|
||
|
|
"loss": 6.2945,
|
||
|
|
"mean_token_accuracy": 0.11855239495635032,
|
||
|
|
"num_tokens": 2650576.0,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.257949161529541,
|
||
|
|
"epoch": 0.12140306658265071,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999842854506186,
|
||
|
|
"loss": 6.3807,
|
||
|
|
"mean_token_accuracy": 0.11334980726242065,
|
||
|
|
"num_tokens": 2660817.0,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.38723406791687,
|
||
|
|
"epoch": 0.12182314639781558,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999839295307069,
|
||
|
|
"loss": 6.3212,
|
||
|
|
"mean_token_accuracy": 0.11455826535820961,
|
||
|
|
"num_tokens": 2669338.0,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.404263877868653,
|
||
|
|
"epoch": 0.12224322621298046,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999835696253206,
|
||
|
|
"loss": 6.3789,
|
||
|
|
"mean_token_accuracy": 0.11618088632822036,
|
||
|
|
"num_tokens": 2679108.0,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.435732698440551,
|
||
|
|
"epoch": 0.12266330602814535,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999832057344664,
|
||
|
|
"loss": 6.3325,
|
||
|
|
"mean_token_accuracy": 0.1142914392054081,
|
||
|
|
"num_tokens": 2688126.0,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.152384519577026,
|
||
|
|
"epoch": 0.12308338584331023,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999828378581504,
|
||
|
|
"loss": 6.3063,
|
||
|
|
"mean_token_accuracy": 0.12400648295879364,
|
||
|
|
"num_tokens": 2697245.0,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.425075197219849,
|
||
|
|
"epoch": 0.12350346565847511,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999824659963793,
|
||
|
|
"loss": 6.3465,
|
||
|
|
"mean_token_accuracy": 0.1198640413582325,
|
||
|
|
"num_tokens": 2705934.0,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.265953540802002,
|
||
|
|
"epoch": 0.12392354547364,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999820901491598,
|
||
|
|
"loss": 6.2796,
|
||
|
|
"mean_token_accuracy": 0.12351771965622901,
|
||
|
|
"num_tokens": 2714367.0,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.334036827087402,
|
||
|
|
"epoch": 0.12434362528880487,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999817103164983,
|
||
|
|
"loss": 6.3413,
|
||
|
|
"mean_token_accuracy": 0.11931266412138938,
|
||
|
|
"num_tokens": 2724366.0,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.360864496231079,
|
||
|
|
"epoch": 0.12476370510396975,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999813264984017,
|
||
|
|
"loss": 6.3448,
|
||
|
|
"mean_token_accuracy": 0.11467731669545174,
|
||
|
|
"num_tokens": 2733980.0,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.366592979431152,
|
||
|
|
"epoch": 0.12518378491913462,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999809386948767,
|
||
|
|
"loss": 6.3342,
|
||
|
|
"mean_token_accuracy": 0.12208072617650031,
|
||
|
|
"num_tokens": 2744013.0,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.299022817611695,
|
||
|
|
"epoch": 0.12560386473429952,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999805469059302,
|
||
|
|
"loss": 6.4186,
|
||
|
|
"mean_token_accuracy": 0.11027913689613342,
|
||
|
|
"num_tokens": 2753385.0,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.366168975830078,
|
||
|
|
"epoch": 0.1260239445494644,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999801511315693,
|
||
|
|
"loss": 6.256,
|
||
|
|
"mean_token_accuracy": 0.11804210916161537,
|
||
|
|
"num_tokens": 2762875.0,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.342552661895752,
|
||
|
|
"epoch": 0.1264440243646293,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999797513718007,
|
||
|
|
"loss": 6.3108,
|
||
|
|
"mean_token_accuracy": 0.12443676739931106,
|
||
|
|
"num_tokens": 2772182.0,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.206664896011352,
|
||
|
|
"epoch": 0.12686410417979416,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999793476266317,
|
||
|
|
"loss": 6.2711,
|
||
|
|
"mean_token_accuracy": 0.12031201645731926,
|
||
|
|
"num_tokens": 2780814.0,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.639998197555542,
|
||
|
|
"epoch": 0.12728418399495905,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999789398960695,
|
||
|
|
"loss": 6.5474,
|
||
|
|
"mean_token_accuracy": 0.1183062419295311,
|
||
|
|
"num_tokens": 2791104.0,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.19776029586792,
|
||
|
|
"epoch": 0.12770426381012392,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999785281801212,
|
||
|
|
"loss": 6.256,
|
||
|
|
"mean_token_accuracy": 0.11993122175335884,
|
||
|
|
"num_tokens": 2800081.0,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.334916496276856,
|
||
|
|
"epoch": 0.1281243436252888,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.000499978112478794,
|
||
|
|
"loss": 6.3835,
|
||
|
|
"mean_token_accuracy": 0.11843734234571457,
|
||
|
|
"num_tokens": 2809096.0,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.403998374938965,
|
||
|
|
"epoch": 0.1285444234404537,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999776927920955,
|
||
|
|
"loss": 6.3545,
|
||
|
|
"mean_token_accuracy": 0.12085104510188102,
|
||
|
|
"num_tokens": 2818857.0,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3299469470977785,
|
||
|
|
"epoch": 0.12896450325561856,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499977269120033,
|
||
|
|
"loss": 6.4167,
|
||
|
|
"mean_token_accuracy": 0.11449578031897545,
|
||
|
|
"num_tokens": 2829332.0,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3263038158416744,
|
||
|
|
"epoch": 0.12938458307078346,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.000499976841462614,
|
||
|
|
"loss": 6.3436,
|
||
|
|
"mean_token_accuracy": 0.11686776131391526,
|
||
|
|
"num_tokens": 2839193.0,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.397625589370728,
|
||
|
|
"epoch": 0.12980466288594833,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.000499976409819846,
|
||
|
|
"loss": 6.3117,
|
||
|
|
"mean_token_accuracy": 0.11800177842378616,
|
||
|
|
"num_tokens": 2848535.0,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.116656970977783,
|
||
|
|
"epoch": 0.1302247427011132,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004999759741917369,
|
||
|
|
"loss": 6.2278,
|
||
|
|
"mean_token_accuracy": 0.12729543596506118,
|
||
|
|
"num_tokens": 2858090.0,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.364631414413452,
|
||
|
|
"epoch": 0.1306448225162781,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004999755345782941,
|
||
|
|
"loss": 6.378,
|
||
|
|
"mean_token_accuracy": 0.11326263695955277,
|
||
|
|
"num_tokens": 2866984.0,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.246821451187134,
|
||
|
|
"epoch": 0.13106490233144297,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999750909795256,
|
||
|
|
"loss": 6.1885,
|
||
|
|
"mean_token_accuracy": 0.1256905347108841,
|
||
|
|
"num_tokens": 2876550.0,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.341800737380981,
|
||
|
|
"epoch": 0.13148498214660786,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999746433954394,
|
||
|
|
"loss": 6.286,
|
||
|
|
"mean_token_accuracy": 0.12146776840090752,
|
||
|
|
"num_tokens": 2885782.0,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.275845241546631,
|
||
|
|
"epoch": 0.13190506196177273,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000499974191826043,
|
||
|
|
"loss": 6.2653,
|
||
|
|
"mean_token_accuracy": 0.13301032781600952,
|
||
|
|
"num_tokens": 2894807.0,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.351547765731811,
|
||
|
|
"epoch": 0.1323251417769376,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999737362713448,
|
||
|
|
"loss": 6.304,
|
||
|
|
"mean_token_accuracy": 0.12145641520619392,
|
||
|
|
"num_tokens": 2904076.0,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.267245769500732,
|
||
|
|
"epoch": 0.1327452215921025,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999732767313527,
|
||
|
|
"loss": 6.2029,
|
||
|
|
"mean_token_accuracy": 0.12209122702479362,
|
||
|
|
"num_tokens": 2913761.0,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.383308267593383,
|
||
|
|
"epoch": 0.13316530140726737,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004999728132060746,
|
||
|
|
"loss": 6.439,
|
||
|
|
"mean_token_accuracy": 0.12098384723067283,
|
||
|
|
"num_tokens": 2922848.0,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.364631271362304,
|
||
|
|
"epoch": 0.13358538122243227,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004999723456955192,
|
||
|
|
"loss": 6.3245,
|
||
|
|
"mean_token_accuracy": 0.11949731931090354,
|
||
|
|
"num_tokens": 2932718.0,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2494594097137455,
|
||
|
|
"epoch": 0.13400546103759714,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004999718741996945,
|
||
|
|
"loss": 6.2837,
|
||
|
|
"mean_token_accuracy": 0.12003797963261605,
|
||
|
|
"num_tokens": 2942686.0,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2547472476959225,
|
||
|
|
"epoch": 0.13442554085276204,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000499971398718609,
|
||
|
|
"loss": 6.2407,
|
||
|
|
"mean_token_accuracy": 0.1179835021495819,
|
||
|
|
"num_tokens": 2952096.0,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3157384395599365,
|
||
|
|
"epoch": 0.1348456206679269,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999709192522708,
|
||
|
|
"loss": 6.3129,
|
||
|
|
"mean_token_accuracy": 0.12474863901734352,
|
||
|
|
"num_tokens": 2960660.0,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.379588079452515,
|
||
|
|
"epoch": 0.13526570048309178,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999704358006887,
|
||
|
|
"loss": 6.3158,
|
||
|
|
"mean_token_accuracy": 0.11744728311896324,
|
||
|
|
"num_tokens": 2969834.0,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.285486459732056,
|
||
|
|
"epoch": 0.13568578029825668,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999699483638712,
|
||
|
|
"loss": 6.311,
|
||
|
|
"mean_token_accuracy": 0.12142582982778549,
|
||
|
|
"num_tokens": 2979023.0,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.294291210174561,
|
||
|
|
"epoch": 0.13610586011342155,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999694569418269,
|
||
|
|
"loss": 6.3063,
|
||
|
|
"mean_token_accuracy": 0.12201808094978332,
|
||
|
|
"num_tokens": 2988083.0,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2657451152801515,
|
||
|
|
"epoch": 0.13652593992858644,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999689615345645,
|
||
|
|
"loss": 6.2388,
|
||
|
|
"mean_token_accuracy": 0.1231310561299324,
|
||
|
|
"num_tokens": 2997240.0,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.308252573013306,
|
||
|
|
"epoch": 0.1369460197437513,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999684621420928,
|
||
|
|
"loss": 6.3111,
|
||
|
|
"mean_token_accuracy": 0.1184695117175579,
|
||
|
|
"num_tokens": 3007077.0,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.319302654266357,
|
||
|
|
"epoch": 0.13736609955891618,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999679587644205,
|
||
|
|
"loss": 6.3497,
|
||
|
|
"mean_token_accuracy": 0.11671060770750045,
|
||
|
|
"num_tokens": 3015821.0,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.236631298065186,
|
||
|
|
"epoch": 0.13778617937408108,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999674514015568,
|
||
|
|
"loss": 6.2724,
|
||
|
|
"mean_token_accuracy": 0.11908711194992065,
|
||
|
|
"num_tokens": 3025858.0,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3658030986785885,
|
||
|
|
"epoch": 0.13820625918924595,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999669400535105,
|
||
|
|
"loss": 6.2416,
|
||
|
|
"mean_token_accuracy": 0.11343135982751847,
|
||
|
|
"num_tokens": 3035537.0,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.147812271118164,
|
||
|
|
"epoch": 0.13862633900441085,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999664247202907,
|
||
|
|
"loss": 6.1617,
|
||
|
|
"mean_token_accuracy": 0.11974595785140991,
|
||
|
|
"num_tokens": 3044204.0,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.327428913116455,
|
||
|
|
"epoch": 0.13904641881957572,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999659054019066,
|
||
|
|
"loss": 6.3345,
|
||
|
|
"mean_token_accuracy": 0.11974811106920243,
|
||
|
|
"num_tokens": 3053111.0,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.258665418624878,
|
||
|
|
"epoch": 0.1394664986347406,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999653820983673,
|
||
|
|
"loss": 6.2415,
|
||
|
|
"mean_token_accuracy": 0.12036412507295609,
|
||
|
|
"num_tokens": 3062456.0,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2644579887390135,
|
||
|
|
"epoch": 0.13988657844990549,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000499964854809682,
|
||
|
|
"loss": 6.2627,
|
||
|
|
"mean_token_accuracy": 0.12668107002973555,
|
||
|
|
"num_tokens": 3071132.0,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.261227464675903,
|
||
|
|
"epoch": 0.14030665826507036,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999643235358602,
|
||
|
|
"loss": 6.222,
|
||
|
|
"mean_token_accuracy": 0.125965429097414,
|
||
|
|
"num_tokens": 3080892.0,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.215318775177002,
|
||
|
|
"epoch": 0.14072673808023525,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999637882769112,
|
||
|
|
"loss": 6.1526,
|
||
|
|
"mean_token_accuracy": 0.12532262802124022,
|
||
|
|
"num_tokens": 3089874.0,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.308867406845093,
|
||
|
|
"epoch": 0.14114681789540012,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004999632490328447,
|
||
|
|
"loss": 6.3008,
|
||
|
|
"mean_token_accuracy": 0.12098695039749145,
|
||
|
|
"num_tokens": 3099535.0,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.281496620178222,
|
||
|
|
"epoch": 0.14156689771056502,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004999627058036699,
|
||
|
|
"loss": 6.2552,
|
||
|
|
"mean_token_accuracy": 0.12044425159692765,
|
||
|
|
"num_tokens": 3108772.0,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.311051607131958,
|
||
|
|
"epoch": 0.1419869775257299,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999621585893966,
|
||
|
|
"loss": 6.2799,
|
||
|
|
"mean_token_accuracy": 0.11901640743017197,
|
||
|
|
"num_tokens": 3118333.0,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.305313062667847,
|
||
|
|
"epoch": 0.14240705734089476,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999616073900346,
|
||
|
|
"loss": 6.3091,
|
||
|
|
"mean_token_accuracy": 0.12129790410399437,
|
||
|
|
"num_tokens": 3127356.0,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2683678150177,
|
||
|
|
"epoch": 0.14282713715605966,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999610522055935,
|
||
|
|
"loss": 6.2794,
|
||
|
|
"mean_token_accuracy": 0.11691329404711723,
|
||
|
|
"num_tokens": 3136859.0,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.303126668930053,
|
||
|
|
"epoch": 0.14324721697122453,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999604930360832,
|
||
|
|
"loss": 6.304,
|
||
|
|
"mean_token_accuracy": 0.11767303720116615,
|
||
|
|
"num_tokens": 3146607.0,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.214645338058472,
|
||
|
|
"epoch": 0.14366729678638943,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999599298815136,
|
||
|
|
"loss": 6.2515,
|
||
|
|
"mean_token_accuracy": 0.12662419229745864,
|
||
|
|
"num_tokens": 3156327.0,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.21446213722229,
|
||
|
|
"epoch": 0.1440873766015543,
|
||
|
|
"grad_norm": 1.5859375,
|
||
|
|
"learning_rate": 0.0004999593627418947,
|
||
|
|
"loss": 6.2009,
|
||
|
|
"mean_token_accuracy": 0.1281860999763012,
|
||
|
|
"num_tokens": 3165559.0,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.299745416641235,
|
||
|
|
"epoch": 0.14450745641671917,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999587916172365,
|
||
|
|
"loss": 6.2848,
|
||
|
|
"mean_token_accuracy": 0.11663243547081947,
|
||
|
|
"num_tokens": 3173850.0,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.324022483825684,
|
||
|
|
"epoch": 0.14492753623188406,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999582165075492,
|
||
|
|
"loss": 6.2353,
|
||
|
|
"mean_token_accuracy": 0.11788406521081925,
|
||
|
|
"num_tokens": 3182838.0,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.144151782989502,
|
||
|
|
"epoch": 0.14534761604704893,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999576374128429,
|
||
|
|
"loss": 6.2299,
|
||
|
|
"mean_token_accuracy": 0.1223968394100666,
|
||
|
|
"num_tokens": 3191692.0,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.343899536132812,
|
||
|
|
"epoch": 0.14576769586221383,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999570543331279,
|
||
|
|
"loss": 6.2507,
|
||
|
|
"mean_token_accuracy": 0.12281694263219833,
|
||
|
|
"num_tokens": 3200069.0,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2878196239471436,
|
||
|
|
"epoch": 0.1461877756773787,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004999564672684145,
|
||
|
|
"loss": 6.3406,
|
||
|
|
"mean_token_accuracy": 0.11862553879618645,
|
||
|
|
"num_tokens": 3209653.0,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.361492061614991,
|
||
|
|
"epoch": 0.14660785549254357,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999558762187131,
|
||
|
|
"loss": 6.2041,
|
||
|
|
"mean_token_accuracy": 0.12774061411619186,
|
||
|
|
"num_tokens": 3218313.0,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.146276044845581,
|
||
|
|
"epoch": 0.14702793530770847,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999552811840342,
|
||
|
|
"loss": 6.1521,
|
||
|
|
"mean_token_accuracy": 0.1273271396756172,
|
||
|
|
"num_tokens": 3227525.0,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.241751718521118,
|
||
|
|
"epoch": 0.14744801512287334,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999546821643884,
|
||
|
|
"loss": 6.2657,
|
||
|
|
"mean_token_accuracy": 0.121260417252779,
|
||
|
|
"num_tokens": 3237022.0,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.169715499877929,
|
||
|
|
"epoch": 0.14786809493803824,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999540791597861,
|
||
|
|
"loss": 6.156,
|
||
|
|
"mean_token_accuracy": 0.12248859778046609,
|
||
|
|
"num_tokens": 3246605.0,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1003180027008055,
|
||
|
|
"epoch": 0.1482881747532031,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999534721702383,
|
||
|
|
"loss": 6.1054,
|
||
|
|
"mean_token_accuracy": 0.12855856791138648,
|
||
|
|
"num_tokens": 3255587.0,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.226248407363892,
|
||
|
|
"epoch": 0.148708254568368,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999528611957553,
|
||
|
|
"loss": 6.2171,
|
||
|
|
"mean_token_accuracy": 0.12187446802854537,
|
||
|
|
"num_tokens": 3265669.0,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.278449535369873,
|
||
|
|
"epoch": 0.14912833438353287,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004999522462363485,
|
||
|
|
"loss": 6.1919,
|
||
|
|
"mean_token_accuracy": 0.1278035633265972,
|
||
|
|
"num_tokens": 3275013.0,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.265809679031372,
|
||
|
|
"epoch": 0.14954841419869774,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999516272920283,
|
||
|
|
"loss": 6.311,
|
||
|
|
"mean_token_accuracy": 0.1240921102464199,
|
||
|
|
"num_tokens": 3284723.0,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.131893539428711,
|
||
|
|
"epoch": 0.14996849401386264,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499951004362806,
|
||
|
|
"loss": 6.1325,
|
||
|
|
"mean_token_accuracy": 0.12936908155679702,
|
||
|
|
"num_tokens": 3293860.0,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.151740789413452,
|
||
|
|
"epoch": 0.1503885738290275,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999503774486924,
|
||
|
|
"loss": 6.1833,
|
||
|
|
"mean_token_accuracy": 0.12577988132834433,
|
||
|
|
"num_tokens": 3303158.0,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.184361696243286,
|
||
|
|
"epoch": 0.1508086536441924,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999497465496987,
|
||
|
|
"loss": 6.1137,
|
||
|
|
"mean_token_accuracy": 0.11985947787761689,
|
||
|
|
"num_tokens": 3313068.0,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.191692352294922,
|
||
|
|
"epoch": 0.15122873345935728,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000499949111665836,
|
||
|
|
"loss": 6.2033,
|
||
|
|
"mean_token_accuracy": 0.12312208265066146,
|
||
|
|
"num_tokens": 3321885.0,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.25971827507019,
|
||
|
|
"epoch": 0.15164881327452215,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999484727971158,
|
||
|
|
"loss": 6.1858,
|
||
|
|
"mean_token_accuracy": 0.12474783286452293,
|
||
|
|
"num_tokens": 3330924.0,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.176667261123657,
|
||
|
|
"epoch": 0.15206889308968705,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499947829943549,
|
||
|
|
"loss": 6.2248,
|
||
|
|
"mean_token_accuracy": 0.12161886692047119,
|
||
|
|
"num_tokens": 3340070.0,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.295008039474487,
|
||
|
|
"epoch": 0.15248897290485192,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999471831051474,
|
||
|
|
"loss": 6.213,
|
||
|
|
"mean_token_accuracy": 0.13358828723430632,
|
||
|
|
"num_tokens": 3349870.0,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.278341436386109,
|
||
|
|
"epoch": 0.1529090527200168,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999465322819222,
|
||
|
|
"loss": 6.2576,
|
||
|
|
"mean_token_accuracy": 0.11560158357024193,
|
||
|
|
"num_tokens": 3359573.0,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.279096603393555,
|
||
|
|
"epoch": 0.15332913253518168,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999458774738851,
|
||
|
|
"loss": 6.1999,
|
||
|
|
"mean_token_accuracy": 0.13126230910420417,
|
||
|
|
"num_tokens": 3368577.0,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1456389904022215,
|
||
|
|
"epoch": 0.15374921235034655,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999452186810476,
|
||
|
|
"loss": 6.1662,
|
||
|
|
"mean_token_accuracy": 0.12922282814979552,
|
||
|
|
"num_tokens": 3377801.0,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.282723903656006,
|
||
|
|
"epoch": 0.15416929216551145,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999445559034214,
|
||
|
|
"loss": 6.2248,
|
||
|
|
"mean_token_accuracy": 0.12709890604019164,
|
||
|
|
"num_tokens": 3386666.0,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3540504455566404,
|
||
|
|
"epoch": 0.15458937198067632,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999438891410181,
|
||
|
|
"loss": 6.3599,
|
||
|
|
"mean_token_accuracy": 0.12122973501682281,
|
||
|
|
"num_tokens": 3396086.0,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2125379085540775,
|
||
|
|
"epoch": 0.15500945179584122,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999432183938496,
|
||
|
|
"loss": 6.2646,
|
||
|
|
"mean_token_accuracy": 0.1275039754807949,
|
||
|
|
"num_tokens": 3404894.0,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.214909315109253,
|
||
|
|
"epoch": 0.1554295316110061,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999425436619279,
|
||
|
|
"loss": 6.2499,
|
||
|
|
"mean_token_accuracy": 0.12167986705899239,
|
||
|
|
"num_tokens": 3414172.0,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.310878896713257,
|
||
|
|
"epoch": 0.15584961142617096,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.000499941864945265,
|
||
|
|
"loss": 6.2176,
|
||
|
|
"mean_token_accuracy": 0.11906537339091301,
|
||
|
|
"num_tokens": 3423409.0,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134654092788696,
|
||
|
|
"epoch": 0.15626969124133586,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999411822438726,
|
||
|
|
"loss": 6.1799,
|
||
|
|
"mean_token_accuracy": 0.12394418343901634,
|
||
|
|
"num_tokens": 3433047.0,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2948554992675785,
|
||
|
|
"epoch": 0.15668977105650073,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000499940495557763,
|
||
|
|
"loss": 6.173,
|
||
|
|
"mean_token_accuracy": 0.12352384477853776,
|
||
|
|
"num_tokens": 3442490.0,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.233772277832031,
|
||
|
|
"epoch": 0.15710985087166562,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999398048869485,
|
||
|
|
"loss": 6.2356,
|
||
|
|
"mean_token_accuracy": 0.1239772841334343,
|
||
|
|
"num_tokens": 3451804.0,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.296554517745972,
|
||
|
|
"epoch": 0.1575299306868305,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000499939110231441,
|
||
|
|
"loss": 6.2223,
|
||
|
|
"mean_token_accuracy": 0.12610766440629959,
|
||
|
|
"num_tokens": 3461481.0,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.218039226531983,
|
||
|
|
"epoch": 0.1579500105019954,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999384115912531,
|
||
|
|
"loss": 6.2673,
|
||
|
|
"mean_token_accuracy": 0.1208581991493702,
|
||
|
|
"num_tokens": 3471798.0,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.088755655288696,
|
||
|
|
"epoch": 0.15837009031716026,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000499937708966397,
|
||
|
|
"loss": 6.1755,
|
||
|
|
"mean_token_accuracy": 0.12277546525001526,
|
||
|
|
"num_tokens": 3481386.0,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.257310009002685,
|
||
|
|
"epoch": 0.15879017013232513,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999370023568853,
|
||
|
|
"loss": 6.1643,
|
||
|
|
"mean_token_accuracy": 0.12328559309244155,
|
||
|
|
"num_tokens": 3489981.0,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.140112638473511,
|
||
|
|
"epoch": 0.15921024994749003,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999362917627304,
|
||
|
|
"loss": 6.1438,
|
||
|
|
"mean_token_accuracy": 0.12805134281516076,
|
||
|
|
"num_tokens": 3498551.0,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.224145746231079,
|
||
|
|
"epoch": 0.1596303297626549,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999355771839448,
|
||
|
|
"loss": 6.1267,
|
||
|
|
"mean_token_accuracy": 0.1276252895593643,
|
||
|
|
"num_tokens": 3507921.0,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.316604804992676,
|
||
|
|
"epoch": 0.1600504095778198,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999348586205414,
|
||
|
|
"loss": 6.2984,
|
||
|
|
"mean_token_accuracy": 0.12361158952116966,
|
||
|
|
"num_tokens": 3517570.0,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.265382909774781,
|
||
|
|
"epoch": 0.16047048939298467,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999341360725327,
|
||
|
|
"loss": 6.2786,
|
||
|
|
"mean_token_accuracy": 0.11925147697329522,
|
||
|
|
"num_tokens": 3526774.0,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.244428873062134,
|
||
|
|
"epoch": 0.16089056920814954,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999334095399317,
|
||
|
|
"loss": 6.2167,
|
||
|
|
"mean_token_accuracy": 0.1289656363427639,
|
||
|
|
"num_tokens": 3535319.0,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.091944026947021,
|
||
|
|
"epoch": 0.16131064902331443,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999326790227512,
|
||
|
|
"loss": 6.1819,
|
||
|
|
"mean_token_accuracy": 0.12599623277783395,
|
||
|
|
"num_tokens": 3544468.0,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.069698667526245,
|
||
|
|
"epoch": 0.1617307288384793,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004999319445210041,
|
||
|
|
"loss": 6.0574,
|
||
|
|
"mean_token_accuracy": 0.13135963827371597,
|
||
|
|
"num_tokens": 3553529.0,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.176232147216797,
|
||
|
|
"epoch": 0.1621508086536442,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999312060347034,
|
||
|
|
"loss": 6.1206,
|
||
|
|
"mean_token_accuracy": 0.12521466836333275,
|
||
|
|
"num_tokens": 3563053.0,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.155474901199341,
|
||
|
|
"epoch": 0.16257088846880907,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004999304635638621,
|
||
|
|
"loss": 6.0713,
|
||
|
|
"mean_token_accuracy": 0.13156753256917,
|
||
|
|
"num_tokens": 3571877.0,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.117454576492309,
|
||
|
|
"epoch": 0.16299096828397394,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004999297171084935,
|
||
|
|
"loss": 6.1211,
|
||
|
|
"mean_token_accuracy": 0.12843042388558387,
|
||
|
|
"num_tokens": 3581496.0,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.246276712417602,
|
||
|
|
"epoch": 0.16341104809913884,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999289666686109,
|
||
|
|
"loss": 6.1408,
|
||
|
|
"mean_token_accuracy": 0.12944318503141403,
|
||
|
|
"num_tokens": 3590752.0,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.026504850387573,
|
||
|
|
"epoch": 0.1638311279143037,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999282122442274,
|
||
|
|
"loss": 6.1427,
|
||
|
|
"mean_token_accuracy": 0.12940528690814973,
|
||
|
|
"num_tokens": 3599885.0,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.306515789031982,
|
||
|
|
"epoch": 0.1642512077294686,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999274538353564,
|
||
|
|
"loss": 6.2127,
|
||
|
|
"mean_token_accuracy": 0.12124313414096832,
|
||
|
|
"num_tokens": 3610039.0,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1400439739227295,
|
||
|
|
"epoch": 0.16467128754463348,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999266914420114,
|
||
|
|
"loss": 6.1432,
|
||
|
|
"mean_token_accuracy": 0.12274663522839546,
|
||
|
|
"num_tokens": 3619954.0,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1886210441589355,
|
||
|
|
"epoch": 0.16509136735979837,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499925925064206,
|
||
|
|
"loss": 6.0913,
|
||
|
|
"mean_token_accuracy": 0.13008279874920844,
|
||
|
|
"num_tokens": 3628164.0,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.256851673126221,
|
||
|
|
"epoch": 0.16551144717496324,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999251547019535,
|
||
|
|
"loss": 6.2411,
|
||
|
|
"mean_token_accuracy": 0.1288958877325058,
|
||
|
|
"num_tokens": 3636778.0,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.259689378738403,
|
||
|
|
"epoch": 0.16593152699012811,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999243803552678,
|
||
|
|
"loss": 6.2104,
|
||
|
|
"mean_token_accuracy": 0.1265132576227188,
|
||
|
|
"num_tokens": 3647046.0,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134534025192261,
|
||
|
|
"epoch": 0.166351606805293,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999236020241625,
|
||
|
|
"loss": 6.1237,
|
||
|
|
"mean_token_accuracy": 0.1289564423263073,
|
||
|
|
"num_tokens": 3656130.0,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.189244413375855,
|
||
|
|
"epoch": 0.16677168662045788,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999228197086514,
|
||
|
|
"loss": 6.2018,
|
||
|
|
"mean_token_accuracy": 0.11904976442456246,
|
||
|
|
"num_tokens": 3666145.0,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2379295349121096,
|
||
|
|
"epoch": 0.16719176643562278,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004999220334087484,
|
||
|
|
"loss": 6.2356,
|
||
|
|
"mean_token_accuracy": 0.12509587332606315,
|
||
|
|
"num_tokens": 3676722.0,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.233392667770386,
|
||
|
|
"epoch": 0.16761184625078765,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999212431244673,
|
||
|
|
"loss": 6.2382,
|
||
|
|
"mean_token_accuracy": 0.1240171104669571,
|
||
|
|
"num_tokens": 3685880.0,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1124889850616455,
|
||
|
|
"epoch": 0.16803192606595252,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999204488558222,
|
||
|
|
"loss": 6.0582,
|
||
|
|
"mean_token_accuracy": 0.13227254450321196,
|
||
|
|
"num_tokens": 3695167.0,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.222057247161866,
|
||
|
|
"epoch": 0.16845200588111742,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999196506028273,
|
||
|
|
"loss": 6.1797,
|
||
|
|
"mean_token_accuracy": 0.12606113404035568,
|
||
|
|
"num_tokens": 3703700.0,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.204267930984497,
|
||
|
|
"epoch": 0.1688720856962823,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999188483654965,
|
||
|
|
"loss": 6.1263,
|
||
|
|
"mean_token_accuracy": 0.12780678346753122,
|
||
|
|
"num_tokens": 3712825.0,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.068148231506347,
|
||
|
|
"epoch": 0.16929216551144718,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999180421438442,
|
||
|
|
"loss": 6.0953,
|
||
|
|
"mean_token_accuracy": 0.12944422513246537,
|
||
|
|
"num_tokens": 3721807.0,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.252347660064697,
|
||
|
|
"epoch": 0.16971224532661205,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999172319378846,
|
||
|
|
"loss": 6.2617,
|
||
|
|
"mean_token_accuracy": 0.12066083624958993,
|
||
|
|
"num_tokens": 3730502.0,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.223606538772583,
|
||
|
|
"epoch": 0.17013232514177692,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999164177476319,
|
||
|
|
"loss": 6.1457,
|
||
|
|
"mean_token_accuracy": 0.13003366217017173,
|
||
|
|
"num_tokens": 3739696.0,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0265522480010985,
|
||
|
|
"epoch": 0.17055240495694182,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999155995731009,
|
||
|
|
"loss": 6.1404,
|
||
|
|
"mean_token_accuracy": 0.1299336552619934,
|
||
|
|
"num_tokens": 3748675.0,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.380355882644653,
|
||
|
|
"epoch": 0.1709724847721067,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999147774143057,
|
||
|
|
"loss": 6.2221,
|
||
|
|
"mean_token_accuracy": 0.12048738449811935,
|
||
|
|
"num_tokens": 3757714.0,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.067580938339233,
|
||
|
|
"epoch": 0.1713925645872716,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000499913951271261,
|
||
|
|
"loss": 6.0375,
|
||
|
|
"mean_token_accuracy": 0.13202561810612679,
|
||
|
|
"num_tokens": 3767589.0,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.142302322387695,
|
||
|
|
"epoch": 0.17181264440243646,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.0004999131211439816,
|
||
|
|
"loss": 6.1596,
|
||
|
|
"mean_token_accuracy": 0.12828587144613265,
|
||
|
|
"num_tokens": 3777261.0,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.232779121398925,
|
||
|
|
"epoch": 0.17223272421760136,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000499912287032482,
|
||
|
|
"loss": 6.1001,
|
||
|
|
"mean_token_accuracy": 0.1372594192624092,
|
||
|
|
"num_tokens": 3786658.0,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.025224256515503,
|
||
|
|
"epoch": 0.17265280403276623,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000499911448936777,
|
||
|
|
"loss": 6.1026,
|
||
|
|
"mean_token_accuracy": 0.13396917879581452,
|
||
|
|
"num_tokens": 3794977.0,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.084959363937378,
|
||
|
|
"epoch": 0.1730728838479311,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999106068568816,
|
||
|
|
"loss": 6.1787,
|
||
|
|
"mean_token_accuracy": 0.12529570311307908,
|
||
|
|
"num_tokens": 3805138.0,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.263661098480225,
|
||
|
|
"epoch": 0.173492963663096,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999097607928106,
|
||
|
|
"loss": 6.1258,
|
||
|
|
"mean_token_accuracy": 0.13813115134835244,
|
||
|
|
"num_tokens": 3814444.0,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.166193580627441,
|
||
|
|
"epoch": 0.17391304347826086,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999089107445788,
|
||
|
|
"loss": 6.0785,
|
||
|
|
"mean_token_accuracy": 0.12874337583780288,
|
||
|
|
"num_tokens": 3822859.0,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0040192127227785,
|
||
|
|
"epoch": 0.17433312329342576,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004999080567122016,
|
||
|
|
"loss": 6.102,
|
||
|
|
"mean_token_accuracy": 0.1266925446689129,
|
||
|
|
"num_tokens": 3833159.0,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.185031747817993,
|
||
|
|
"epoch": 0.17475320310859063,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999071986956941,
|
||
|
|
"loss": 6.1269,
|
||
|
|
"mean_token_accuracy": 0.1295515276491642,
|
||
|
|
"num_tokens": 3842136.0,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.116478013992309,
|
||
|
|
"epoch": 0.1751732829237555,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999063366950713,
|
||
|
|
"loss": 6.1939,
|
||
|
|
"mean_token_accuracy": 0.1253967322409153,
|
||
|
|
"num_tokens": 3851406.0,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1408590316772464,
|
||
|
|
"epoch": 0.1755933627389204,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999054707103486,
|
||
|
|
"loss": 6.1026,
|
||
|
|
"mean_token_accuracy": 0.1274511694908142,
|
||
|
|
"num_tokens": 3861061.0,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.164148044586182,
|
||
|
|
"epoch": 0.17601344255408527,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999046007415412,
|
||
|
|
"loss": 6.067,
|
||
|
|
"mean_token_accuracy": 0.12591860070824623,
|
||
|
|
"num_tokens": 3870357.0,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.192416858673096,
|
||
|
|
"epoch": 0.17643352236925017,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999037267886646,
|
||
|
|
"loss": 6.0964,
|
||
|
|
"mean_token_accuracy": 0.1299741767346859,
|
||
|
|
"num_tokens": 3879393.0,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0785363674163815,
|
||
|
|
"epoch": 0.17685360218441504,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999028488517343,
|
||
|
|
"loss": 6.1037,
|
||
|
|
"mean_token_accuracy": 0.12889744639396666,
|
||
|
|
"num_tokens": 3888030.0,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.11736216545105,
|
||
|
|
"epoch": 0.1772736819995799,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999019669307659,
|
||
|
|
"loss": 6.1275,
|
||
|
|
"mean_token_accuracy": 0.13039418011903764,
|
||
|
|
"num_tokens": 3897430.0,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1809111595153805,
|
||
|
|
"epoch": 0.1776937618147448,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999010810257749,
|
||
|
|
"loss": 6.1428,
|
||
|
|
"mean_token_accuracy": 0.1269817218184471,
|
||
|
|
"num_tokens": 3907711.0,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.062447786331177,
|
||
|
|
"epoch": 0.17811384162990967,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999001911367771,
|
||
|
|
"loss": 6.0668,
|
||
|
|
"mean_token_accuracy": 0.1323694571852684,
|
||
|
|
"num_tokens": 3915816.0,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1604491710662845,
|
||
|
|
"epoch": 0.17853392144507457,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004998992972637883,
|
||
|
|
"loss": 6.1943,
|
||
|
|
"mean_token_accuracy": 0.1183660313487053,
|
||
|
|
"num_tokens": 3925162.0,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.203741979598999,
|
||
|
|
"epoch": 0.17895400126023944,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998983994068242,
|
||
|
|
"loss": 6.0864,
|
||
|
|
"mean_token_accuracy": 0.1282353989779949,
|
||
|
|
"num_tokens": 3934476.0,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.044822025299072,
|
||
|
|
"epoch": 0.17937408107540434,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004998974975659006,
|
||
|
|
"loss": 6.124,
|
||
|
|
"mean_token_accuracy": 0.12441963106393814,
|
||
|
|
"num_tokens": 3943501.0,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.184865283966064,
|
||
|
|
"epoch": 0.1797941608905692,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998965917410338,
|
||
|
|
"loss": 6.1111,
|
||
|
|
"mean_token_accuracy": 0.12969196289777757,
|
||
|
|
"num_tokens": 3953663.0,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.129238748550415,
|
||
|
|
"epoch": 0.18021424070573408,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998956819322397,
|
||
|
|
"loss": 6.0839,
|
||
|
|
"mean_token_accuracy": 0.13072072938084603,
|
||
|
|
"num_tokens": 3962634.0,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.135206937789917,
|
||
|
|
"epoch": 0.18063432052089898,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004998947681395343,
|
||
|
|
"loss": 6.0859,
|
||
|
|
"mean_token_accuracy": 0.1366378679871559,
|
||
|
|
"num_tokens": 3972496.0,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.271072053909302,
|
||
|
|
"epoch": 0.18105440033606385,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000499893850362934,
|
||
|
|
"loss": 6.3296,
|
||
|
|
"mean_token_accuracy": 0.12187584564089775,
|
||
|
|
"num_tokens": 3980724.0,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.224115467071533,
|
||
|
|
"epoch": 0.18147448015122875,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004998929286024548,
|
||
|
|
"loss": 6.1594,
|
||
|
|
"mean_token_accuracy": 0.12844373360276223,
|
||
|
|
"num_tokens": 3989842.0,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.123717546463013,
|
||
|
|
"epoch": 0.18189455996639362,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004998920028581133,
|
||
|
|
"loss": 6.0814,
|
||
|
|
"mean_token_accuracy": 0.13656101748347282,
|
||
|
|
"num_tokens": 3998534.0,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.150679874420166,
|
||
|
|
"epoch": 0.18231463978155849,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004998910731299258,
|
||
|
|
"loss": 6.1088,
|
||
|
|
"mean_token_accuracy": 0.12456604689359665,
|
||
|
|
"num_tokens": 4007677.0,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.126907587051392,
|
||
|
|
"epoch": 0.18273471959672338,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998901394179085,
|
||
|
|
"loss": 6.1638,
|
||
|
|
"mean_token_accuracy": 0.12525054216384887,
|
||
|
|
"num_tokens": 4016347.0,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.135372829437256,
|
||
|
|
"epoch": 0.18315479941188825,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998892017220784,
|
||
|
|
"loss": 6.0213,
|
||
|
|
"mean_token_accuracy": 0.13323480933904647,
|
||
|
|
"num_tokens": 4025199.0,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.137722158432007,
|
||
|
|
"epoch": 0.18357487922705315,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004998882600424519,
|
||
|
|
"loss": 6.0876,
|
||
|
|
"mean_token_accuracy": 0.12551357075572014,
|
||
|
|
"num_tokens": 4033933.0,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.108227968215942,
|
||
|
|
"epoch": 0.18399495904221802,
|
||
|
|
"grad_norm": 1.2109375,
|
||
|
|
"learning_rate": 0.0004998873143790455,
|
||
|
|
"loss": 6.0183,
|
||
|
|
"mean_token_accuracy": 0.1379354938864708,
|
||
|
|
"num_tokens": 4042891.0,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1591612815856935,
|
||
|
|
"epoch": 0.1844150388573829,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004998863647318763,
|
||
|
|
"loss": 6.1366,
|
||
|
|
"mean_token_accuracy": 0.1241612270474434,
|
||
|
|
"num_tokens": 4051123.0,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.089571523666382,
|
||
|
|
"epoch": 0.1848351186725478,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004998854111009608,
|
||
|
|
"loss": 6.113,
|
||
|
|
"mean_token_accuracy": 0.12376126572489739,
|
||
|
|
"num_tokens": 4060025.0,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.11730580329895,
|
||
|
|
"epoch": 0.18525519848771266,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998844534863161,
|
||
|
|
"loss": 6.0217,
|
||
|
|
"mean_token_accuracy": 0.12926619052886962,
|
||
|
|
"num_tokens": 4069363.0,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.176160907745361,
|
||
|
|
"epoch": 0.18567527830287756,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004998834918879592,
|
||
|
|
"loss": 6.1692,
|
||
|
|
"mean_token_accuracy": 0.12947654128074645,
|
||
|
|
"num_tokens": 4078855.0,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.131696176528931,
|
||
|
|
"epoch": 0.18609535811804243,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000499882526305907,
|
||
|
|
"loss": 6.1424,
|
||
|
|
"mean_token_accuracy": 0.12837494984269143,
|
||
|
|
"num_tokens": 4087801.0,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.191353893280029,
|
||
|
|
"epoch": 0.18651543793320732,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998815567401765,
|
||
|
|
"loss": 6.1351,
|
||
|
|
"mean_token_accuracy": 0.12790770679712296,
|
||
|
|
"num_tokens": 4096949.0,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.171415328979492,
|
||
|
|
"epoch": 0.1869355177483722,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004998805831907851,
|
||
|
|
"loss": 6.084,
|
||
|
|
"mean_token_accuracy": 0.1275387942790985,
|
||
|
|
"num_tokens": 4105399.0,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.12052903175354,
|
||
|
|
"epoch": 0.18735559756353706,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004998796056577501,
|
||
|
|
"loss": 6.0391,
|
||
|
|
"mean_token_accuracy": 0.1234730213880539,
|
||
|
|
"num_tokens": 4113873.0,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.033805179595947,
|
||
|
|
"epoch": 0.18777567737870196,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998786241410886,
|
||
|
|
"loss": 6.1003,
|
||
|
|
"mean_token_accuracy": 0.12796764224767684,
|
||
|
|
"num_tokens": 4123528.0,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.244566345214844,
|
||
|
|
"epoch": 0.18819575719386683,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000499877638640818,
|
||
|
|
"loss": 6.1131,
|
||
|
|
"mean_token_accuracy": 0.12414761930704117,
|
||
|
|
"num_tokens": 4133370.0,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0351306915283205,
|
||
|
|
"epoch": 0.18861583700903173,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499876649156956,
|
||
|
|
"loss": 6.0237,
|
||
|
|
"mean_token_accuracy": 0.13068948239088057,
|
||
|
|
"num_tokens": 4142370.0,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.075446557998657,
|
||
|
|
"epoch": 0.1890359168241966,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998756556895196,
|
||
|
|
"loss": 6.1176,
|
||
|
|
"mean_token_accuracy": 0.12780525609850885,
|
||
|
|
"num_tokens": 4152367.0,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.182886552810669,
|
||
|
|
"epoch": 0.18945599663936147,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000499874658238527,
|
||
|
|
"loss": 6.0979,
|
||
|
|
"mean_token_accuracy": 0.1277949795126915,
|
||
|
|
"num_tokens": 4161126.0,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.106898975372315,
|
||
|
|
"epoch": 0.18987607645452637,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998736568039957,
|
||
|
|
"loss": 6.0094,
|
||
|
|
"mean_token_accuracy": 0.13100193440914154,
|
||
|
|
"num_tokens": 4169910.0,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.133787775039673,
|
||
|
|
"epoch": 0.19029615626969124,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004998726513859432,
|
||
|
|
"loss": 6.1599,
|
||
|
|
"mean_token_accuracy": 0.12446666359901429,
|
||
|
|
"num_tokens": 4179893.0,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.202354001998901,
|
||
|
|
"epoch": 0.19071623608485613,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004998716419843875,
|
||
|
|
"loss": 6.1617,
|
||
|
|
"mean_token_accuracy": 0.1319762259721756,
|
||
|
|
"num_tokens": 4190065.0,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.011490678787231,
|
||
|
|
"epoch": 0.191136315900021,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004998706285993465,
|
||
|
|
"loss": 6.069,
|
||
|
|
"mean_token_accuracy": 0.13331144750118257,
|
||
|
|
"num_tokens": 4198395.0,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.173086833953858,
|
||
|
|
"epoch": 0.19155639571518587,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998696112308381,
|
||
|
|
"loss": 6.093,
|
||
|
|
"mean_token_accuracy": 0.1271330051124096,
|
||
|
|
"num_tokens": 4207555.0,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0555767059326175,
|
||
|
|
"epoch": 0.19197647553035077,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998685898788803,
|
||
|
|
"loss": 6.0375,
|
||
|
|
"mean_token_accuracy": 0.1309538424015045,
|
||
|
|
"num_tokens": 4216533.0,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.211866235733032,
|
||
|
|
"epoch": 0.19239655534551564,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004998675645434914,
|
||
|
|
"loss": 6.1419,
|
||
|
|
"mean_token_accuracy": 0.1353093557059765,
|
||
|
|
"num_tokens": 4225575.0,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.018606328964234,
|
||
|
|
"epoch": 0.19281663516068054,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004998665352246891,
|
||
|
|
"loss": 5.9193,
|
||
|
|
"mean_token_accuracy": 0.13810657039284707,
|
||
|
|
"num_tokens": 4234306.0,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.014672660827637,
|
||
|
|
"epoch": 0.1932367149758454,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004998655019224921,
|
||
|
|
"loss": 6.1267,
|
||
|
|
"mean_token_accuracy": 0.12904786244034766,
|
||
|
|
"num_tokens": 4243998.0,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134347867965698,
|
||
|
|
"epoch": 0.19365679479101028,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004998644646369185,
|
||
|
|
"loss": 6.0238,
|
||
|
|
"mean_token_accuracy": 0.12680166810750962,
|
||
|
|
"num_tokens": 4253653.0,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.066501617431641,
|
||
|
|
"epoch": 0.19407687460617518,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998634233679865,
|
||
|
|
"loss": 6.0895,
|
||
|
|
"mean_token_accuracy": 0.12311211153864861,
|
||
|
|
"num_tokens": 4263305.0,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.049868440628051,
|
||
|
|
"epoch": 0.19449695442134005,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000499862378115715,
|
||
|
|
"loss": 5.983,
|
||
|
|
"mean_token_accuracy": 0.13395097106695175,
|
||
|
|
"num_tokens": 4272212.0,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.165916633605957,
|
||
|
|
"epoch": 0.19491703423650494,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004998613288801221,
|
||
|
|
"loss": 6.1922,
|
||
|
|
"mean_token_accuracy": 0.1247316338121891,
|
||
|
|
"num_tokens": 4281445.0,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.179806041717529,
|
||
|
|
"epoch": 0.1953371140516698,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004998602756612267,
|
||
|
|
"loss": 6.0898,
|
||
|
|
"mean_token_accuracy": 0.12693395391106604,
|
||
|
|
"num_tokens": 4290938.0,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.070136451721192,
|
||
|
|
"epoch": 0.1957571938668347,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004998592184590471,
|
||
|
|
"loss": 6.1397,
|
||
|
|
"mean_token_accuracy": 0.12676772177219392,
|
||
|
|
"num_tokens": 4300022.0,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.06673412322998,
|
||
|
|
"epoch": 0.19617727368199958,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004998581572736024,
|
||
|
|
"loss": 6.0179,
|
||
|
|
"mean_token_accuracy": 0.13165862262248992,
|
||
|
|
"num_tokens": 4308910.0,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.994941234588623,
|
||
|
|
"epoch": 0.19659735349716445,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004998570921049112,
|
||
|
|
"loss": 5.9863,
|
||
|
|
"mean_token_accuracy": 0.135918989777565,
|
||
|
|
"num_tokens": 4317136.0,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.102301931381225,
|
||
|
|
"epoch": 0.19701743331232935,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004998560229529924,
|
||
|
|
"loss": 6.0425,
|
||
|
|
"mean_token_accuracy": 0.13503788635134698,
|
||
|
|
"num_tokens": 4326163.0,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.227736186981201,
|
||
|
|
"epoch": 0.19743751312749422,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998549498178649,
|
||
|
|
"loss": 6.1881,
|
||
|
|
"mean_token_accuracy": 0.13264173418283462,
|
||
|
|
"num_tokens": 4335837.0,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1506922245025635,
|
||
|
|
"epoch": 0.19785759294265912,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004998538726995477,
|
||
|
|
"loss": 6.1094,
|
||
|
|
"mean_token_accuracy": 0.13223380818963051,
|
||
|
|
"num_tokens": 4345108.0,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.144142389297485,
|
||
|
|
"epoch": 0.198277672757824,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00049985279159806,
|
||
|
|
"loss": 6.1229,
|
||
|
|
"mean_token_accuracy": 0.1271647334098816,
|
||
|
|
"num_tokens": 4353761.0,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1053972244262695,
|
||
|
|
"epoch": 0.19869775257298886,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998517065134208,
|
||
|
|
"loss": 6.0771,
|
||
|
|
"mean_token_accuracy": 0.1304875746369362,
|
||
|
|
"num_tokens": 4363244.0,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.125473690032959,
|
||
|
|
"epoch": 0.19911783238815375,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004998506174456494,
|
||
|
|
"loss": 6.0856,
|
||
|
|
"mean_token_accuracy": 0.1269718214869499,
|
||
|
|
"num_tokens": 4373034.0,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.056502437591552,
|
||
|
|
"epoch": 0.19953791220331862,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004998495243947653,
|
||
|
|
"loss": 6.0113,
|
||
|
|
"mean_token_accuracy": 0.12611002326011658,
|
||
|
|
"num_tokens": 4382554.0,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.116158485412598,
|
||
|
|
"epoch": 0.19995799201848352,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0004998484273607875,
|
||
|
|
"loss": 6.0324,
|
||
|
|
"mean_token_accuracy": 0.13722692728042601,
|
||
|
|
"num_tokens": 4391001.0,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.908738136291504,
|
||
|
|
"epoch": 0.2003780718336484,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998473263437356,
|
||
|
|
"loss": 5.9468,
|
||
|
|
"mean_token_accuracy": 0.1328367456793785,
|
||
|
|
"num_tokens": 4400632.0,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.068370723724366,
|
||
|
|
"epoch": 0.20079815164881326,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000499846221343629,
|
||
|
|
"loss": 6.0486,
|
||
|
|
"mean_token_accuracy": 0.12969876527786256,
|
||
|
|
"num_tokens": 4409565.0,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.078929996490478,
|
||
|
|
"epoch": 0.20121823146397816,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998451123604875,
|
||
|
|
"loss": 5.9972,
|
||
|
|
"mean_token_accuracy": 0.13624220937490464,
|
||
|
|
"num_tokens": 4418384.0,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.103708171844483,
|
||
|
|
"epoch": 0.20163831127914303,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004998439993943306,
|
||
|
|
"loss": 6.11,
|
||
|
|
"mean_token_accuracy": 0.13608327358961106,
|
||
|
|
"num_tokens": 4427581.0,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2018999576568605,
|
||
|
|
"epoch": 0.20205839109430793,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004998428824451779,
|
||
|
|
"loss": 6.1047,
|
||
|
|
"mean_token_accuracy": 0.1272777199745178,
|
||
|
|
"num_tokens": 4436572.0,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.056638908386231,
|
||
|
|
"epoch": 0.2024784709094728,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004998417615130495,
|
||
|
|
"loss": 6.1099,
|
||
|
|
"mean_token_accuracy": 0.12568870037794114,
|
||
|
|
"num_tokens": 4445230.0,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.192966461181641,
|
||
|
|
"epoch": 0.2028985507246377,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004998406365979649,
|
||
|
|
"loss": 6.1712,
|
||
|
|
"mean_token_accuracy": 0.12947247475385665,
|
||
|
|
"num_tokens": 4454251.0,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0738544940948485,
|
||
|
|
"epoch": 0.20331863053980256,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998395076999443,
|
||
|
|
"loss": 6.0246,
|
||
|
|
"mean_token_accuracy": 0.1331735722720623,
|
||
|
|
"num_tokens": 4463949.0,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.164913845062256,
|
||
|
|
"epoch": 0.20373871035496743,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004998383748190076,
|
||
|
|
"loss": 6.2178,
|
||
|
|
"mean_token_accuracy": 0.12642809972167016,
|
||
|
|
"num_tokens": 4473373.0,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.169246625900269,
|
||
|
|
"epoch": 0.20415879017013233,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998372379551748,
|
||
|
|
"loss": 6.0443,
|
||
|
|
"mean_token_accuracy": 0.13512365892529488,
|
||
|
|
"num_tokens": 4482303.0,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.000651454925537,
|
||
|
|
"epoch": 0.2045788699852972,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998360971084663,
|
||
|
|
"loss": 6.0248,
|
||
|
|
"mean_token_accuracy": 0.1257840245962143,
|
||
|
|
"num_tokens": 4491214.0,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.060888242721558,
|
||
|
|
"epoch": 0.2049989498004621,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998349522789019,
|
||
|
|
"loss": 5.9365,
|
||
|
|
"mean_token_accuracy": 0.14086327105760574,
|
||
|
|
"num_tokens": 4500099.0,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.020166492462158,
|
||
|
|
"epoch": 0.20541902961562697,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998338034665021,
|
||
|
|
"loss": 6.0199,
|
||
|
|
"mean_token_accuracy": 0.13966668471693994,
|
||
|
|
"num_tokens": 4509893.0,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.064390420913696,
|
||
|
|
"epoch": 0.20583910943079184,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998326506712872,
|
||
|
|
"loss": 5.9974,
|
||
|
|
"mean_token_accuracy": 0.13378938734531404,
|
||
|
|
"num_tokens": 4518606.0,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.097909021377563,
|
||
|
|
"epoch": 0.20625918924595674,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004998314938932778,
|
||
|
|
"loss": 6.0759,
|
||
|
|
"mean_token_accuracy": 0.1298009656369686,
|
||
|
|
"num_tokens": 4528392.0,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1035826206207275,
|
||
|
|
"epoch": 0.2066792690611216,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004998303331324943,
|
||
|
|
"loss": 6.0416,
|
||
|
|
"mean_token_accuracy": 0.13463694974780083,
|
||
|
|
"num_tokens": 4536983.0,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9858495712280275,
|
||
|
|
"epoch": 0.2070993488762865,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004998291683889571,
|
||
|
|
"loss": 5.9442,
|
||
|
|
"mean_token_accuracy": 0.13662122339010238,
|
||
|
|
"num_tokens": 4544967.0,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.056029415130615,
|
||
|
|
"epoch": 0.20751942869145137,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.000499827999662687,
|
||
|
|
"loss": 6.0242,
|
||
|
|
"mean_token_accuracy": 0.12964650020003318,
|
||
|
|
"num_tokens": 4554646.0,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.118838214874268,
|
||
|
|
"epoch": 0.20793950850661624,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998268269537046,
|
||
|
|
"loss": 6.0401,
|
||
|
|
"mean_token_accuracy": 0.13539641574025155,
|
||
|
|
"num_tokens": 4564040.0,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.022972631454468,
|
||
|
|
"epoch": 0.20835958832178114,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998256502620308,
|
||
|
|
"loss": 6.0624,
|
||
|
|
"mean_token_accuracy": 0.13345976546406746,
|
||
|
|
"num_tokens": 4573758.0,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.193491125106812,
|
||
|
|
"epoch": 0.208779668136946,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004998244695876864,
|
||
|
|
"loss": 6.0874,
|
||
|
|
"mean_token_accuracy": 0.13196430653333663,
|
||
|
|
"num_tokens": 4582097.0,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.018001937866211,
|
||
|
|
"epoch": 0.2091997479521109,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004998232849306921,
|
||
|
|
"loss": 6.064,
|
||
|
|
"mean_token_accuracy": 0.1368905283510685,
|
||
|
|
"num_tokens": 4590687.0,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.152202367782593,
|
||
|
|
"epoch": 0.20961982776727578,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004998220962910693,
|
||
|
|
"loss": 6.0475,
|
||
|
|
"mean_token_accuracy": 0.12533890679478646,
|
||
|
|
"num_tokens": 4599497.0,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059301280975342,
|
||
|
|
"epoch": 0.21003990758244068,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004998209036688386,
|
||
|
|
"loss": 6.0091,
|
||
|
|
"mean_token_accuracy": 0.12979092076420784,
|
||
|
|
"num_tokens": 4607958.0,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.12682089805603,
|
||
|
|
"epoch": 0.21045998739760555,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998197070640216,
|
||
|
|
"loss": 6.1445,
|
||
|
|
"mean_token_accuracy": 0.12323907017707825,
|
||
|
|
"num_tokens": 4617515.0,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.13975419998169,
|
||
|
|
"epoch": 0.21088006721277042,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004998185064766391,
|
||
|
|
"loss": 6.028,
|
||
|
|
"mean_token_accuracy": 0.13126113414764404,
|
||
|
|
"num_tokens": 4627037.0,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.999127197265625,
|
||
|
|
"epoch": 0.21130014702793531,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004998173019067127,
|
||
|
|
"loss": 6.0335,
|
||
|
|
"mean_token_accuracy": 0.13387575298547744,
|
||
|
|
"num_tokens": 4637393.0,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.049172449111938,
|
||
|
|
"epoch": 0.21172022684310018,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998160933542633,
|
||
|
|
"loss": 6.0685,
|
||
|
|
"mean_token_accuracy": 0.12128801420331001,
|
||
|
|
"num_tokens": 4646832.0,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.16112699508667,
|
||
|
|
"epoch": 0.21214030665826508,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004998148808193128,
|
||
|
|
"loss": 6.095,
|
||
|
|
"mean_token_accuracy": 0.1346332848072052,
|
||
|
|
"num_tokens": 4655719.0,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.126083850860596,
|
||
|
|
"epoch": 0.21256038647342995,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998136643018823,
|
||
|
|
"loss": 6.0477,
|
||
|
|
"mean_token_accuracy": 0.12910717576742173,
|
||
|
|
"num_tokens": 4665364.0,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.087383460998535,
|
||
|
|
"epoch": 0.21298046628859482,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004998124438019935,
|
||
|
|
"loss": 6.0166,
|
||
|
|
"mean_token_accuracy": 0.1316668502986431,
|
||
|
|
"num_tokens": 4674760.0,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.993421936035157,
|
||
|
|
"epoch": 0.21340054610375972,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004998112193196681,
|
||
|
|
"loss": 5.9488,
|
||
|
|
"mean_token_accuracy": 0.13391186147928238,
|
||
|
|
"num_tokens": 4683900.0,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.969591331481934,
|
||
|
|
"epoch": 0.2138206259189246,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004998099908549277,
|
||
|
|
"loss": 5.9886,
|
||
|
|
"mean_token_accuracy": 0.1273488573729992,
|
||
|
|
"num_tokens": 4693915.0,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9875883102417,
|
||
|
|
"epoch": 0.2142407057340895,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000499808758407794,
|
||
|
|
"loss": 5.8619,
|
||
|
|
"mean_token_accuracy": 0.13991126343607901,
|
||
|
|
"num_tokens": 4703102.0,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.031775951385498,
|
||
|
|
"epoch": 0.21466078554925436,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004998075219782889,
|
||
|
|
"loss": 6.0787,
|
||
|
|
"mean_token_accuracy": 0.1323968604207039,
|
||
|
|
"num_tokens": 4712925.0,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.099209594726562,
|
||
|
|
"epoch": 0.21508086536441923,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004998062815664344,
|
||
|
|
"loss": 6.0069,
|
||
|
|
"mean_token_accuracy": 0.12949655801057816,
|
||
|
|
"num_tokens": 4722641.0,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.046544742584229,
|
||
|
|
"epoch": 0.21550094517958412,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004998050371722524,
|
||
|
|
"loss": 6.0781,
|
||
|
|
"mean_token_accuracy": 0.12990766763687134,
|
||
|
|
"num_tokens": 4732603.0,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.932075929641724,
|
||
|
|
"epoch": 0.215921024994749,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998037887957649,
|
||
|
|
"loss": 5.9211,
|
||
|
|
"mean_token_accuracy": 0.13785294219851493,
|
||
|
|
"num_tokens": 4742644.0,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.21406192779541,
|
||
|
|
"epoch": 0.2163411048099139,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004998025364369939,
|
||
|
|
"loss": 6.2335,
|
||
|
|
"mean_token_accuracy": 0.1234040841460228,
|
||
|
|
"num_tokens": 4751482.0,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.237205886840821,
|
||
|
|
"epoch": 0.21676118462507876,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004998012800959619,
|
||
|
|
"loss": 6.0891,
|
||
|
|
"mean_token_accuracy": 0.12757375389337539,
|
||
|
|
"num_tokens": 4760593.0,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.093921661376953,
|
||
|
|
"epoch": 0.21718126444024366,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004998000197726909,
|
||
|
|
"loss": 6.0827,
|
||
|
|
"mean_token_accuracy": 0.13335589170455933,
|
||
|
|
"num_tokens": 4769294.0,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.031546688079834,
|
||
|
|
"epoch": 0.21760134425540853,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004997987554672033,
|
||
|
|
"loss": 6.0081,
|
||
|
|
"mean_token_accuracy": 0.13305121287703514,
|
||
|
|
"num_tokens": 4779239.0,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059205436706543,
|
||
|
|
"epoch": 0.2180214240705734,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004997974871795215,
|
||
|
|
"loss": 6.0716,
|
||
|
|
"mean_token_accuracy": 0.13057481795549392,
|
||
|
|
"num_tokens": 4788211.0,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.109251928329468,
|
||
|
|
"epoch": 0.2184415038857383,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499796214909668,
|
||
|
|
"loss": 6.0447,
|
||
|
|
"mean_token_accuracy": 0.13531798869371414,
|
||
|
|
"num_tokens": 4797921.0,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.092241191864014,
|
||
|
|
"epoch": 0.21886158370090317,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004997949386576653,
|
||
|
|
"loss": 6.0378,
|
||
|
|
"mean_token_accuracy": 0.13213689997792244,
|
||
|
|
"num_tokens": 4807772.0,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.042962265014649,
|
||
|
|
"epoch": 0.21928166351606806,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000499793658423536,
|
||
|
|
"loss": 6.0593,
|
||
|
|
"mean_token_accuracy": 0.13149860948324205,
|
||
|
|
"num_tokens": 4817999.0,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.057756137847901,
|
||
|
|
"epoch": 0.21970174333123293,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004997923742073028,
|
||
|
|
"loss": 6.0136,
|
||
|
|
"mean_token_accuracy": 0.13949006497859956,
|
||
|
|
"num_tokens": 4826679.0,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.998235082626342,
|
||
|
|
"epoch": 0.2201218231463978,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004997910860089884,
|
||
|
|
"loss": 6.0157,
|
||
|
|
"mean_token_accuracy": 0.13456794619560242,
|
||
|
|
"num_tokens": 4834998.0,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.064208889007569,
|
||
|
|
"epoch": 0.2205419029615627,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004997897938286156,
|
||
|
|
"loss": 5.9717,
|
||
|
|
"mean_token_accuracy": 0.1337368108332157,
|
||
|
|
"num_tokens": 4843635.0,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.085119295120239,
|
||
|
|
"epoch": 0.22096198277672757,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004997884976662075,
|
||
|
|
"loss": 6.0919,
|
||
|
|
"mean_token_accuracy": 0.12607687711715698,
|
||
|
|
"num_tokens": 4852027.0,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.183318328857422,
|
||
|
|
"epoch": 0.22138206259189247,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004997871975217868,
|
||
|
|
"loss": 6.0165,
|
||
|
|
"mean_token_accuracy": 0.1429324761033058,
|
||
|
|
"num_tokens": 4861244.0,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.912706756591797,
|
||
|
|
"epoch": 0.22180214240705734,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004997858933953768,
|
||
|
|
"loss": 5.9326,
|
||
|
|
"mean_token_accuracy": 0.1404939979314804,
|
||
|
|
"num_tokens": 4869902.0,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.963629674911499,
|
||
|
|
"epoch": 0.2222222222222222,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004997845852870004,
|
||
|
|
"loss": 5.8982,
|
||
|
|
"mean_token_accuracy": 0.14085923954844476,
|
||
|
|
"num_tokens": 4878502.0,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.986082458496094,
|
||
|
|
"epoch": 0.2226423020373871,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004997832731966806,
|
||
|
|
"loss": 5.964,
|
||
|
|
"mean_token_accuracy": 0.14047276899218558,
|
||
|
|
"num_tokens": 4888348.0,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.051373815536499,
|
||
|
|
"epoch": 0.22306238185255198,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004997819571244411,
|
||
|
|
"loss": 6.0172,
|
||
|
|
"mean_token_accuracy": 0.13845039829611777,
|
||
|
|
"num_tokens": 4897302.0,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.01381549835205,
|
||
|
|
"epoch": 0.22348246166771688,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004997806370703049,
|
||
|
|
"loss": 6.0476,
|
||
|
|
"mean_token_accuracy": 0.13289312049746513,
|
||
|
|
"num_tokens": 4907078.0,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.983912467956543,
|
||
|
|
"epoch": 0.22390254148288175,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004997793130342954,
|
||
|
|
"loss": 5.8784,
|
||
|
|
"mean_token_accuracy": 0.1382697917521,
|
||
|
|
"num_tokens": 4917489.0,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.94772891998291,
|
||
|
|
"epoch": 0.22432262129804661,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004997779850164363,
|
||
|
|
"loss": 5.9836,
|
||
|
|
"mean_token_accuracy": 0.13369291126728058,
|
||
|
|
"num_tokens": 4927073.0,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.121642923355102,
|
||
|
|
"epoch": 0.2247427011132115,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004997766530167508,
|
||
|
|
"loss": 6.0821,
|
||
|
|
"mean_token_accuracy": 0.1270790107548237,
|
||
|
|
"num_tokens": 4935464.0,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.221409273147583,
|
||
|
|
"epoch": 0.22516278092837638,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004997753170352627,
|
||
|
|
"loss": 6.1649,
|
||
|
|
"mean_token_accuracy": 0.12717002481222153,
|
||
|
|
"num_tokens": 4944718.0,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.084948205947876,
|
||
|
|
"epoch": 0.22558286074354128,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004997739770719955,
|
||
|
|
"loss": 6.0396,
|
||
|
|
"mean_token_accuracy": 0.1332695096731186,
|
||
|
|
"num_tokens": 4954223.0,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.003955984115601,
|
||
|
|
"epoch": 0.22600294055870615,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.000499772633126973,
|
||
|
|
"loss": 6.0733,
|
||
|
|
"mean_token_accuracy": 0.1317312702536583,
|
||
|
|
"num_tokens": 4963371.0,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.013844203948975,
|
||
|
|
"epoch": 0.22642302037387105,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004997712852002192,
|
||
|
|
"loss": 5.9358,
|
||
|
|
"mean_token_accuracy": 0.14093514010310174,
|
||
|
|
"num_tokens": 4972973.0,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059261226654053,
|
||
|
|
"epoch": 0.22684310018903592,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004997699332917578,
|
||
|
|
"loss": 6.1739,
|
||
|
|
"mean_token_accuracy": 0.12389883399009705,
|
||
|
|
"num_tokens": 4982808.0,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.180717802047729,
|
||
|
|
"epoch": 0.2272631800042008,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004997685774016127,
|
||
|
|
"loss": 6.0444,
|
||
|
|
"mean_token_accuracy": 0.13330344706773758,
|
||
|
|
"num_tokens": 4992427.0,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1143828392028805,
|
||
|
|
"epoch": 0.22768325981936569,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.000499767217529808,
|
||
|
|
"loss": 6.2262,
|
||
|
|
"mean_token_accuracy": 0.12522902861237525,
|
||
|
|
"num_tokens": 5003562.0,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.120408248901367,
|
||
|
|
"epoch": 0.22810333963453056,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004997658536763678,
|
||
|
|
"loss": 5.9207,
|
||
|
|
"mean_token_accuracy": 0.13713482916355133,
|
||
|
|
"num_tokens": 5013429.0,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.080751562118531,
|
||
|
|
"epoch": 0.22852341944969545,
|
||
|
|
"grad_norm": 1.2265625,
|
||
|
|
"learning_rate": 0.0004997644858413163,
|
||
|
|
"loss": 6.046,
|
||
|
|
"mean_token_accuracy": 0.13544052764773368,
|
||
|
|
"num_tokens": 5022045.0,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.984566640853882,
|
||
|
|
"epoch": 0.22894349926486032,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004997631140246775,
|
||
|
|
"loss": 5.8853,
|
||
|
|
"mean_token_accuracy": 0.14113514721393586,
|
||
|
|
"num_tokens": 5032260.0,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9389331340789795,
|
||
|
|
"epoch": 0.2293635790800252,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.000499761738226476,
|
||
|
|
"loss": 5.9276,
|
||
|
|
"mean_token_accuracy": 0.13583676218986512,
|
||
|
|
"num_tokens": 5041688.0,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.007482099533081,
|
||
|
|
"epoch": 0.2297836588951901,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.000499760358446736,
|
||
|
|
"loss": 6.0417,
|
||
|
|
"mean_token_accuracy": 0.1291549324989319,
|
||
|
|
"num_tokens": 5051005.0,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1208288192749025,
|
||
|
|
"epoch": 0.23020373871035496,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000499758974685482,
|
||
|
|
"loss": 5.9698,
|
||
|
|
"mean_token_accuracy": 0.13492617905139923,
|
||
|
|
"num_tokens": 5060084.0,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.010481119155884,
|
||
|
|
"epoch": 0.23062381852551986,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 0.0004997575869427385,
|
||
|
|
"loss": 5.9731,
|
||
|
|
"mean_token_accuracy": 0.14254927188158034,
|
||
|
|
"num_tokens": 5069081.0,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.021266603469849,
|
||
|
|
"epoch": 0.23104389834068473,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00049975619521853,
|
||
|
|
"loss": 5.9703,
|
||
|
|
"mean_token_accuracy": 0.13409337997436524,
|
||
|
|
"num_tokens": 5078597.0,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.943169069290161,
|
||
|
|
"epoch": 0.2314639781558496,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004997547995128814,
|
||
|
|
"loss": 6.0084,
|
||
|
|
"mean_token_accuracy": 0.13727526888251304,
|
||
|
|
"num_tokens": 5087607.0,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.111000204086304,
|
||
|
|
"epoch": 0.2318840579710145,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004997533998258171,
|
||
|
|
"loss": 6.0123,
|
||
|
|
"mean_token_accuracy": 0.1351937808096409,
|
||
|
|
"num_tokens": 5097412.0,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.129235696792603,
|
||
|
|
"epoch": 0.23230413778617937,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004997519961573622,
|
||
|
|
"loss": 6.0735,
|
||
|
|
"mean_token_accuracy": 0.1282409645617008,
|
||
|
|
"num_tokens": 5105817.0,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1673665046691895,
|
||
|
|
"epoch": 0.23272421760134426,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004997505885075414,
|
||
|
|
"loss": 6.1269,
|
||
|
|
"mean_token_accuracy": 0.12907201573252677,
|
||
|
|
"num_tokens": 5114958.0,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.069322109222412,
|
||
|
|
"epoch": 0.23314429741650913,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004997491768763795,
|
||
|
|
"loss": 6.0425,
|
||
|
|
"mean_token_accuracy": 0.13409897387027742,
|
||
|
|
"num_tokens": 5123728.0,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.003434944152832,
|
||
|
|
"epoch": 0.23356437723167403,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.0004997477612639018,
|
||
|
|
"loss": 6.0871,
|
||
|
|
"mean_token_accuracy": 0.12734304070472718,
|
||
|
|
"num_tokens": 5134099.0,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.186435317993164,
|
||
|
|
"epoch": 0.2339844570468389,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004997463416701332,
|
||
|
|
"loss": 6.094,
|
||
|
|
"mean_token_accuracy": 0.1274227410554886,
|
||
|
|
"num_tokens": 5142934.0,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.043578577041626,
|
||
|
|
"epoch": 0.23440453686200377,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004997449180950989,
|
||
|
|
"loss": 5.9298,
|
||
|
|
"mean_token_accuracy": 0.1532392293214798,
|
||
|
|
"num_tokens": 5151835.0,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.953121995925903,
|
||
|
|
"epoch": 0.23482461667716867,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004997434905388241,
|
||
|
|
"loss": 5.9842,
|
||
|
|
"mean_token_accuracy": 0.1413706734776497,
|
||
|
|
"num_tokens": 5161136.0,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0334107875823975,
|
||
|
|
"epoch": 0.23524469649233354,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000499742059001334,
|
||
|
|
"loss": 5.9191,
|
||
|
|
"mean_token_accuracy": 0.1378956101834774,
|
||
|
|
"num_tokens": 5170741.0,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.991379880905152,
|
||
|
|
"epoch": 0.23566477630749844,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0004997406234826541,
|
||
|
|
"loss": 5.9539,
|
||
|
|
"mean_token_accuracy": 0.14059103950858115,
|
||
|
|
"num_tokens": 5180549.0,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.995284509658814,
|
||
|
|
"epoch": 0.2360848561226633,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004997391839828098,
|
||
|
|
"loss": 5.9249,
|
||
|
|
"mean_token_accuracy": 0.14390118718147277,
|
||
|
|
"num_tokens": 5189486.0,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.030531978607177,
|
||
|
|
"epoch": 0.23650493593782818,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004997377405018266,
|
||
|
|
"loss": 6.0032,
|
||
|
|
"mean_token_accuracy": 0.13120983093976973,
|
||
|
|
"num_tokens": 5198525.0,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0725666046142575,
|
||
|
|
"epoch": 0.23692501575299307,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00049973629303973,
|
||
|
|
"loss": 6.0662,
|
||
|
|
"mean_token_accuracy": 0.1294946141541004,
|
||
|
|
"num_tokens": 5207124.0,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.958557415008545,
|
||
|
|
"epoch": 0.23734509556815794,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004997348415965457,
|
||
|
|
"loss": 5.878,
|
||
|
|
"mean_token_accuracy": 0.13335178643465043,
|
||
|
|
"num_tokens": 5216529.0,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.007561159133911,
|
||
|
|
"epoch": 0.23776517538332284,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0004997333861722995,
|
||
|
|
"loss": 6.0169,
|
||
|
|
"mean_token_accuracy": 0.13635273203253745,
|
||
|
|
"num_tokens": 5225796.0,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.125902462005615,
|
||
|
|
"epoch": 0.2381852551984877,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000499731926767017,
|
||
|
|
"loss": 6.0359,
|
||
|
|
"mean_token_accuracy": 0.1375264048576355,
|
||
|
|
"num_tokens": 5233876.0,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.989985036849975,
|
||
|
|
"epoch": 0.23860533501365258,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004997304633807242,
|
||
|
|
"loss": 6.0396,
|
||
|
|
"mean_token_accuracy": 0.12682786211371422,
|
||
|
|
"num_tokens": 5244782.0,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.019674825668335,
|
||
|
|
"epoch": 0.23902541482881748,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004997289960134468,
|
||
|
|
"loss": 5.9886,
|
||
|
|
"mean_token_accuracy": 0.13695719763636588,
|
||
|
|
"num_tokens": 5253453.0,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0026778221130375,
|
||
|
|
"epoch": 0.23944549464398235,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004997275246652111,
|
||
|
|
"loss": 6.0149,
|
||
|
|
"mean_token_accuracy": 0.13926383331418038,
|
||
|
|
"num_tokens": 5262355.0,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.99656400680542,
|
||
|
|
"epoch": 0.23986557445914725,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000499726049336043,
|
||
|
|
"loss": 5.9374,
|
||
|
|
"mean_token_accuracy": 0.13838583379983901,
|
||
|
|
"num_tokens": 5271959.0,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.058608770370483,
|
||
|
|
"epoch": 0.24028565427431212,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004997245700259686,
|
||
|
|
"loss": 5.9673,
|
||
|
|
"mean_token_accuracy": 0.1403045229613781,
|
||
|
|
"num_tokens": 5281393.0,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.061829471588135,
|
||
|
|
"epoch": 0.240705734089477,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004997230867350141,
|
||
|
|
"loss": 6.0878,
|
||
|
|
"mean_token_accuracy": 0.1320396728813648,
|
||
|
|
"num_tokens": 5290979.0,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.128190040588379,
|
||
|
|
"epoch": 0.24112581390464188,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004997215994632059,
|
||
|
|
"loss": 6.0392,
|
||
|
|
"mean_token_accuracy": 0.13521442338824272,
|
||
|
|
"num_tokens": 5300263.0,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.065250301361084,
|
||
|
|
"epoch": 0.24154589371980675,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004997201082105704,
|
||
|
|
"loss": 6.0654,
|
||
|
|
"mean_token_accuracy": 0.12793515026569366,
|
||
|
|
"num_tokens": 5309522.0,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059223175048828,
|
||
|
|
"epoch": 0.24196597353497165,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004997186129771338,
|
||
|
|
"loss": 6.0625,
|
||
|
|
"mean_token_accuracy": 0.13326726630330085,
|
||
|
|
"num_tokens": 5319770.0,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.18207311630249,
|
||
|
|
"epoch": 0.24238605335013652,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004997171137629226,
|
||
|
|
"loss": 6.0695,
|
||
|
|
"mean_token_accuracy": 0.13562847971916198,
|
||
|
|
"num_tokens": 5328400.0,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.968668270111084,
|
||
|
|
"epoch": 0.24280613316530142,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.0004997156105679636,
|
||
|
|
"loss": 5.8716,
|
||
|
|
"mean_token_accuracy": 0.14514228701591492,
|
||
|
|
"num_tokens": 5336338.0,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.89683952331543,
|
||
|
|
"epoch": 0.2432262129804663,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004997141033922832,
|
||
|
|
"loss": 5.9748,
|
||
|
|
"mean_token_accuracy": 0.1309155747294426,
|
||
|
|
"num_tokens": 5345391.0,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.103964805603027,
|
||
|
|
"epoch": 0.24364629279563116,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004997125922359081,
|
||
|
|
"loss": 6.0044,
|
||
|
|
"mean_token_accuracy": 0.12651756703853606,
|
||
|
|
"num_tokens": 5354709.0,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.039173555374146,
|
||
|
|
"epoch": 0.24406637261079606,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004997110770988652,
|
||
|
|
"loss": 5.9187,
|
||
|
|
"mean_token_accuracy": 0.13533097133040428,
|
||
|
|
"num_tokens": 5363738.0,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.009365177154541,
|
||
|
|
"epoch": 0.24448645242596093,
|
||
|
|
"grad_norm": 1.34375,
|
||
|
|
"learning_rate": 0.0004997095579811813,
|
||
|
|
"loss": 6.0492,
|
||
|
|
"mean_token_accuracy": 0.13356854170560836,
|
||
|
|
"num_tokens": 5373583.0,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.10346941947937,
|
||
|
|
"epoch": 0.24490653224112582,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004997080348828833,
|
||
|
|
"loss": 6.0964,
|
||
|
|
"mean_token_accuracy": 0.1329493686556816,
|
||
|
|
"num_tokens": 5383486.0,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.022554492950439,
|
||
|
|
"epoch": 0.2453266120562907,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004997065078039981,
|
||
|
|
"loss": 5.995,
|
||
|
|
"mean_token_accuracy": 0.1254143126308918,
|
||
|
|
"num_tokens": 5391974.0,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.089977025985718,
|
||
|
|
"epoch": 0.24574669187145556,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004997049767445529,
|
||
|
|
"loss": 6.0288,
|
||
|
|
"mean_token_accuracy": 0.12984034791588783,
|
||
|
|
"num_tokens": 5400882.0,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.110510158538818,
|
||
|
|
"epoch": 0.24616677168662046,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004997034417045746,
|
||
|
|
"loss": 5.9927,
|
||
|
|
"mean_token_accuracy": 0.1267140880227089,
|
||
|
|
"num_tokens": 5410538.0,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.971307563781738,
|
||
|
|
"epoch": 0.24658685150178533,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004997019026840907,
|
||
|
|
"loss": 5.8743,
|
||
|
|
"mean_token_accuracy": 0.13612414821982383,
|
||
|
|
"num_tokens": 5419406.0,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.88221755027771,
|
||
|
|
"epoch": 0.24700693131695023,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004997003596831282,
|
||
|
|
"loss": 5.9978,
|
||
|
|
"mean_token_accuracy": 0.13463943675160409,
|
||
|
|
"num_tokens": 5428817.0,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0984635829925535,
|
||
|
|
"epoch": 0.2474270111321151,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004996988127017145,
|
||
|
|
"loss": 6.0253,
|
||
|
|
"mean_token_accuracy": 0.13181837573647498,
|
||
|
|
"num_tokens": 5438277.0,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0544061183929445,
|
||
|
|
"epoch": 0.24784709094728,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0004996972617398772,
|
||
|
|
"loss": 6.042,
|
||
|
|
"mean_token_accuracy": 0.13205936923623085,
|
||
|
|
"num_tokens": 5447440.0,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0680958271026615,
|
||
|
|
"epoch": 0.24826717076244487,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004996957067976435,
|
||
|
|
"loss": 5.9541,
|
||
|
|
"mean_token_accuracy": 0.1357963502407074,
|
||
|
|
"num_tokens": 5455988.0,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0058001518249515,
|
||
|
|
"epoch": 0.24868725057760974,
|
||
|
|
"grad_norm": 1.3203125,
|
||
|
|
"learning_rate": 0.0004996941478750411,
|
||
|
|
"loss": 5.9769,
|
||
|
|
"mean_token_accuracy": 0.1373401865363121,
|
||
|
|
"num_tokens": 5464996.0,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.083559465408325,
|
||
|
|
"epoch": 0.24910733039277463,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004996925849720975,
|
||
|
|
"loss": 6.1025,
|
||
|
|
"mean_token_accuracy": 0.12863337025046348,
|
||
|
|
"num_tokens": 5474174.0,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.146986627578736,
|
||
|
|
"epoch": 0.2495274102079395,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004996910180888405,
|
||
|
|
"loss": 5.9994,
|
||
|
|
"mean_token_accuracy": 0.13324794694781303,
|
||
|
|
"num_tokens": 5482838.0,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.005090427398682,
|
||
|
|
"epoch": 0.2499474900231044,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004996894472252977,
|
||
|
|
"loss": 6.0195,
|
||
|
|
"mean_token_accuracy": 0.13370491713285446,
|
||
|
|
"num_tokens": 5491616.0,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.99453763961792,
|
||
|
|
"epoch": 0.25036756983826924,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004996878723814973,
|
||
|
|
"loss": 5.9972,
|
||
|
|
"mean_token_accuracy": 0.12933446019887923,
|
||
|
|
"num_tokens": 5500942.0,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.035016107559204,
|
||
|
|
"epoch": 0.25078764965343414,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004996862935574667,
|
||
|
|
"loss": 5.9539,
|
||
|
|
"mean_token_accuracy": 0.13152176290750503,
|
||
|
|
"num_tokens": 5510078.0,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9494434833526615,
|
||
|
|
"epoch": 0.25120772946859904,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004996847107532342,
|
||
|
|
"loss": 5.9763,
|
||
|
|
"mean_token_accuracy": 0.13343006893992423,
|
||
|
|
"num_tokens": 5518924.0,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.115957880020142,
|
||
|
|
"epoch": 0.25162780928376394,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004996831239688277,
|
||
|
|
"loss": 5.9896,
|
||
|
|
"mean_token_accuracy": 0.12950923070311546,
|
||
|
|
"num_tokens": 5527385.0,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.96525821685791,
|
||
|
|
"epoch": 0.2520478890989288,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004996815332042754,
|
||
|
|
"loss": 5.8456,
|
||
|
|
"mean_token_accuracy": 0.14307771176099776,
|
||
|
|
"num_tokens": 5536781.0,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2520478890989288,
|
||
|
|
"eval_entropy": 5.826104599310177,
|
||
|
|
"eval_loss": 6.01594352722168,
|
||
|
|
"eval_mean_token_accuracy": 0.13980411247313787,
|
||
|
|
"eval_num_tokens": 5536781.0,
|
||
|
|
"eval_runtime": 27.3461,
|
||
|
|
"eval_samples_per_second": 1366.412,
|
||
|
|
"eval_steps_per_second": 170.811,
|
||
|
|
"step": 3000
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 119020,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 3000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 8117673873408000.0,
|
||
|
|
"train_batch_size": 16,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|