18068 lines
494 KiB
JSON
18068 lines
494 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.7561436672967864,
|
|
"eval_steps": 3000,
|
|
"global_step": 9000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 10.69201192855835,
|
|
"epoch": 0.0004200798151648813,
|
|
"grad_norm": 13.375,
|
|
"learning_rate": 2e-06,
|
|
"loss": 10.8001,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 8348.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 10.691978454589844,
|
|
"epoch": 0.0008401596303297626,
|
|
"grad_norm": 12.5,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 10.7548,
|
|
"mean_token_accuracy": 0.00010881392518058419,
|
|
"num_tokens": 17465.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 10.691164684295654,
|
|
"epoch": 0.001260239445494644,
|
|
"grad_norm": 9.9375,
|
|
"learning_rate": 7e-06,
|
|
"loss": 10.5365,
|
|
"mean_token_accuracy": 0.021085147676058114,
|
|
"num_tokens": 26627.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 10.678658771514893,
|
|
"epoch": 0.0016803192606595252,
|
|
"grad_norm": 6.46875,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 10.2026,
|
|
"mean_token_accuracy": 0.046403773874044416,
|
|
"num_tokens": 36069.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 10.598964595794678,
|
|
"epoch": 0.002100399075824407,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 9.8984,
|
|
"mean_token_accuracy": 0.04546841159462929,
|
|
"num_tokens": 44967.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.592682838439941,
|
|
"epoch": 0.002520478890989288,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 9.8253,
|
|
"mean_token_accuracy": 0.04163686409592628,
|
|
"num_tokens": 55132.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.616032028198243,
|
|
"epoch": 0.0029405587061541692,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 9.6909,
|
|
"mean_token_accuracy": 0.04541983306407928,
|
|
"num_tokens": 65141.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.587666893005371,
|
|
"epoch": 0.0033606385213190504,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 9.6967,
|
|
"mean_token_accuracy": 0.040509892627596855,
|
|
"num_tokens": 74007.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.587863063812256,
|
|
"epoch": 0.003780718336483932,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 9.6278,
|
|
"mean_token_accuracy": 0.04380051270127296,
|
|
"num_tokens": 83736.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.581284713745116,
|
|
"epoch": 0.004200798151648814,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.5554,
|
|
"mean_token_accuracy": 0.04462047629058361,
|
|
"num_tokens": 92525.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.579821586608887,
|
|
"epoch": 0.004620877966813695,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.5042,
|
|
"mean_token_accuracy": 0.0499776991084218,
|
|
"num_tokens": 102015.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.527470588684082,
|
|
"epoch": 0.005040957781978576,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.4648,
|
|
"mean_token_accuracy": 0.05102687180042267,
|
|
"num_tokens": 110887.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.398450374603271,
|
|
"epoch": 0.005461037597143457,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.3768,
|
|
"mean_token_accuracy": 0.05401572398841381,
|
|
"num_tokens": 120442.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.466637897491456,
|
|
"epoch": 0.0058811174123083385,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.2516,
|
|
"mean_token_accuracy": 0.05276094898581505,
|
|
"num_tokens": 129297.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.477723217010498,
|
|
"epoch": 0.00630119722747322,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.1585,
|
|
"mean_token_accuracy": 0.05686353407800197,
|
|
"num_tokens": 138305.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.401033782958985,
|
|
"epoch": 0.006721277042638101,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.0976,
|
|
"mean_token_accuracy": 0.055690228939056396,
|
|
"num_tokens": 147640.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.44783878326416,
|
|
"epoch": 0.007141356857802983,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 8.9803,
|
|
"mean_token_accuracy": 0.05669833719730377,
|
|
"num_tokens": 157633.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.396310806274414,
|
|
"epoch": 0.007561436672967864,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 8.9499,
|
|
"mean_token_accuracy": 0.05056734494864941,
|
|
"num_tokens": 167984.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.333494663238525,
|
|
"epoch": 0.007981516488132745,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.8301,
|
|
"mean_token_accuracy": 0.06639725379645825,
|
|
"num_tokens": 176984.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.28737268447876,
|
|
"epoch": 0.008401596303297627,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.654,
|
|
"mean_token_accuracy": 0.06538619883358479,
|
|
"num_tokens": 185931.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.208460235595703,
|
|
"epoch": 0.008821676118462508,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.6478,
|
|
"mean_token_accuracy": 0.050938266515731814,
|
|
"num_tokens": 195065.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.092334175109864,
|
|
"epoch": 0.00924175593362739,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.5099,
|
|
"mean_token_accuracy": 0.06477361544966698,
|
|
"num_tokens": 203687.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.105284690856934,
|
|
"epoch": 0.00966183574879227,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.4081,
|
|
"mean_token_accuracy": 0.0666894868016243,
|
|
"num_tokens": 212847.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 9.957781219482422,
|
|
"epoch": 0.010081915563957152,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.3004,
|
|
"mean_token_accuracy": 0.0674133587628603,
|
|
"num_tokens": 222593.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.889359092712402,
|
|
"epoch": 0.010501995379122032,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.129,
|
|
"mean_token_accuracy": 0.07197456955909728,
|
|
"num_tokens": 231174.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.669556808471679,
|
|
"epoch": 0.010922075194286915,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 7.9843,
|
|
"mean_token_accuracy": 0.07425511926412583,
|
|
"num_tokens": 239833.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.519672775268555,
|
|
"epoch": 0.011342155009451797,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.0143,
|
|
"mean_token_accuracy": 0.07254141308367253,
|
|
"num_tokens": 248794.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.303325176239014,
|
|
"epoch": 0.011762234824616677,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.9537,
|
|
"mean_token_accuracy": 0.07010119631886483,
|
|
"num_tokens": 257123.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 9.143257808685302,
|
|
"epoch": 0.012182314639781559,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.6458,
|
|
"mean_token_accuracy": 0.07959595024585724,
|
|
"num_tokens": 266088.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.888239574432372,
|
|
"epoch": 0.01260239445494644,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.8236,
|
|
"mean_token_accuracy": 0.07102414257824421,
|
|
"num_tokens": 276074.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.727731895446777,
|
|
"epoch": 0.013022474270111321,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.7082,
|
|
"mean_token_accuracy": 0.07570267021656037,
|
|
"num_tokens": 285280.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.563877964019776,
|
|
"epoch": 0.013442554085276202,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.6962,
|
|
"mean_token_accuracy": 0.06895132511854171,
|
|
"num_tokens": 296115.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.412875747680664,
|
|
"epoch": 0.013862633900441084,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.5497,
|
|
"mean_token_accuracy": 0.07601302340626717,
|
|
"num_tokens": 305483.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.340911769866944,
|
|
"epoch": 0.014282713715605966,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.5593,
|
|
"mean_token_accuracy": 0.07040085420012474,
|
|
"num_tokens": 314000.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.245043659210205,
|
|
"epoch": 0.014702793530770846,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.5541,
|
|
"mean_token_accuracy": 0.07777635231614113,
|
|
"num_tokens": 323667.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.15629415512085,
|
|
"epoch": 0.015122873345935728,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.5554,
|
|
"mean_token_accuracy": 0.07515333034098148,
|
|
"num_tokens": 332695.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.065321111679078,
|
|
"epoch": 0.015542953161100609,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.3947,
|
|
"mean_token_accuracy": 0.07709791958332061,
|
|
"num_tokens": 342428.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.054158020019532,
|
|
"epoch": 0.01596303297626549,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.5079,
|
|
"mean_token_accuracy": 0.0735605925321579,
|
|
"num_tokens": 353587.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.988022661209106,
|
|
"epoch": 0.01638311279143037,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.443,
|
|
"mean_token_accuracy": 0.07551693692803382,
|
|
"num_tokens": 362997.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.02585473060608,
|
|
"epoch": 0.016803192606595255,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.4821,
|
|
"mean_token_accuracy": 0.07873391062021255,
|
|
"num_tokens": 372346.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.984146022796631,
|
|
"epoch": 0.017223272421760135,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.3473,
|
|
"mean_token_accuracy": 0.07624267861247062,
|
|
"num_tokens": 381575.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.912975454330445,
|
|
"epoch": 0.017643352236925015,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.4236,
|
|
"mean_token_accuracy": 0.0766436841338873,
|
|
"num_tokens": 390706.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.888600492477417,
|
|
"epoch": 0.018063432052089896,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.4209,
|
|
"mean_token_accuracy": 0.0734835498034954,
|
|
"num_tokens": 400000.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.803367996215821,
|
|
"epoch": 0.01848351186725478,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.3774,
|
|
"mean_token_accuracy": 0.08182684779167175,
|
|
"num_tokens": 409447.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.875886058807373,
|
|
"epoch": 0.01890359168241966,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.3393,
|
|
"mean_token_accuracy": 0.08449244052171707,
|
|
"num_tokens": 418417.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.78724856376648,
|
|
"epoch": 0.01932367149758454,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.3048,
|
|
"mean_token_accuracy": 0.08006256446242332,
|
|
"num_tokens": 427619.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.736767053604126,
|
|
"epoch": 0.019743751312749424,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.372,
|
|
"mean_token_accuracy": 0.07579129710793495,
|
|
"num_tokens": 437931.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.841858673095703,
|
|
"epoch": 0.020163831127914304,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.4001,
|
|
"mean_token_accuracy": 0.08351109325885772,
|
|
"num_tokens": 447595.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.7983135223388675,
|
|
"epoch": 0.020583910943079185,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.2633,
|
|
"mean_token_accuracy": 0.07488272562623025,
|
|
"num_tokens": 457062.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.813820743560791,
|
|
"epoch": 0.021003990758244065,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.3567,
|
|
"mean_token_accuracy": 0.07759504988789559,
|
|
"num_tokens": 466191.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.757200431823731,
|
|
"epoch": 0.02142407057340895,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.3146,
|
|
"mean_token_accuracy": 0.08031945005059242,
|
|
"num_tokens": 475693.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.7279805660247805,
|
|
"epoch": 0.02184415038857383,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.3269,
|
|
"mean_token_accuracy": 0.08141026981174945,
|
|
"num_tokens": 485173.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.724671411514282,
|
|
"epoch": 0.02226423020373871,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.2369,
|
|
"mean_token_accuracy": 0.083962532132864,
|
|
"num_tokens": 493985.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.6601485252380375,
|
|
"epoch": 0.022684310018903593,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.2687,
|
|
"mean_token_accuracy": 0.08190520852804184,
|
|
"num_tokens": 502837.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.751116943359375,
|
|
"epoch": 0.023104389834068473,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.2065,
|
|
"mean_token_accuracy": 0.0843705341219902,
|
|
"num_tokens": 511503.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.717013120651245,
|
|
"epoch": 0.023524469649233354,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.4058,
|
|
"mean_token_accuracy": 0.08034609854221345,
|
|
"num_tokens": 521499.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.592406368255615,
|
|
"epoch": 0.023944549464398234,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.166,
|
|
"mean_token_accuracy": 0.08277052193880081,
|
|
"num_tokens": 530067.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.6297852993011475,
|
|
"epoch": 0.024364629279563118,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.1721,
|
|
"mean_token_accuracy": 0.08475914299488067,
|
|
"num_tokens": 538559.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.705462646484375,
|
|
"epoch": 0.024784709094728,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.3653,
|
|
"mean_token_accuracy": 0.07328721843659877,
|
|
"num_tokens": 547288.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.596541261672973,
|
|
"epoch": 0.02520478890989288,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.2357,
|
|
"mean_token_accuracy": 0.07816045507788658,
|
|
"num_tokens": 557269.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.701767444610596,
|
|
"epoch": 0.025624868725057762,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.2628,
|
|
"mean_token_accuracy": 0.07311495915055274,
|
|
"num_tokens": 567280.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.602482271194458,
|
|
"epoch": 0.026044948540222643,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.0908,
|
|
"mean_token_accuracy": 0.08299101889133453,
|
|
"num_tokens": 576609.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.399111747741699,
|
|
"epoch": 0.026465028355387523,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.0032,
|
|
"mean_token_accuracy": 0.09095181971788406,
|
|
"num_tokens": 586053.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.507453203201294,
|
|
"epoch": 0.026885108170552403,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.203,
|
|
"mean_token_accuracy": 0.08823259696364402,
|
|
"num_tokens": 594649.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.599713850021362,
|
|
"epoch": 0.027305187985717287,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.1383,
|
|
"mean_token_accuracy": 0.08195743858814239,
|
|
"num_tokens": 603445.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.587759685516358,
|
|
"epoch": 0.027725267800882167,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.2543,
|
|
"mean_token_accuracy": 0.07800514288246632,
|
|
"num_tokens": 613611.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.745543384552002,
|
|
"epoch": 0.028145347616047048,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.429,
|
|
"mean_token_accuracy": 0.07839688062667846,
|
|
"num_tokens": 623024.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.4431709289550785,
|
|
"epoch": 0.02856542743121193,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.1028,
|
|
"mean_token_accuracy": 0.08672705665230751,
|
|
"num_tokens": 631624.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.574361371994018,
|
|
"epoch": 0.028985507246376812,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.0557,
|
|
"mean_token_accuracy": 0.08923942148685456,
|
|
"num_tokens": 640473.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.541849613189697,
|
|
"epoch": 0.029405587061541692,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.2383,
|
|
"mean_token_accuracy": 0.08173563033342361,
|
|
"num_tokens": 649692.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.571516275405884,
|
|
"epoch": 0.029825666876706573,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.1875,
|
|
"mean_token_accuracy": 0.08110572174191474,
|
|
"num_tokens": 658236.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.34685640335083,
|
|
"epoch": 0.030245746691871456,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.9645,
|
|
"mean_token_accuracy": 0.08569629490375519,
|
|
"num_tokens": 667175.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.556408214569092,
|
|
"epoch": 0.030665826507036337,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.2834,
|
|
"mean_token_accuracy": 0.08148858584463596,
|
|
"num_tokens": 676456.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.606632947921753,
|
|
"epoch": 0.031085906322201217,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.2448,
|
|
"mean_token_accuracy": 0.08052070513367653,
|
|
"num_tokens": 686881.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.371811389923096,
|
|
"epoch": 0.0315059861373661,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.0307,
|
|
"mean_token_accuracy": 0.08108055517077446,
|
|
"num_tokens": 696045.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.382633686065674,
|
|
"epoch": 0.03192606595253098,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.003,
|
|
"mean_token_accuracy": 0.09089459106326103,
|
|
"num_tokens": 704729.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.353933048248291,
|
|
"epoch": 0.032346145767695865,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.0639,
|
|
"mean_token_accuracy": 0.08123919740319252,
|
|
"num_tokens": 714331.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.430750465393066,
|
|
"epoch": 0.03276622558286074,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.0163,
|
|
"mean_token_accuracy": 0.08898987770080566,
|
|
"num_tokens": 722788.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.388132476806641,
|
|
"epoch": 0.033186305398025626,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.0996,
|
|
"mean_token_accuracy": 0.0889863982796669,
|
|
"num_tokens": 731417.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.394377708435059,
|
|
"epoch": 0.03360638521319051,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.0686,
|
|
"mean_token_accuracy": 0.0865507885813713,
|
|
"num_tokens": 741034.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.370957660675049,
|
|
"epoch": 0.034026465028355386,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.063,
|
|
"mean_token_accuracy": 0.08408316597342491,
|
|
"num_tokens": 749596.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.360737991333008,
|
|
"epoch": 0.03444654484352027,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.0166,
|
|
"mean_token_accuracy": 0.08443826884031295,
|
|
"num_tokens": 758931.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.253893661499023,
|
|
"epoch": 0.03486662465868515,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.9221,
|
|
"mean_token_accuracy": 0.08874604031443596,
|
|
"num_tokens": 767534.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.336139726638794,
|
|
"epoch": 0.03528670447385003,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.9742,
|
|
"mean_token_accuracy": 0.08901742175221443,
|
|
"num_tokens": 776456.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.32063570022583,
|
|
"epoch": 0.035706784289014915,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.0512,
|
|
"mean_token_accuracy": 0.0825334556400776,
|
|
"num_tokens": 786172.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.2836973667144775,
|
|
"epoch": 0.03612686410417979,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.9281,
|
|
"mean_token_accuracy": 0.09393875077366828,
|
|
"num_tokens": 795081.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.279390621185303,
|
|
"epoch": 0.036546943919344675,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.9729,
|
|
"mean_token_accuracy": 0.08336275964975357,
|
|
"num_tokens": 804259.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.3233130931854244,
|
|
"epoch": 0.03696702373450956,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.9836,
|
|
"mean_token_accuracy": 0.08346287980675697,
|
|
"num_tokens": 813463.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.265643119812012,
|
|
"epoch": 0.037387103549674436,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.915,
|
|
"mean_token_accuracy": 0.09436434507369995,
|
|
"num_tokens": 823029.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.2830162525177,
|
|
"epoch": 0.03780718336483932,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.9822,
|
|
"mean_token_accuracy": 0.08020757511258125,
|
|
"num_tokens": 832902.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.172808027267456,
|
|
"epoch": 0.0382272631800042,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.9269,
|
|
"mean_token_accuracy": 0.08937018439173698,
|
|
"num_tokens": 842162.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.261403322219849,
|
|
"epoch": 0.03864734299516908,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.9709,
|
|
"mean_token_accuracy": 0.09120814129710197,
|
|
"num_tokens": 852328.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.207744789123535,
|
|
"epoch": 0.039067422810333964,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.9283,
|
|
"mean_token_accuracy": 0.08966456726193428,
|
|
"num_tokens": 860929.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.253277540206909,
|
|
"epoch": 0.03948750262549885,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.0043,
|
|
"mean_token_accuracy": 0.0854820430278778,
|
|
"num_tokens": 869144.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.303921031951904,
|
|
"epoch": 0.039907582440663725,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.9451,
|
|
"mean_token_accuracy": 0.09673570543527603,
|
|
"num_tokens": 877447.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.20126519203186,
|
|
"epoch": 0.04032766225582861,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.9017,
|
|
"mean_token_accuracy": 0.08463463708758354,
|
|
"num_tokens": 887020.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.1618622779846195,
|
|
"epoch": 0.040747742070993485,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.9503,
|
|
"mean_token_accuracy": 0.08903224021196365,
|
|
"num_tokens": 895937.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.172050189971924,
|
|
"epoch": 0.04116782188615837,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.9573,
|
|
"mean_token_accuracy": 0.08436014279723167,
|
|
"num_tokens": 905446.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.1261190414428714,
|
|
"epoch": 0.04158790170132325,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.8507,
|
|
"mean_token_accuracy": 0.09782563373446465,
|
|
"num_tokens": 914547.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.219514274597168,
|
|
"epoch": 0.04200798151648813,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.8597,
|
|
"mean_token_accuracy": 0.09429225027561187,
|
|
"num_tokens": 922900.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.174054384231567,
|
|
"epoch": 0.042428061331653014,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.9026,
|
|
"mean_token_accuracy": 0.09461246877908706,
|
|
"num_tokens": 930876.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.149679851531983,
|
|
"epoch": 0.0428481411468179,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.9327,
|
|
"mean_token_accuracy": 0.09384474828839302,
|
|
"num_tokens": 939871.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.1536510467529295,
|
|
"epoch": 0.043268220961982774,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.9204,
|
|
"mean_token_accuracy": 0.08957441225647926,
|
|
"num_tokens": 948673.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.07887830734253,
|
|
"epoch": 0.04368830077714766,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 6.8686,
|
|
"mean_token_accuracy": 0.08727961704134941,
|
|
"num_tokens": 957603.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.11884388923645,
|
|
"epoch": 0.04410838059231254,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.9378,
|
|
"mean_token_accuracy": 0.08589621968567371,
|
|
"num_tokens": 967731.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.1688611030578615,
|
|
"epoch": 0.04452846040747742,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.9387,
|
|
"mean_token_accuracy": 0.09485394582152366,
|
|
"num_tokens": 977427.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.146421909332275,
|
|
"epoch": 0.0449485402226423,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.9243,
|
|
"mean_token_accuracy": 0.08625848963856697,
|
|
"num_tokens": 986758.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.25874433517456,
|
|
"epoch": 0.045368620037807186,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.92,
|
|
"mean_token_accuracy": 0.09832347258925438,
|
|
"num_tokens": 996377.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.057836389541626,
|
|
"epoch": 0.04578869985297206,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.9742,
|
|
"mean_token_accuracy": 0.08528567403554917,
|
|
"num_tokens": 1006483.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 6.995539855957031,
|
|
"epoch": 0.04620877966813695,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.8574,
|
|
"mean_token_accuracy": 0.08858747258782387,
|
|
"num_tokens": 1016132.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.106180238723755,
|
|
"epoch": 0.04662885948330183,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.7984,
|
|
"mean_token_accuracy": 0.09407598823308945,
|
|
"num_tokens": 1024970.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.142482328414917,
|
|
"epoch": 0.04704893929846671,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.8936,
|
|
"mean_token_accuracy": 0.08978619575500488,
|
|
"num_tokens": 1034335.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.139913558959961,
|
|
"epoch": 0.04746901911363159,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.9495,
|
|
"mean_token_accuracy": 0.0973325490951538,
|
|
"num_tokens": 1043954.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.08342981338501,
|
|
"epoch": 0.04788909892879647,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.8806,
|
|
"mean_token_accuracy": 0.09276892617344856,
|
|
"num_tokens": 1053554.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.0591119766235355,
|
|
"epoch": 0.04830917874396135,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.8354,
|
|
"mean_token_accuracy": 0.09314879402518272,
|
|
"num_tokens": 1062008.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.029165410995484,
|
|
"epoch": 0.048729258559126236,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.9074,
|
|
"mean_token_accuracy": 0.09056607261300087,
|
|
"num_tokens": 1070740.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.027670526504517,
|
|
"epoch": 0.04914933837429111,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.8895,
|
|
"mean_token_accuracy": 0.09351922869682312,
|
|
"num_tokens": 1079681.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.076567363739014,
|
|
"epoch": 0.049569418189456,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.7669,
|
|
"mean_token_accuracy": 0.0963557355105877,
|
|
"num_tokens": 1088979.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 6.955168056488037,
|
|
"epoch": 0.04998949800462088,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.7794,
|
|
"mean_token_accuracy": 0.09716788977384568,
|
|
"num_tokens": 1097870.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.0498795986175535,
|
|
"epoch": 0.05040957781978576,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.8985,
|
|
"mean_token_accuracy": 0.08934849128127098,
|
|
"num_tokens": 1107948.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.038954401016236,
|
|
"epoch": 0.05082965763495064,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.8034,
|
|
"mean_token_accuracy": 0.09711324200034141,
|
|
"num_tokens": 1117032.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.016556072235107,
|
|
"epoch": 0.051249737450115525,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.7736,
|
|
"mean_token_accuracy": 0.10140406414866447,
|
|
"num_tokens": 1127834.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.053543567657471,
|
|
"epoch": 0.0516698172652804,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.8664,
|
|
"mean_token_accuracy": 0.10583841800689697,
|
|
"num_tokens": 1137382.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 6.960672283172608,
|
|
"epoch": 0.052089897080445285,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.7295,
|
|
"mean_token_accuracy": 0.09906250685453415,
|
|
"num_tokens": 1146095.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 6.916978216171264,
|
|
"epoch": 0.05250997689561017,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.7648,
|
|
"mean_token_accuracy": 0.1004838652908802,
|
|
"num_tokens": 1154981.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 6.948708629608154,
|
|
"epoch": 0.052930056710775046,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.7765,
|
|
"mean_token_accuracy": 0.10312124192714692,
|
|
"num_tokens": 1164939.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.024917793273926,
|
|
"epoch": 0.05335013652593993,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.8939,
|
|
"mean_token_accuracy": 0.09090543612837791,
|
|
"num_tokens": 1174991.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.0208131790161135,
|
|
"epoch": 0.05377021634110481,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.9459,
|
|
"mean_token_accuracy": 0.08811391443014145,
|
|
"num_tokens": 1184885.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.984617424011231,
|
|
"epoch": 0.05419029615626969,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.8348,
|
|
"mean_token_accuracy": 0.09274234399199485,
|
|
"num_tokens": 1193637.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 6.901879405975341,
|
|
"epoch": 0.054610375971434574,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.6237,
|
|
"mean_token_accuracy": 0.10028594210743905,
|
|
"num_tokens": 1202188.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 6.964693355560303,
|
|
"epoch": 0.05503045578659945,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.7513,
|
|
"mean_token_accuracy": 0.09297072812914849,
|
|
"num_tokens": 1210768.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 6.921257066726684,
|
|
"epoch": 0.055450535601764335,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.7581,
|
|
"mean_token_accuracy": 0.09513410851359368,
|
|
"num_tokens": 1219819.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 6.969961500167846,
|
|
"epoch": 0.05587061541692922,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.8151,
|
|
"mean_token_accuracy": 0.08720013573765754,
|
|
"num_tokens": 1229703.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.008356428146362,
|
|
"epoch": 0.056290695232094096,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.8385,
|
|
"mean_token_accuracy": 0.09394309446215629,
|
|
"num_tokens": 1238942.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.041683959960937,
|
|
"epoch": 0.05671077504725898,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.8901,
|
|
"mean_token_accuracy": 0.0907767005264759,
|
|
"num_tokens": 1248943.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 6.869440269470215,
|
|
"epoch": 0.05713085486242386,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.7728,
|
|
"mean_token_accuracy": 0.09719423428177834,
|
|
"num_tokens": 1257761.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 6.80675859451294,
|
|
"epoch": 0.05755093467758874,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.722,
|
|
"mean_token_accuracy": 0.09433782026171685,
|
|
"num_tokens": 1267216.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 6.962690448760986,
|
|
"epoch": 0.057971014492753624,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.8182,
|
|
"mean_token_accuracy": 0.09524153247475624,
|
|
"num_tokens": 1277210.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 6.910012054443359,
|
|
"epoch": 0.05839109430791851,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.7268,
|
|
"mean_token_accuracy": 0.09480128362774849,
|
|
"num_tokens": 1285310.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.9359142780303955,
|
|
"epoch": 0.058811174123083385,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.7418,
|
|
"mean_token_accuracy": 0.09830545634031296,
|
|
"num_tokens": 1294421.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 6.773298215866089,
|
|
"epoch": 0.05923125393824827,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.5648,
|
|
"mean_token_accuracy": 0.10509093776345253,
|
|
"num_tokens": 1303281.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.848818397521972,
|
|
"epoch": 0.059651333753413145,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.7413,
|
|
"mean_token_accuracy": 0.10247144997119903,
|
|
"num_tokens": 1312280.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.792526483535767,
|
|
"epoch": 0.06007141356857803,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.703,
|
|
"mean_token_accuracy": 0.09476525709033012,
|
|
"num_tokens": 1321243.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.8667539119720455,
|
|
"epoch": 0.06049149338374291,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.8092,
|
|
"mean_token_accuracy": 0.10024766996502876,
|
|
"num_tokens": 1330324.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.874475002288818,
|
|
"epoch": 0.06091157319890779,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.6476,
|
|
"mean_token_accuracy": 0.10230677276849746,
|
|
"num_tokens": 1339485.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.930787801742554,
|
|
"epoch": 0.06133165301407267,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.8065,
|
|
"mean_token_accuracy": 0.09302590638399125,
|
|
"num_tokens": 1348640.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 6.799437236785889,
|
|
"epoch": 0.06175173282923756,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.6978,
|
|
"mean_token_accuracy": 0.09949951842427254,
|
|
"num_tokens": 1357581.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.888378238677978,
|
|
"epoch": 0.062171812644402434,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.7652,
|
|
"mean_token_accuracy": 0.09876005351543427,
|
|
"num_tokens": 1367883.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 6.812366771697998,
|
|
"epoch": 0.06259189245956731,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.7175,
|
|
"mean_token_accuracy": 0.09678780436515808,
|
|
"num_tokens": 1376936.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.708990812301636,
|
|
"epoch": 0.0630119722747322,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.6402,
|
|
"mean_token_accuracy": 0.09989499375224113,
|
|
"num_tokens": 1386359.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.86722469329834,
|
|
"epoch": 0.06343205208989708,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.6965,
|
|
"mean_token_accuracy": 0.10066593587398528,
|
|
"num_tokens": 1395223.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.944450616836548,
|
|
"epoch": 0.06385213190506196,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.847,
|
|
"mean_token_accuracy": 0.09334802627563477,
|
|
"num_tokens": 1404917.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.823553276062012,
|
|
"epoch": 0.06427221172022685,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.7474,
|
|
"mean_token_accuracy": 0.10658529698848725,
|
|
"num_tokens": 1413348.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.7500804424285885,
|
|
"epoch": 0.06469229153539173,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.7193,
|
|
"mean_token_accuracy": 0.09804128184914589,
|
|
"num_tokens": 1421726.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.822430419921875,
|
|
"epoch": 0.0651123713505566,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.7314,
|
|
"mean_token_accuracy": 0.09830505326390267,
|
|
"num_tokens": 1430686.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.889693403244019,
|
|
"epoch": 0.06553245116572148,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.7193,
|
|
"mean_token_accuracy": 0.1001870684325695,
|
|
"num_tokens": 1439499.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.836849641799927,
|
|
"epoch": 0.06595253098088637,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.7144,
|
|
"mean_token_accuracy": 0.10016432479023933,
|
|
"num_tokens": 1448220.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.703166866302491,
|
|
"epoch": 0.06637261079605125,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.7252,
|
|
"mean_token_accuracy": 0.09049011170864105,
|
|
"num_tokens": 1458217.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.805354738235474,
|
|
"epoch": 0.06679269061121614,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.6229,
|
|
"mean_token_accuracy": 0.0928824745118618,
|
|
"num_tokens": 1467422.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.788901376724243,
|
|
"epoch": 0.06721277042638102,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.6204,
|
|
"mean_token_accuracy": 0.10320913046598434,
|
|
"num_tokens": 1476152.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.731419372558594,
|
|
"epoch": 0.06763285024154589,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.7128,
|
|
"mean_token_accuracy": 0.09539571255445481,
|
|
"num_tokens": 1485248.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.7255181789398195,
|
|
"epoch": 0.06805293005671077,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.6711,
|
|
"mean_token_accuracy": 0.09965705946087837,
|
|
"num_tokens": 1494248.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.825131368637085,
|
|
"epoch": 0.06847300987187566,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.785,
|
|
"mean_token_accuracy": 0.09547284319996834,
|
|
"num_tokens": 1503565.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.932170867919922,
|
|
"epoch": 0.06889308968704054,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.8605,
|
|
"mean_token_accuracy": 0.09502148702740669,
|
|
"num_tokens": 1513227.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.8283134460449215,
|
|
"epoch": 0.06931316950220542,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.6616,
|
|
"mean_token_accuracy": 0.1039304107427597,
|
|
"num_tokens": 1522312.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.6956737518310545,
|
|
"epoch": 0.0697332493173703,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.5989,
|
|
"mean_token_accuracy": 0.10552669763565063,
|
|
"num_tokens": 1531720.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.70291919708252,
|
|
"epoch": 0.07015332913253518,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.7026,
|
|
"mean_token_accuracy": 0.09495449438691139,
|
|
"num_tokens": 1541238.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.867031812667847,
|
|
"epoch": 0.07057340894770006,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.7955,
|
|
"mean_token_accuracy": 0.09560235142707825,
|
|
"num_tokens": 1550875.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.679243516921997,
|
|
"epoch": 0.07099348876286495,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.7373,
|
|
"mean_token_accuracy": 0.10205229669809342,
|
|
"num_tokens": 1560287.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.812178373336792,
|
|
"epoch": 0.07141356857802983,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.6139,
|
|
"mean_token_accuracy": 0.10624400898814201,
|
|
"num_tokens": 1569043.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.66694450378418,
|
|
"epoch": 0.07183364839319471,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.6372,
|
|
"mean_token_accuracy": 0.10226837545633316,
|
|
"num_tokens": 1578112.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.592900228500366,
|
|
"epoch": 0.07225372820835958,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.5542,
|
|
"mean_token_accuracy": 0.10482543483376502,
|
|
"num_tokens": 1586587.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.831333017349243,
|
|
"epoch": 0.07267380802352447,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.7191,
|
|
"mean_token_accuracy": 0.0988001950085163,
|
|
"num_tokens": 1595585.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.7406104564666744,
|
|
"epoch": 0.07309388783868935,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.6715,
|
|
"mean_token_accuracy": 0.1029144361615181,
|
|
"num_tokens": 1605355.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.673774909973145,
|
|
"epoch": 0.07351396765385423,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.7087,
|
|
"mean_token_accuracy": 0.0972638413310051,
|
|
"num_tokens": 1613637.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.780192899703979,
|
|
"epoch": 0.07393404746901912,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.6547,
|
|
"mean_token_accuracy": 0.10374342575669289,
|
|
"num_tokens": 1622731.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.733386611938476,
|
|
"epoch": 0.074354127284184,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.6411,
|
|
"mean_token_accuracy": 0.09785914570093154,
|
|
"num_tokens": 1632098.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.656809377670288,
|
|
"epoch": 0.07477420709934887,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.6333,
|
|
"mean_token_accuracy": 0.09908856153488159,
|
|
"num_tokens": 1641259.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.787235689163208,
|
|
"epoch": 0.07519428691451376,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.7023,
|
|
"mean_token_accuracy": 0.09753435328602791,
|
|
"num_tokens": 1651362.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.644986867904663,
|
|
"epoch": 0.07561436672967864,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.6169,
|
|
"mean_token_accuracy": 0.09910911172628403,
|
|
"num_tokens": 1660190.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.722699403762817,
|
|
"epoch": 0.07603444654484352,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.659,
|
|
"mean_token_accuracy": 0.09519267976284027,
|
|
"num_tokens": 1669020.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.747388315200806,
|
|
"epoch": 0.0764545263600084,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.6775,
|
|
"mean_token_accuracy": 0.10076266825199127,
|
|
"num_tokens": 1678158.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.702866649627685,
|
|
"epoch": 0.07687460617517328,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.6868,
|
|
"mean_token_accuracy": 0.09906790256500245,
|
|
"num_tokens": 1687481.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.647071504592896,
|
|
"epoch": 0.07729468599033816,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.6511,
|
|
"mean_token_accuracy": 0.10402323752641678,
|
|
"num_tokens": 1696782.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.6832818508148195,
|
|
"epoch": 0.07771476580550304,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.6575,
|
|
"mean_token_accuracy": 0.10666462555527687,
|
|
"num_tokens": 1706153.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.698217678070068,
|
|
"epoch": 0.07813484562066793,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.6895,
|
|
"mean_token_accuracy": 0.10017500966787338,
|
|
"num_tokens": 1715585.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.823991441726685,
|
|
"epoch": 0.07855492543583281,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.8005,
|
|
"mean_token_accuracy": 0.09734346494078636,
|
|
"num_tokens": 1724857.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.700028705596924,
|
|
"epoch": 0.0789750052509977,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.6103,
|
|
"mean_token_accuracy": 0.10624456107616424,
|
|
"num_tokens": 1733528.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.742655563354492,
|
|
"epoch": 0.07939508506616257,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.7304,
|
|
"mean_token_accuracy": 0.10352228581905365,
|
|
"num_tokens": 1742953.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.669600582122802,
|
|
"epoch": 0.07981516488132745,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.6746,
|
|
"mean_token_accuracy": 0.10271603912115097,
|
|
"num_tokens": 1752155.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.660818243026734,
|
|
"epoch": 0.08023524469649233,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.5695,
|
|
"mean_token_accuracy": 0.10144439786672592,
|
|
"num_tokens": 1760562.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.623502588272094,
|
|
"epoch": 0.08065532451165722,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.5902,
|
|
"mean_token_accuracy": 0.1015326887369156,
|
|
"num_tokens": 1769631.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.647875261306763,
|
|
"epoch": 0.0810754043268221,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.624,
|
|
"mean_token_accuracy": 0.10202456414699554,
|
|
"num_tokens": 1779080.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.654635858535767,
|
|
"epoch": 0.08149548414198697,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.6146,
|
|
"mean_token_accuracy": 0.10121759623289109,
|
|
"num_tokens": 1787830.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.546731615066529,
|
|
"epoch": 0.08191556395715185,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.5331,
|
|
"mean_token_accuracy": 0.10186785906553268,
|
|
"num_tokens": 1796998.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.6796527862548825,
|
|
"epoch": 0.08233564377231674,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.619,
|
|
"mean_token_accuracy": 0.10591355115175247,
|
|
"num_tokens": 1806194.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.40926570892334,
|
|
"epoch": 0.08275572358748162,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.514,
|
|
"mean_token_accuracy": 0.10517977550625801,
|
|
"num_tokens": 1815751.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.57440676689148,
|
|
"epoch": 0.0831758034026465,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.5942,
|
|
"mean_token_accuracy": 0.10343918055295945,
|
|
"num_tokens": 1825379.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.637695789337158,
|
|
"epoch": 0.08359588321781139,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.5522,
|
|
"mean_token_accuracy": 0.10346684157848358,
|
|
"num_tokens": 1834158.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.537919807434082,
|
|
"epoch": 0.08401596303297626,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.5098,
|
|
"mean_token_accuracy": 0.10425886288285255,
|
|
"num_tokens": 1842724.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.62498288154602,
|
|
"epoch": 0.08443604284814114,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499999998724557,
|
|
"loss": 6.5288,
|
|
"mean_token_accuracy": 0.10198150128126145,
|
|
"num_tokens": 1852485.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.57701358795166,
|
|
"epoch": 0.08485612266330603,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999999935430703,
|
|
"loss": 6.5545,
|
|
"mean_token_accuracy": 0.11041983366012573,
|
|
"num_tokens": 1861303.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.423639154434204,
|
|
"epoch": 0.08527620247847091,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999999843758243,
|
|
"loss": 6.5428,
|
|
"mean_token_accuracy": 0.11022127270698548,
|
|
"num_tokens": 1870859.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 6.760848808288574,
|
|
"epoch": 0.0856962822936358,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999999712228196,
|
|
"loss": 6.7105,
|
|
"mean_token_accuracy": 0.09618140533566474,
|
|
"num_tokens": 1880295.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.645368003845215,
|
|
"epoch": 0.08611636210880068,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999999540840562,
|
|
"loss": 6.6079,
|
|
"mean_token_accuracy": 0.1056639552116394,
|
|
"num_tokens": 1889193.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.568785905838013,
|
|
"epoch": 0.08653644192396555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999999329595345,
|
|
"loss": 6.7096,
|
|
"mean_token_accuracy": 0.09398577436804771,
|
|
"num_tokens": 1899437.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.708119821548462,
|
|
"epoch": 0.08695652173913043,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999999078492548,
|
|
"loss": 6.5939,
|
|
"mean_token_accuracy": 0.1046712227165699,
|
|
"num_tokens": 1907882.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.493611288070679,
|
|
"epoch": 0.08737660155429532,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999998787532176,
|
|
"loss": 6.5021,
|
|
"mean_token_accuracy": 0.10290396809577942,
|
|
"num_tokens": 1916872.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.608988046646118,
|
|
"epoch": 0.0877966813694602,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999998456714234,
|
|
"loss": 6.675,
|
|
"mean_token_accuracy": 0.10352342054247857,
|
|
"num_tokens": 1926636.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.586896228790283,
|
|
"epoch": 0.08821676118462508,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999998086038729,
|
|
"loss": 6.5742,
|
|
"mean_token_accuracy": 0.10714709535241126,
|
|
"num_tokens": 1935962.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.579021549224853,
|
|
"epoch": 0.08863684099978995,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999997675505665,
|
|
"loss": 6.5514,
|
|
"mean_token_accuracy": 0.10487730801105499,
|
|
"num_tokens": 1944600.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.625632095336914,
|
|
"epoch": 0.08905692081495484,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999997225115052,
|
|
"loss": 6.7269,
|
|
"mean_token_accuracy": 0.10071012005209923,
|
|
"num_tokens": 1954234.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.7796577453613285,
|
|
"epoch": 0.08947700063011972,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999996734866896,
|
|
"loss": 6.683,
|
|
"mean_token_accuracy": 0.09888390973210334,
|
|
"num_tokens": 1964499.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.377533006668091,
|
|
"epoch": 0.0898970804452846,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999996204761206,
|
|
"loss": 6.3832,
|
|
"mean_token_accuracy": 0.11216704472899437,
|
|
"num_tokens": 1973635.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 6.54502387046814,
|
|
"epoch": 0.09031716026044949,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999995634797993,
|
|
"loss": 6.5308,
|
|
"mean_token_accuracy": 0.11021102443337441,
|
|
"num_tokens": 1983509.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.567485332489014,
|
|
"epoch": 0.09073724007561437,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999995024977265,
|
|
"loss": 6.5197,
|
|
"mean_token_accuracy": 0.11247633025050163,
|
|
"num_tokens": 1992336.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.545616102218628,
|
|
"epoch": 0.09115731989077924,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999994375299034,
|
|
"loss": 6.5532,
|
|
"mean_token_accuracy": 0.10819393768906593,
|
|
"num_tokens": 2001931.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.484406518936157,
|
|
"epoch": 0.09157739970594413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499999368576331,
|
|
"loss": 6.4218,
|
|
"mean_token_accuracy": 0.11132358983159066,
|
|
"num_tokens": 2010935.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.49219536781311,
|
|
"epoch": 0.09199747952110901,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999992956370109,
|
|
"loss": 6.4842,
|
|
"mean_token_accuracy": 0.10731736794114113,
|
|
"num_tokens": 2020587.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.410812473297119,
|
|
"epoch": 0.0924175593362739,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499999218711944,
|
|
"loss": 6.5089,
|
|
"mean_token_accuracy": 0.11067400127649307,
|
|
"num_tokens": 2029743.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.581059837341309,
|
|
"epoch": 0.09283763915143878,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999991378011317,
|
|
"loss": 6.5257,
|
|
"mean_token_accuracy": 0.10916591510176658,
|
|
"num_tokens": 2038468.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.456353855133057,
|
|
"epoch": 0.09325771896660366,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999990529045757,
|
|
"loss": 6.4482,
|
|
"mean_token_accuracy": 0.10893432199954986,
|
|
"num_tokens": 2047456.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.627411127090454,
|
|
"epoch": 0.09367779878176853,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999989640222771,
|
|
"loss": 6.7525,
|
|
"mean_token_accuracy": 0.09431043416261672,
|
|
"num_tokens": 2056691.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.684362411499023,
|
|
"epoch": 0.09409787859693342,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499998871154238,
|
|
"loss": 6.5462,
|
|
"mean_token_accuracy": 0.10591837242245675,
|
|
"num_tokens": 2066068.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.578407287597656,
|
|
"epoch": 0.0945179584120983,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999987743004597,
|
|
"loss": 6.4733,
|
|
"mean_token_accuracy": 0.1102992869913578,
|
|
"num_tokens": 2075113.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.506056404113769,
|
|
"epoch": 0.09493803822726318,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999986734609438,
|
|
"loss": 6.6105,
|
|
"mean_token_accuracy": 0.10494827926158905,
|
|
"num_tokens": 2084557.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.6157310009002686,
|
|
"epoch": 0.09535811804242807,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999985686356923,
|
|
"loss": 6.5139,
|
|
"mean_token_accuracy": 0.1062320664525032,
|
|
"num_tokens": 2093424.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.539625740051269,
|
|
"epoch": 0.09577819785759294,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499998459824707,
|
|
"loss": 6.6346,
|
|
"mean_token_accuracy": 0.10304314494132996,
|
|
"num_tokens": 2103066.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.53157410621643,
|
|
"epoch": 0.09619827767275782,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00049999834702799,
|
|
"loss": 6.5013,
|
|
"mean_token_accuracy": 0.10883507803082466,
|
|
"num_tokens": 2112447.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.507535743713379,
|
|
"epoch": 0.0966183574879227,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999982302455431,
|
|
"loss": 6.5269,
|
|
"mean_token_accuracy": 0.11191204637289047,
|
|
"num_tokens": 2121949.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.507864904403687,
|
|
"epoch": 0.09703843730308759,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999981094773683,
|
|
"loss": 6.4328,
|
|
"mean_token_accuracy": 0.11216317638754844,
|
|
"num_tokens": 2130464.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.520567464828491,
|
|
"epoch": 0.09745851711825247,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499997984723468,
|
|
"loss": 6.5942,
|
|
"mean_token_accuracy": 0.10294081419706344,
|
|
"num_tokens": 2139577.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.288797092437744,
|
|
"epoch": 0.09787859693341736,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999978559838441,
|
|
"loss": 6.3204,
|
|
"mean_token_accuracy": 0.11208199337124825,
|
|
"num_tokens": 2147919.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.472030353546143,
|
|
"epoch": 0.09829867674858223,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999977232584991,
|
|
"loss": 6.4949,
|
|
"mean_token_accuracy": 0.10832359045743942,
|
|
"num_tokens": 2156936.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.558899450302124,
|
|
"epoch": 0.09871875656374711,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999975865474354,
|
|
"loss": 6.5512,
|
|
"mean_token_accuracy": 0.10766256302595138,
|
|
"num_tokens": 2165362.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.469175338745117,
|
|
"epoch": 0.099138836378912,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999974458506551,
|
|
"loss": 6.4643,
|
|
"mean_token_accuracy": 0.10836688205599784,
|
|
"num_tokens": 2173665.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.551422071456909,
|
|
"epoch": 0.09955891619407688,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499997301168161,
|
|
"loss": 6.4532,
|
|
"mean_token_accuracy": 0.11138271391391755,
|
|
"num_tokens": 2182222.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.531885147094727,
|
|
"epoch": 0.09997899600924176,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999971524999556,
|
|
"loss": 6.5228,
|
|
"mean_token_accuracy": 0.11111016869544983,
|
|
"num_tokens": 2192358.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.534890985488891,
|
|
"epoch": 0.10039907582440663,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999969998460414,
|
|
"loss": 6.5355,
|
|
"mean_token_accuracy": 0.10454710125923157,
|
|
"num_tokens": 2201889.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.433488464355468,
|
|
"epoch": 0.10081915563957151,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004999968432064213,
|
|
"loss": 6.5322,
|
|
"mean_token_accuracy": 0.1198379322886467,
|
|
"num_tokens": 2211810.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.474250078201294,
|
|
"epoch": 0.1012392354547364,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999966825810979,
|
|
"loss": 6.4684,
|
|
"mean_token_accuracy": 0.10700508952140808,
|
|
"num_tokens": 2221123.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.384520959854126,
|
|
"epoch": 0.10165931526990128,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999965179700742,
|
|
"loss": 6.3986,
|
|
"mean_token_accuracy": 0.11781087368726731,
|
|
"num_tokens": 2230129.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.4176534652709964,
|
|
"epoch": 0.10207939508506617,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499996349373353,
|
|
"loss": 6.4609,
|
|
"mean_token_accuracy": 0.10817519575357437,
|
|
"num_tokens": 2239929.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.5110820770263675,
|
|
"epoch": 0.10249947490023105,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999961767909374,
|
|
"loss": 6.4372,
|
|
"mean_token_accuracy": 0.1148509480059147,
|
|
"num_tokens": 2248078.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.4125104427337645,
|
|
"epoch": 0.10291955471539592,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999960002228303,
|
|
"loss": 6.5274,
|
|
"mean_token_accuracy": 0.10999985039234161,
|
|
"num_tokens": 2256975.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.474673461914063,
|
|
"epoch": 0.1033396345305608,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999958196690349,
|
|
"loss": 6.3849,
|
|
"mean_token_accuracy": 0.11320202201604843,
|
|
"num_tokens": 2265797.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.479385900497436,
|
|
"epoch": 0.10375971434572569,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999956351295545,
|
|
"loss": 6.4946,
|
|
"mean_token_accuracy": 0.11450825035572051,
|
|
"num_tokens": 2274099.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.3540520668029785,
|
|
"epoch": 0.10417979416089057,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999954466043922,
|
|
"loss": 6.3917,
|
|
"mean_token_accuracy": 0.11258968263864517,
|
|
"num_tokens": 2282360.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.481705999374389,
|
|
"epoch": 0.10459987397605545,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999952540935514,
|
|
"loss": 6.5009,
|
|
"mean_token_accuracy": 0.10285271480679511,
|
|
"num_tokens": 2292714.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.455303287506103,
|
|
"epoch": 0.10501995379122034,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999950575970356,
|
|
"loss": 6.426,
|
|
"mean_token_accuracy": 0.11442826837301254,
|
|
"num_tokens": 2301633.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.465747499465943,
|
|
"epoch": 0.10544003360638521,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999948571148482,
|
|
"loss": 6.4138,
|
|
"mean_token_accuracy": 0.11426257789134979,
|
|
"num_tokens": 2310067.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.466140460968018,
|
|
"epoch": 0.10586011342155009,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999946526469927,
|
|
"loss": 6.4932,
|
|
"mean_token_accuracy": 0.11244904398918151,
|
|
"num_tokens": 2320090.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.438083505630493,
|
|
"epoch": 0.10628019323671498,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999944441934728,
|
|
"loss": 6.4509,
|
|
"mean_token_accuracy": 0.11593573912978172,
|
|
"num_tokens": 2329255.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.467304992675781,
|
|
"epoch": 0.10670027305187986,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999942317542922,
|
|
"loss": 6.5481,
|
|
"mean_token_accuracy": 0.10965899974107743,
|
|
"num_tokens": 2339535.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.434674501419067,
|
|
"epoch": 0.10712035286704474,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999940153294546,
|
|
"loss": 6.4448,
|
|
"mean_token_accuracy": 0.11061845496296882,
|
|
"num_tokens": 2348948.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.447847843170166,
|
|
"epoch": 0.10754043268220961,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499993794918964,
|
|
"loss": 6.4628,
|
|
"mean_token_accuracy": 0.10641181394457817,
|
|
"num_tokens": 2359141.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.401166343688965,
|
|
"epoch": 0.1079605124973745,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999935705228241,
|
|
"loss": 6.5084,
|
|
"mean_token_accuracy": 0.1094856470823288,
|
|
"num_tokens": 2368906.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.554097080230713,
|
|
"epoch": 0.10838059231253938,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999933421410389,
|
|
"loss": 6.4839,
|
|
"mean_token_accuracy": 0.11065066531300545,
|
|
"num_tokens": 2377029.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.5027672290802006,
|
|
"epoch": 0.10880067212770426,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999931097736125,
|
|
"loss": 6.5541,
|
|
"mean_token_accuracy": 0.10604767650365829,
|
|
"num_tokens": 2387088.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.470385646820068,
|
|
"epoch": 0.10922075194286915,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999928734205492,
|
|
"loss": 6.4468,
|
|
"mean_token_accuracy": 0.11056585833430291,
|
|
"num_tokens": 2395596.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.403819370269775,
|
|
"epoch": 0.10964083175803403,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999926330818528,
|
|
"loss": 6.4393,
|
|
"mean_token_accuracy": 0.11377019882202148,
|
|
"num_tokens": 2404506.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.469174242019653,
|
|
"epoch": 0.1100609115731989,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999923887575278,
|
|
"loss": 6.4777,
|
|
"mean_token_accuracy": 0.11094499379396439,
|
|
"num_tokens": 2414342.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.476234006881714,
|
|
"epoch": 0.11048099138836379,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999921404475785,
|
|
"loss": 6.4422,
|
|
"mean_token_accuracy": 0.11336205825209618,
|
|
"num_tokens": 2423076.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.415568065643311,
|
|
"epoch": 0.11090107120352867,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999918881520093,
|
|
"loss": 6.391,
|
|
"mean_token_accuracy": 0.11621783077716827,
|
|
"num_tokens": 2432492.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.362053871154785,
|
|
"epoch": 0.11132115101869355,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999916318708246,
|
|
"loss": 6.354,
|
|
"mean_token_accuracy": 0.11400164812803268,
|
|
"num_tokens": 2441916.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.406490755081177,
|
|
"epoch": 0.11174123083385844,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999913716040291,
|
|
"loss": 6.4072,
|
|
"mean_token_accuracy": 0.11762610748410225,
|
|
"num_tokens": 2450932.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.336502504348755,
|
|
"epoch": 0.11216131064902331,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999911073516272,
|
|
"loss": 6.4319,
|
|
"mean_token_accuracy": 0.11254018545150757,
|
|
"num_tokens": 2460058.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.392711496353149,
|
|
"epoch": 0.11258139046418819,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999908391136237,
|
|
"loss": 6.3569,
|
|
"mean_token_accuracy": 0.11563631743192673,
|
|
"num_tokens": 2469607.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.441662883758545,
|
|
"epoch": 0.11300147027935308,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999905668900234,
|
|
"loss": 6.4002,
|
|
"mean_token_accuracy": 0.11395884156227112,
|
|
"num_tokens": 2478345.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.438292360305786,
|
|
"epoch": 0.11342155009451796,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499990290680831,
|
|
"loss": 6.3261,
|
|
"mean_token_accuracy": 0.11877992302179337,
|
|
"num_tokens": 2486662.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.379430055618286,
|
|
"epoch": 0.11384162990968284,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999900104860516,
|
|
"loss": 6.472,
|
|
"mean_token_accuracy": 0.11443257331848145,
|
|
"num_tokens": 2495392.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.437303638458252,
|
|
"epoch": 0.11426170972484773,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999897263056898,
|
|
"loss": 6.4969,
|
|
"mean_token_accuracy": 0.10801200717687606,
|
|
"num_tokens": 2505254.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.457095766067505,
|
|
"epoch": 0.1146817895400126,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499989438139751,
|
|
"loss": 6.3155,
|
|
"mean_token_accuracy": 0.11900854557752609,
|
|
"num_tokens": 2514096.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.339952230453491,
|
|
"epoch": 0.11510186935517748,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999891459882401,
|
|
"loss": 6.3262,
|
|
"mean_token_accuracy": 0.1178194098174572,
|
|
"num_tokens": 2523635.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.318808507919312,
|
|
"epoch": 0.11552194917034236,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999888498511624,
|
|
"loss": 6.3954,
|
|
"mean_token_accuracy": 0.11501155719161034,
|
|
"num_tokens": 2532528.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.366592121124268,
|
|
"epoch": 0.11594202898550725,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999885497285229,
|
|
"loss": 6.307,
|
|
"mean_token_accuracy": 0.11583952903747559,
|
|
"num_tokens": 2541893.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.354608488082886,
|
|
"epoch": 0.11636210880067213,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999882456203273,
|
|
"loss": 6.3581,
|
|
"mean_token_accuracy": 0.11632645949721336,
|
|
"num_tokens": 2551551.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.349077987670898,
|
|
"epoch": 0.11678218861583702,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999879375265806,
|
|
"loss": 6.3146,
|
|
"mean_token_accuracy": 0.1158558964729309,
|
|
"num_tokens": 2560183.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.344199848175049,
|
|
"epoch": 0.11720226843100189,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999876254472886,
|
|
"loss": 6.1959,
|
|
"mean_token_accuracy": 0.12459081262350083,
|
|
"num_tokens": 2568697.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.348653078079224,
|
|
"epoch": 0.11762234824616677,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999873093824565,
|
|
"loss": 6.4194,
|
|
"mean_token_accuracy": 0.11410524025559425,
|
|
"num_tokens": 2578151.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.50674262046814,
|
|
"epoch": 0.11804242806133165,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999869893320902,
|
|
"loss": 6.5289,
|
|
"mean_token_accuracy": 0.1147321492433548,
|
|
"num_tokens": 2585901.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.338491153717041,
|
|
"epoch": 0.11846250787649654,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999866652961952,
|
|
"loss": 6.3629,
|
|
"mean_token_accuracy": 0.11298267319798469,
|
|
"num_tokens": 2595655.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.389230489730835,
|
|
"epoch": 0.11888258769166142,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999863372747773,
|
|
"loss": 6.3335,
|
|
"mean_token_accuracy": 0.11225836053490638,
|
|
"num_tokens": 2604949.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.439256811141968,
|
|
"epoch": 0.11930266750682629,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999860052678423,
|
|
"loss": 6.3989,
|
|
"mean_token_accuracy": 0.11546840667724609,
|
|
"num_tokens": 2614260.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.299542999267578,
|
|
"epoch": 0.11972274732199117,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999856692753959,
|
|
"loss": 6.3905,
|
|
"mean_token_accuracy": 0.11243033632636071,
|
|
"num_tokens": 2623740.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.37091474533081,
|
|
"epoch": 0.12014282713715606,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999853292974444,
|
|
"loss": 6.2964,
|
|
"mean_token_accuracy": 0.1178373210132122,
|
|
"num_tokens": 2631998.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.372178649902343,
|
|
"epoch": 0.12056290695232094,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999849853339936,
|
|
"loss": 6.4358,
|
|
"mean_token_accuracy": 0.11526904925704003,
|
|
"num_tokens": 2641169.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.44800329208374,
|
|
"epoch": 0.12098298676748583,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004999846373850497,
|
|
"loss": 6.2945,
|
|
"mean_token_accuracy": 0.11855239495635032,
|
|
"num_tokens": 2650576.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.257949161529541,
|
|
"epoch": 0.12140306658265071,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999842854506186,
|
|
"loss": 6.3807,
|
|
"mean_token_accuracy": 0.11334980726242065,
|
|
"num_tokens": 2660817.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.38723406791687,
|
|
"epoch": 0.12182314639781558,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999839295307069,
|
|
"loss": 6.3212,
|
|
"mean_token_accuracy": 0.11455826535820961,
|
|
"num_tokens": 2669338.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.404263877868653,
|
|
"epoch": 0.12224322621298046,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999835696253206,
|
|
"loss": 6.3789,
|
|
"mean_token_accuracy": 0.11618088632822036,
|
|
"num_tokens": 2679108.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.435732698440551,
|
|
"epoch": 0.12266330602814535,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999832057344664,
|
|
"loss": 6.3325,
|
|
"mean_token_accuracy": 0.1142914392054081,
|
|
"num_tokens": 2688126.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.152384519577026,
|
|
"epoch": 0.12308338584331023,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999828378581504,
|
|
"loss": 6.3063,
|
|
"mean_token_accuracy": 0.12400648295879364,
|
|
"num_tokens": 2697245.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.425075197219849,
|
|
"epoch": 0.12350346565847511,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999824659963793,
|
|
"loss": 6.3465,
|
|
"mean_token_accuracy": 0.1198640413582325,
|
|
"num_tokens": 2705934.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.265953540802002,
|
|
"epoch": 0.12392354547364,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999820901491598,
|
|
"loss": 6.2796,
|
|
"mean_token_accuracy": 0.12351771965622901,
|
|
"num_tokens": 2714367.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.334036827087402,
|
|
"epoch": 0.12434362528880487,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999817103164983,
|
|
"loss": 6.3413,
|
|
"mean_token_accuracy": 0.11931266412138938,
|
|
"num_tokens": 2724366.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.360864496231079,
|
|
"epoch": 0.12476370510396975,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999813264984017,
|
|
"loss": 6.3448,
|
|
"mean_token_accuracy": 0.11467731669545174,
|
|
"num_tokens": 2733980.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.366592979431152,
|
|
"epoch": 0.12518378491913462,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999809386948767,
|
|
"loss": 6.3342,
|
|
"mean_token_accuracy": 0.12208072617650031,
|
|
"num_tokens": 2744013.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.299022817611695,
|
|
"epoch": 0.12560386473429952,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999805469059302,
|
|
"loss": 6.4186,
|
|
"mean_token_accuracy": 0.11027913689613342,
|
|
"num_tokens": 2753385.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.366168975830078,
|
|
"epoch": 0.1260239445494644,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999801511315693,
|
|
"loss": 6.256,
|
|
"mean_token_accuracy": 0.11804210916161537,
|
|
"num_tokens": 2762875.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.342552661895752,
|
|
"epoch": 0.1264440243646293,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999797513718007,
|
|
"loss": 6.3108,
|
|
"mean_token_accuracy": 0.12443676739931106,
|
|
"num_tokens": 2772182.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.206664896011352,
|
|
"epoch": 0.12686410417979416,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999793476266317,
|
|
"loss": 6.2711,
|
|
"mean_token_accuracy": 0.12031201645731926,
|
|
"num_tokens": 2780814.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.639998197555542,
|
|
"epoch": 0.12728418399495905,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999789398960695,
|
|
"loss": 6.5474,
|
|
"mean_token_accuracy": 0.1183062419295311,
|
|
"num_tokens": 2791104.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.19776029586792,
|
|
"epoch": 0.12770426381012392,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999785281801212,
|
|
"loss": 6.256,
|
|
"mean_token_accuracy": 0.11993122175335884,
|
|
"num_tokens": 2800081.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.334916496276856,
|
|
"epoch": 0.1281243436252888,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499978112478794,
|
|
"loss": 6.3835,
|
|
"mean_token_accuracy": 0.11843734234571457,
|
|
"num_tokens": 2809096.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.403998374938965,
|
|
"epoch": 0.1285444234404537,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999776927920955,
|
|
"loss": 6.3545,
|
|
"mean_token_accuracy": 0.12085104510188102,
|
|
"num_tokens": 2818857.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.3299469470977785,
|
|
"epoch": 0.12896450325561856,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499977269120033,
|
|
"loss": 6.4167,
|
|
"mean_token_accuracy": 0.11449578031897545,
|
|
"num_tokens": 2829332.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.3263038158416744,
|
|
"epoch": 0.12938458307078346,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499976841462614,
|
|
"loss": 6.3436,
|
|
"mean_token_accuracy": 0.11686776131391526,
|
|
"num_tokens": 2839193.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.397625589370728,
|
|
"epoch": 0.12980466288594833,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000499976409819846,
|
|
"loss": 6.3117,
|
|
"mean_token_accuracy": 0.11800177842378616,
|
|
"num_tokens": 2848535.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.116656970977783,
|
|
"epoch": 0.1302247427011132,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999759741917369,
|
|
"loss": 6.2278,
|
|
"mean_token_accuracy": 0.12729543596506118,
|
|
"num_tokens": 2858090.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.364631414413452,
|
|
"epoch": 0.1306448225162781,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999755345782941,
|
|
"loss": 6.378,
|
|
"mean_token_accuracy": 0.11326263695955277,
|
|
"num_tokens": 2866984.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.246821451187134,
|
|
"epoch": 0.13106490233144297,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999750909795256,
|
|
"loss": 6.1885,
|
|
"mean_token_accuracy": 0.1256905347108841,
|
|
"num_tokens": 2876550.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.341800737380981,
|
|
"epoch": 0.13148498214660786,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999746433954394,
|
|
"loss": 6.286,
|
|
"mean_token_accuracy": 0.12146776840090752,
|
|
"num_tokens": 2885782.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.275845241546631,
|
|
"epoch": 0.13190506196177273,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499974191826043,
|
|
"loss": 6.2653,
|
|
"mean_token_accuracy": 0.13301032781600952,
|
|
"num_tokens": 2894807.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 6.351547765731811,
|
|
"epoch": 0.1323251417769376,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999737362713448,
|
|
"loss": 6.304,
|
|
"mean_token_accuracy": 0.12145641520619392,
|
|
"num_tokens": 2904076.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.267245769500732,
|
|
"epoch": 0.1327452215921025,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999732767313527,
|
|
"loss": 6.2029,
|
|
"mean_token_accuracy": 0.12209122702479362,
|
|
"num_tokens": 2913761.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.383308267593383,
|
|
"epoch": 0.13316530140726737,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999728132060746,
|
|
"loss": 6.439,
|
|
"mean_token_accuracy": 0.12098384723067283,
|
|
"num_tokens": 2922848.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.364631271362304,
|
|
"epoch": 0.13358538122243227,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999723456955192,
|
|
"loss": 6.3245,
|
|
"mean_token_accuracy": 0.11949731931090354,
|
|
"num_tokens": 2932718.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.2494594097137455,
|
|
"epoch": 0.13400546103759714,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999718741996945,
|
|
"loss": 6.2837,
|
|
"mean_token_accuracy": 0.12003797963261605,
|
|
"num_tokens": 2942686.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.2547472476959225,
|
|
"epoch": 0.13442554085276204,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499971398718609,
|
|
"loss": 6.2407,
|
|
"mean_token_accuracy": 0.1179835021495819,
|
|
"num_tokens": 2952096.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.3157384395599365,
|
|
"epoch": 0.1348456206679269,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999709192522708,
|
|
"loss": 6.3129,
|
|
"mean_token_accuracy": 0.12474863901734352,
|
|
"num_tokens": 2960660.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.379588079452515,
|
|
"epoch": 0.13526570048309178,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999704358006887,
|
|
"loss": 6.3158,
|
|
"mean_token_accuracy": 0.11744728311896324,
|
|
"num_tokens": 2969834.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.285486459732056,
|
|
"epoch": 0.13568578029825668,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999699483638712,
|
|
"loss": 6.311,
|
|
"mean_token_accuracy": 0.12142582982778549,
|
|
"num_tokens": 2979023.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.294291210174561,
|
|
"epoch": 0.13610586011342155,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999694569418269,
|
|
"loss": 6.3063,
|
|
"mean_token_accuracy": 0.12201808094978332,
|
|
"num_tokens": 2988083.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.2657451152801515,
|
|
"epoch": 0.13652593992858644,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999689615345645,
|
|
"loss": 6.2388,
|
|
"mean_token_accuracy": 0.1231310561299324,
|
|
"num_tokens": 2997240.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.308252573013306,
|
|
"epoch": 0.1369460197437513,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999684621420928,
|
|
"loss": 6.3111,
|
|
"mean_token_accuracy": 0.1184695117175579,
|
|
"num_tokens": 3007077.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.319302654266357,
|
|
"epoch": 0.13736609955891618,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999679587644205,
|
|
"loss": 6.3497,
|
|
"mean_token_accuracy": 0.11671060770750045,
|
|
"num_tokens": 3015821.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.236631298065186,
|
|
"epoch": 0.13778617937408108,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999674514015568,
|
|
"loss": 6.2724,
|
|
"mean_token_accuracy": 0.11908711194992065,
|
|
"num_tokens": 3025858.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.3658030986785885,
|
|
"epoch": 0.13820625918924595,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999669400535105,
|
|
"loss": 6.2416,
|
|
"mean_token_accuracy": 0.11343135982751847,
|
|
"num_tokens": 3035537.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.147812271118164,
|
|
"epoch": 0.13862633900441085,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999664247202907,
|
|
"loss": 6.1617,
|
|
"mean_token_accuracy": 0.11974595785140991,
|
|
"num_tokens": 3044204.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.327428913116455,
|
|
"epoch": 0.13904641881957572,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999659054019066,
|
|
"loss": 6.3345,
|
|
"mean_token_accuracy": 0.11974811106920243,
|
|
"num_tokens": 3053111.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.258665418624878,
|
|
"epoch": 0.1394664986347406,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999653820983673,
|
|
"loss": 6.2415,
|
|
"mean_token_accuracy": 0.12036412507295609,
|
|
"num_tokens": 3062456.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.2644579887390135,
|
|
"epoch": 0.13988657844990549,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499964854809682,
|
|
"loss": 6.2627,
|
|
"mean_token_accuracy": 0.12668107002973555,
|
|
"num_tokens": 3071132.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.261227464675903,
|
|
"epoch": 0.14030665826507036,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999643235358602,
|
|
"loss": 6.222,
|
|
"mean_token_accuracy": 0.125965429097414,
|
|
"num_tokens": 3080892.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.215318775177002,
|
|
"epoch": 0.14072673808023525,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999637882769112,
|
|
"loss": 6.1526,
|
|
"mean_token_accuracy": 0.12532262802124022,
|
|
"num_tokens": 3089874.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.308867406845093,
|
|
"epoch": 0.14114681789540012,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999632490328447,
|
|
"loss": 6.3008,
|
|
"mean_token_accuracy": 0.12098695039749145,
|
|
"num_tokens": 3099535.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.281496620178222,
|
|
"epoch": 0.14156689771056502,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999627058036699,
|
|
"loss": 6.2552,
|
|
"mean_token_accuracy": 0.12044425159692765,
|
|
"num_tokens": 3108772.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.311051607131958,
|
|
"epoch": 0.1419869775257299,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999621585893966,
|
|
"loss": 6.2799,
|
|
"mean_token_accuracy": 0.11901640743017197,
|
|
"num_tokens": 3118333.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.305313062667847,
|
|
"epoch": 0.14240705734089476,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999616073900346,
|
|
"loss": 6.3091,
|
|
"mean_token_accuracy": 0.12129790410399437,
|
|
"num_tokens": 3127356.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.2683678150177,
|
|
"epoch": 0.14282713715605966,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999610522055935,
|
|
"loss": 6.2794,
|
|
"mean_token_accuracy": 0.11691329404711723,
|
|
"num_tokens": 3136859.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.303126668930053,
|
|
"epoch": 0.14324721697122453,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999604930360832,
|
|
"loss": 6.304,
|
|
"mean_token_accuracy": 0.11767303720116615,
|
|
"num_tokens": 3146607.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.214645338058472,
|
|
"epoch": 0.14366729678638943,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999599298815136,
|
|
"loss": 6.2515,
|
|
"mean_token_accuracy": 0.12662419229745864,
|
|
"num_tokens": 3156327.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.21446213722229,
|
|
"epoch": 0.1440873766015543,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004999593627418947,
|
|
"loss": 6.2009,
|
|
"mean_token_accuracy": 0.1281860999763012,
|
|
"num_tokens": 3165559.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.299745416641235,
|
|
"epoch": 0.14450745641671917,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999587916172365,
|
|
"loss": 6.2848,
|
|
"mean_token_accuracy": 0.11663243547081947,
|
|
"num_tokens": 3173850.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.324022483825684,
|
|
"epoch": 0.14492753623188406,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999582165075492,
|
|
"loss": 6.2353,
|
|
"mean_token_accuracy": 0.11788406521081925,
|
|
"num_tokens": 3182838.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.144151782989502,
|
|
"epoch": 0.14534761604704893,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999576374128429,
|
|
"loss": 6.2299,
|
|
"mean_token_accuracy": 0.1223968394100666,
|
|
"num_tokens": 3191692.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.343899536132812,
|
|
"epoch": 0.14576769586221383,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999570543331279,
|
|
"loss": 6.2507,
|
|
"mean_token_accuracy": 0.12281694263219833,
|
|
"num_tokens": 3200069.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.2878196239471436,
|
|
"epoch": 0.1461877756773787,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999564672684145,
|
|
"loss": 6.3406,
|
|
"mean_token_accuracy": 0.11862553879618645,
|
|
"num_tokens": 3209653.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.361492061614991,
|
|
"epoch": 0.14660785549254357,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999558762187131,
|
|
"loss": 6.2041,
|
|
"mean_token_accuracy": 0.12774061411619186,
|
|
"num_tokens": 3218313.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.146276044845581,
|
|
"epoch": 0.14702793530770847,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999552811840342,
|
|
"loss": 6.1521,
|
|
"mean_token_accuracy": 0.1273271396756172,
|
|
"num_tokens": 3227525.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.241751718521118,
|
|
"epoch": 0.14744801512287334,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999546821643884,
|
|
"loss": 6.2657,
|
|
"mean_token_accuracy": 0.121260417252779,
|
|
"num_tokens": 3237022.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.169715499877929,
|
|
"epoch": 0.14786809493803824,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999540791597861,
|
|
"loss": 6.156,
|
|
"mean_token_accuracy": 0.12248859778046609,
|
|
"num_tokens": 3246605.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.1003180027008055,
|
|
"epoch": 0.1482881747532031,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999534721702383,
|
|
"loss": 6.1054,
|
|
"mean_token_accuracy": 0.12855856791138648,
|
|
"num_tokens": 3255587.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.226248407363892,
|
|
"epoch": 0.148708254568368,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999528611957553,
|
|
"loss": 6.2171,
|
|
"mean_token_accuracy": 0.12187446802854537,
|
|
"num_tokens": 3265669.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 6.278449535369873,
|
|
"epoch": 0.14912833438353287,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999522462363485,
|
|
"loss": 6.1919,
|
|
"mean_token_accuracy": 0.1278035633265972,
|
|
"num_tokens": 3275013.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.265809679031372,
|
|
"epoch": 0.14954841419869774,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999516272920283,
|
|
"loss": 6.311,
|
|
"mean_token_accuracy": 0.1240921102464199,
|
|
"num_tokens": 3284723.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.131893539428711,
|
|
"epoch": 0.14996849401386264,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499951004362806,
|
|
"loss": 6.1325,
|
|
"mean_token_accuracy": 0.12936908155679702,
|
|
"num_tokens": 3293860.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 6.151740789413452,
|
|
"epoch": 0.1503885738290275,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999503774486924,
|
|
"loss": 6.1833,
|
|
"mean_token_accuracy": 0.12577988132834433,
|
|
"num_tokens": 3303158.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.184361696243286,
|
|
"epoch": 0.1508086536441924,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999497465496987,
|
|
"loss": 6.1137,
|
|
"mean_token_accuracy": 0.11985947787761689,
|
|
"num_tokens": 3313068.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.191692352294922,
|
|
"epoch": 0.15122873345935728,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499949111665836,
|
|
"loss": 6.2033,
|
|
"mean_token_accuracy": 0.12312208265066146,
|
|
"num_tokens": 3321885.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.25971827507019,
|
|
"epoch": 0.15164881327452215,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999484727971158,
|
|
"loss": 6.1858,
|
|
"mean_token_accuracy": 0.12474783286452293,
|
|
"num_tokens": 3330924.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.176667261123657,
|
|
"epoch": 0.15206889308968705,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499947829943549,
|
|
"loss": 6.2248,
|
|
"mean_token_accuracy": 0.12161886692047119,
|
|
"num_tokens": 3340070.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.295008039474487,
|
|
"epoch": 0.15248897290485192,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999471831051474,
|
|
"loss": 6.213,
|
|
"mean_token_accuracy": 0.13358828723430632,
|
|
"num_tokens": 3349870.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.278341436386109,
|
|
"epoch": 0.1529090527200168,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999465322819222,
|
|
"loss": 6.2576,
|
|
"mean_token_accuracy": 0.11560158357024193,
|
|
"num_tokens": 3359573.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.279096603393555,
|
|
"epoch": 0.15332913253518168,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999458774738851,
|
|
"loss": 6.1999,
|
|
"mean_token_accuracy": 0.13126230910420417,
|
|
"num_tokens": 3368577.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.1456389904022215,
|
|
"epoch": 0.15374921235034655,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999452186810476,
|
|
"loss": 6.1662,
|
|
"mean_token_accuracy": 0.12922282814979552,
|
|
"num_tokens": 3377801.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.282723903656006,
|
|
"epoch": 0.15416929216551145,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999445559034214,
|
|
"loss": 6.2248,
|
|
"mean_token_accuracy": 0.12709890604019164,
|
|
"num_tokens": 3386666.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.3540504455566404,
|
|
"epoch": 0.15458937198067632,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999438891410181,
|
|
"loss": 6.3599,
|
|
"mean_token_accuracy": 0.12122973501682281,
|
|
"num_tokens": 3396086.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.2125379085540775,
|
|
"epoch": 0.15500945179584122,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999432183938496,
|
|
"loss": 6.2646,
|
|
"mean_token_accuracy": 0.1275039754807949,
|
|
"num_tokens": 3404894.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.214909315109253,
|
|
"epoch": 0.1554295316110061,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999425436619279,
|
|
"loss": 6.2499,
|
|
"mean_token_accuracy": 0.12167986705899239,
|
|
"num_tokens": 3414172.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.310878896713257,
|
|
"epoch": 0.15584961142617096,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499941864945265,
|
|
"loss": 6.2176,
|
|
"mean_token_accuracy": 0.11906537339091301,
|
|
"num_tokens": 3423409.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 6.134654092788696,
|
|
"epoch": 0.15626969124133586,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999411822438726,
|
|
"loss": 6.1799,
|
|
"mean_token_accuracy": 0.12394418343901634,
|
|
"num_tokens": 3433047.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 6.2948554992675785,
|
|
"epoch": 0.15668977105650073,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499940495557763,
|
|
"loss": 6.173,
|
|
"mean_token_accuracy": 0.12352384477853776,
|
|
"num_tokens": 3442490.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.233772277832031,
|
|
"epoch": 0.15710985087166562,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999398048869485,
|
|
"loss": 6.2356,
|
|
"mean_token_accuracy": 0.1239772841334343,
|
|
"num_tokens": 3451804.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.296554517745972,
|
|
"epoch": 0.1575299306868305,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499939110231441,
|
|
"loss": 6.2223,
|
|
"mean_token_accuracy": 0.12610766440629959,
|
|
"num_tokens": 3461481.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.218039226531983,
|
|
"epoch": 0.1579500105019954,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999384115912531,
|
|
"loss": 6.2673,
|
|
"mean_token_accuracy": 0.1208581991493702,
|
|
"num_tokens": 3471798.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.088755655288696,
|
|
"epoch": 0.15837009031716026,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499937708966397,
|
|
"loss": 6.1755,
|
|
"mean_token_accuracy": 0.12277546525001526,
|
|
"num_tokens": 3481386.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.257310009002685,
|
|
"epoch": 0.15879017013232513,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999370023568853,
|
|
"loss": 6.1643,
|
|
"mean_token_accuracy": 0.12328559309244155,
|
|
"num_tokens": 3489981.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 6.140112638473511,
|
|
"epoch": 0.15921024994749003,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999362917627304,
|
|
"loss": 6.1438,
|
|
"mean_token_accuracy": 0.12805134281516076,
|
|
"num_tokens": 3498551.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.224145746231079,
|
|
"epoch": 0.1596303297626549,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999355771839448,
|
|
"loss": 6.1267,
|
|
"mean_token_accuracy": 0.1276252895593643,
|
|
"num_tokens": 3507921.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 6.316604804992676,
|
|
"epoch": 0.1600504095778198,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999348586205414,
|
|
"loss": 6.2984,
|
|
"mean_token_accuracy": 0.12361158952116966,
|
|
"num_tokens": 3517570.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.265382909774781,
|
|
"epoch": 0.16047048939298467,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999341360725327,
|
|
"loss": 6.2786,
|
|
"mean_token_accuracy": 0.11925147697329522,
|
|
"num_tokens": 3526774.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.244428873062134,
|
|
"epoch": 0.16089056920814954,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999334095399317,
|
|
"loss": 6.2167,
|
|
"mean_token_accuracy": 0.1289656363427639,
|
|
"num_tokens": 3535319.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.091944026947021,
|
|
"epoch": 0.16131064902331443,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999326790227512,
|
|
"loss": 6.1819,
|
|
"mean_token_accuracy": 0.12599623277783395,
|
|
"num_tokens": 3544468.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 6.069698667526245,
|
|
"epoch": 0.1617307288384793,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999319445210041,
|
|
"loss": 6.0574,
|
|
"mean_token_accuracy": 0.13135963827371597,
|
|
"num_tokens": 3553529.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.176232147216797,
|
|
"epoch": 0.1621508086536442,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999312060347034,
|
|
"loss": 6.1206,
|
|
"mean_token_accuracy": 0.12521466836333275,
|
|
"num_tokens": 3563053.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.155474901199341,
|
|
"epoch": 0.16257088846880907,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999304635638621,
|
|
"loss": 6.0713,
|
|
"mean_token_accuracy": 0.13156753256917,
|
|
"num_tokens": 3571877.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.117454576492309,
|
|
"epoch": 0.16299096828397394,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999297171084935,
|
|
"loss": 6.1211,
|
|
"mean_token_accuracy": 0.12843042388558387,
|
|
"num_tokens": 3581496.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.246276712417602,
|
|
"epoch": 0.16341104809913884,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999289666686109,
|
|
"loss": 6.1408,
|
|
"mean_token_accuracy": 0.12944318503141403,
|
|
"num_tokens": 3590752.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.026504850387573,
|
|
"epoch": 0.1638311279143037,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999282122442274,
|
|
"loss": 6.1427,
|
|
"mean_token_accuracy": 0.12940528690814973,
|
|
"num_tokens": 3599885.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 6.306515789031982,
|
|
"epoch": 0.1642512077294686,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999274538353564,
|
|
"loss": 6.2127,
|
|
"mean_token_accuracy": 0.12124313414096832,
|
|
"num_tokens": 3610039.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.1400439739227295,
|
|
"epoch": 0.16467128754463348,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999266914420114,
|
|
"loss": 6.1432,
|
|
"mean_token_accuracy": 0.12274663522839546,
|
|
"num_tokens": 3619954.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 6.1886210441589355,
|
|
"epoch": 0.16509136735979837,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499925925064206,
|
|
"loss": 6.0913,
|
|
"mean_token_accuracy": 0.13008279874920844,
|
|
"num_tokens": 3628164.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.256851673126221,
|
|
"epoch": 0.16551144717496324,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999251547019535,
|
|
"loss": 6.2411,
|
|
"mean_token_accuracy": 0.1288958877325058,
|
|
"num_tokens": 3636778.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.259689378738403,
|
|
"epoch": 0.16593152699012811,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999243803552678,
|
|
"loss": 6.2104,
|
|
"mean_token_accuracy": 0.1265132576227188,
|
|
"num_tokens": 3647046.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 6.134534025192261,
|
|
"epoch": 0.166351606805293,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999236020241625,
|
|
"loss": 6.1237,
|
|
"mean_token_accuracy": 0.1289564423263073,
|
|
"num_tokens": 3656130.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 6.189244413375855,
|
|
"epoch": 0.16677168662045788,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999228197086514,
|
|
"loss": 6.2018,
|
|
"mean_token_accuracy": 0.11904976442456246,
|
|
"num_tokens": 3666145.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 6.2379295349121096,
|
|
"epoch": 0.16719176643562278,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999220334087484,
|
|
"loss": 6.2356,
|
|
"mean_token_accuracy": 0.12509587332606315,
|
|
"num_tokens": 3676722.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 6.233392667770386,
|
|
"epoch": 0.16761184625078765,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999212431244673,
|
|
"loss": 6.2382,
|
|
"mean_token_accuracy": 0.1240171104669571,
|
|
"num_tokens": 3685880.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 6.1124889850616455,
|
|
"epoch": 0.16803192606595252,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999204488558222,
|
|
"loss": 6.0582,
|
|
"mean_token_accuracy": 0.13227254450321196,
|
|
"num_tokens": 3695167.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 6.222057247161866,
|
|
"epoch": 0.16845200588111742,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999196506028273,
|
|
"loss": 6.1797,
|
|
"mean_token_accuracy": 0.12606113404035568,
|
|
"num_tokens": 3703700.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 6.204267930984497,
|
|
"epoch": 0.1688720856962823,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999188483654965,
|
|
"loss": 6.1263,
|
|
"mean_token_accuracy": 0.12780678346753122,
|
|
"num_tokens": 3712825.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 6.068148231506347,
|
|
"epoch": 0.16929216551144718,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999180421438442,
|
|
"loss": 6.0953,
|
|
"mean_token_accuracy": 0.12944422513246537,
|
|
"num_tokens": 3721807.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 6.252347660064697,
|
|
"epoch": 0.16971224532661205,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999172319378846,
|
|
"loss": 6.2617,
|
|
"mean_token_accuracy": 0.12066083624958993,
|
|
"num_tokens": 3730502.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 6.223606538772583,
|
|
"epoch": 0.17013232514177692,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999164177476319,
|
|
"loss": 6.1457,
|
|
"mean_token_accuracy": 0.13003366217017173,
|
|
"num_tokens": 3739696.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 6.0265522480010985,
|
|
"epoch": 0.17055240495694182,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999155995731009,
|
|
"loss": 6.1404,
|
|
"mean_token_accuracy": 0.1299336552619934,
|
|
"num_tokens": 3748675.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 6.380355882644653,
|
|
"epoch": 0.1709724847721067,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999147774143057,
|
|
"loss": 6.2221,
|
|
"mean_token_accuracy": 0.12048738449811935,
|
|
"num_tokens": 3757714.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 6.067580938339233,
|
|
"epoch": 0.1713925645872716,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499913951271261,
|
|
"loss": 6.0375,
|
|
"mean_token_accuracy": 0.13202561810612679,
|
|
"num_tokens": 3767589.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 6.142302322387695,
|
|
"epoch": 0.17181264440243646,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004999131211439816,
|
|
"loss": 6.1596,
|
|
"mean_token_accuracy": 0.12828587144613265,
|
|
"num_tokens": 3777261.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 6.232779121398925,
|
|
"epoch": 0.17223272421760136,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499912287032482,
|
|
"loss": 6.1001,
|
|
"mean_token_accuracy": 0.1372594192624092,
|
|
"num_tokens": 3786658.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 6.025224256515503,
|
|
"epoch": 0.17265280403276623,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499911448936777,
|
|
"loss": 6.1026,
|
|
"mean_token_accuracy": 0.13396917879581452,
|
|
"num_tokens": 3794977.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 6.084959363937378,
|
|
"epoch": 0.1730728838479311,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999106068568816,
|
|
"loss": 6.1787,
|
|
"mean_token_accuracy": 0.12529570311307908,
|
|
"num_tokens": 3805138.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 6.263661098480225,
|
|
"epoch": 0.173492963663096,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999097607928106,
|
|
"loss": 6.1258,
|
|
"mean_token_accuracy": 0.13813115134835244,
|
|
"num_tokens": 3814444.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 6.166193580627441,
|
|
"epoch": 0.17391304347826086,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999089107445788,
|
|
"loss": 6.0785,
|
|
"mean_token_accuracy": 0.12874337583780288,
|
|
"num_tokens": 3822859.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 6.0040192127227785,
|
|
"epoch": 0.17433312329342576,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999080567122016,
|
|
"loss": 6.102,
|
|
"mean_token_accuracy": 0.1266925446689129,
|
|
"num_tokens": 3833159.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 6.185031747817993,
|
|
"epoch": 0.17475320310859063,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999071986956941,
|
|
"loss": 6.1269,
|
|
"mean_token_accuracy": 0.1295515276491642,
|
|
"num_tokens": 3842136.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 6.116478013992309,
|
|
"epoch": 0.1751732829237555,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999063366950713,
|
|
"loss": 6.1939,
|
|
"mean_token_accuracy": 0.1253967322409153,
|
|
"num_tokens": 3851406.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 6.1408590316772464,
|
|
"epoch": 0.1755933627389204,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999054707103486,
|
|
"loss": 6.1026,
|
|
"mean_token_accuracy": 0.1274511694908142,
|
|
"num_tokens": 3861061.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 6.164148044586182,
|
|
"epoch": 0.17601344255408527,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999046007415412,
|
|
"loss": 6.067,
|
|
"mean_token_accuracy": 0.12591860070824623,
|
|
"num_tokens": 3870357.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 6.192416858673096,
|
|
"epoch": 0.17643352236925017,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999037267886646,
|
|
"loss": 6.0964,
|
|
"mean_token_accuracy": 0.1299741767346859,
|
|
"num_tokens": 3879393.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 6.0785363674163815,
|
|
"epoch": 0.17685360218441504,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999028488517343,
|
|
"loss": 6.1037,
|
|
"mean_token_accuracy": 0.12889744639396666,
|
|
"num_tokens": 3888030.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 6.11736216545105,
|
|
"epoch": 0.1772736819995799,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999019669307659,
|
|
"loss": 6.1275,
|
|
"mean_token_accuracy": 0.13039418011903764,
|
|
"num_tokens": 3897430.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 6.1809111595153805,
|
|
"epoch": 0.1776937618147448,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999010810257749,
|
|
"loss": 6.1428,
|
|
"mean_token_accuracy": 0.1269817218184471,
|
|
"num_tokens": 3907711.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 6.062447786331177,
|
|
"epoch": 0.17811384162990967,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999001911367771,
|
|
"loss": 6.0668,
|
|
"mean_token_accuracy": 0.1323694571852684,
|
|
"num_tokens": 3915816.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 6.1604491710662845,
|
|
"epoch": 0.17853392144507457,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998992972637883,
|
|
"loss": 6.1943,
|
|
"mean_token_accuracy": 0.1183660313487053,
|
|
"num_tokens": 3925162.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 6.203741979598999,
|
|
"epoch": 0.17895400126023944,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998983994068242,
|
|
"loss": 6.0864,
|
|
"mean_token_accuracy": 0.1282353989779949,
|
|
"num_tokens": 3934476.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 6.044822025299072,
|
|
"epoch": 0.17937408107540434,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998974975659006,
|
|
"loss": 6.124,
|
|
"mean_token_accuracy": 0.12441963106393814,
|
|
"num_tokens": 3943501.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 6.184865283966064,
|
|
"epoch": 0.1797941608905692,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998965917410338,
|
|
"loss": 6.1111,
|
|
"mean_token_accuracy": 0.12969196289777757,
|
|
"num_tokens": 3953663.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 6.129238748550415,
|
|
"epoch": 0.18021424070573408,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998956819322397,
|
|
"loss": 6.0839,
|
|
"mean_token_accuracy": 0.13072072938084603,
|
|
"num_tokens": 3962634.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 6.135206937789917,
|
|
"epoch": 0.18063432052089898,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998947681395343,
|
|
"loss": 6.0859,
|
|
"mean_token_accuracy": 0.1366378679871559,
|
|
"num_tokens": 3972496.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 6.271072053909302,
|
|
"epoch": 0.18105440033606385,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499893850362934,
|
|
"loss": 6.3296,
|
|
"mean_token_accuracy": 0.12187584564089775,
|
|
"num_tokens": 3980724.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 6.224115467071533,
|
|
"epoch": 0.18147448015122875,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998929286024548,
|
|
"loss": 6.1594,
|
|
"mean_token_accuracy": 0.12844373360276223,
|
|
"num_tokens": 3989842.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 6.123717546463013,
|
|
"epoch": 0.18189455996639362,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998920028581133,
|
|
"loss": 6.0814,
|
|
"mean_token_accuracy": 0.13656101748347282,
|
|
"num_tokens": 3998534.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 6.150679874420166,
|
|
"epoch": 0.18231463978155849,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998910731299258,
|
|
"loss": 6.1088,
|
|
"mean_token_accuracy": 0.12456604689359665,
|
|
"num_tokens": 4007677.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 6.126907587051392,
|
|
"epoch": 0.18273471959672338,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998901394179085,
|
|
"loss": 6.1638,
|
|
"mean_token_accuracy": 0.12525054216384887,
|
|
"num_tokens": 4016347.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 6.135372829437256,
|
|
"epoch": 0.18315479941188825,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998892017220784,
|
|
"loss": 6.0213,
|
|
"mean_token_accuracy": 0.13323480933904647,
|
|
"num_tokens": 4025199.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 6.137722158432007,
|
|
"epoch": 0.18357487922705315,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998882600424519,
|
|
"loss": 6.0876,
|
|
"mean_token_accuracy": 0.12551357075572014,
|
|
"num_tokens": 4033933.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 6.108227968215942,
|
|
"epoch": 0.18399495904221802,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004998873143790455,
|
|
"loss": 6.0183,
|
|
"mean_token_accuracy": 0.1379354938864708,
|
|
"num_tokens": 4042891.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 6.1591612815856935,
|
|
"epoch": 0.1844150388573829,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998863647318763,
|
|
"loss": 6.1366,
|
|
"mean_token_accuracy": 0.1241612270474434,
|
|
"num_tokens": 4051123.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 6.089571523666382,
|
|
"epoch": 0.1848351186725478,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998854111009608,
|
|
"loss": 6.113,
|
|
"mean_token_accuracy": 0.12376126572489739,
|
|
"num_tokens": 4060025.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 6.11730580329895,
|
|
"epoch": 0.18525519848771266,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998844534863161,
|
|
"loss": 6.0217,
|
|
"mean_token_accuracy": 0.12926619052886962,
|
|
"num_tokens": 4069363.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 6.176160907745361,
|
|
"epoch": 0.18567527830287756,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998834918879592,
|
|
"loss": 6.1692,
|
|
"mean_token_accuracy": 0.12947654128074645,
|
|
"num_tokens": 4078855.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 6.131696176528931,
|
|
"epoch": 0.18609535811804243,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499882526305907,
|
|
"loss": 6.1424,
|
|
"mean_token_accuracy": 0.12837494984269143,
|
|
"num_tokens": 4087801.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 6.191353893280029,
|
|
"epoch": 0.18651543793320732,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998815567401765,
|
|
"loss": 6.1351,
|
|
"mean_token_accuracy": 0.12790770679712296,
|
|
"num_tokens": 4096949.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 6.171415328979492,
|
|
"epoch": 0.1869355177483722,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998805831907851,
|
|
"loss": 6.084,
|
|
"mean_token_accuracy": 0.1275387942790985,
|
|
"num_tokens": 4105399.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 6.12052903175354,
|
|
"epoch": 0.18735559756353706,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998796056577501,
|
|
"loss": 6.0391,
|
|
"mean_token_accuracy": 0.1234730213880539,
|
|
"num_tokens": 4113873.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 6.033805179595947,
|
|
"epoch": 0.18777567737870196,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998786241410886,
|
|
"loss": 6.1003,
|
|
"mean_token_accuracy": 0.12796764224767684,
|
|
"num_tokens": 4123528.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 6.244566345214844,
|
|
"epoch": 0.18819575719386683,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499877638640818,
|
|
"loss": 6.1131,
|
|
"mean_token_accuracy": 0.12414761930704117,
|
|
"num_tokens": 4133370.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 6.0351306915283205,
|
|
"epoch": 0.18861583700903173,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499876649156956,
|
|
"loss": 6.0237,
|
|
"mean_token_accuracy": 0.13068948239088057,
|
|
"num_tokens": 4142370.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 6.075446557998657,
|
|
"epoch": 0.1890359168241966,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998756556895196,
|
|
"loss": 6.1176,
|
|
"mean_token_accuracy": 0.12780525609850885,
|
|
"num_tokens": 4152367.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 6.182886552810669,
|
|
"epoch": 0.18945599663936147,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499874658238527,
|
|
"loss": 6.0979,
|
|
"mean_token_accuracy": 0.1277949795126915,
|
|
"num_tokens": 4161126.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 6.106898975372315,
|
|
"epoch": 0.18987607645452637,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998736568039957,
|
|
"loss": 6.0094,
|
|
"mean_token_accuracy": 0.13100193440914154,
|
|
"num_tokens": 4169910.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 6.133787775039673,
|
|
"epoch": 0.19029615626969124,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998726513859432,
|
|
"loss": 6.1599,
|
|
"mean_token_accuracy": 0.12446666359901429,
|
|
"num_tokens": 4179893.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 6.202354001998901,
|
|
"epoch": 0.19071623608485613,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998716419843875,
|
|
"loss": 6.1617,
|
|
"mean_token_accuracy": 0.1319762259721756,
|
|
"num_tokens": 4190065.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 6.011490678787231,
|
|
"epoch": 0.191136315900021,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998706285993465,
|
|
"loss": 6.069,
|
|
"mean_token_accuracy": 0.13331144750118257,
|
|
"num_tokens": 4198395.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 6.173086833953858,
|
|
"epoch": 0.19155639571518587,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998696112308381,
|
|
"loss": 6.093,
|
|
"mean_token_accuracy": 0.1271330051124096,
|
|
"num_tokens": 4207555.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 6.0555767059326175,
|
|
"epoch": 0.19197647553035077,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998685898788803,
|
|
"loss": 6.0375,
|
|
"mean_token_accuracy": 0.1309538424015045,
|
|
"num_tokens": 4216533.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 6.211866235733032,
|
|
"epoch": 0.19239655534551564,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998675645434914,
|
|
"loss": 6.1419,
|
|
"mean_token_accuracy": 0.1353093557059765,
|
|
"num_tokens": 4225575.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 6.018606328964234,
|
|
"epoch": 0.19281663516068054,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998665352246891,
|
|
"loss": 5.9193,
|
|
"mean_token_accuracy": 0.13810657039284707,
|
|
"num_tokens": 4234306.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 6.014672660827637,
|
|
"epoch": 0.1932367149758454,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998655019224921,
|
|
"loss": 6.1267,
|
|
"mean_token_accuracy": 0.12904786244034766,
|
|
"num_tokens": 4243998.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 6.134347867965698,
|
|
"epoch": 0.19365679479101028,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998644646369185,
|
|
"loss": 6.0238,
|
|
"mean_token_accuracy": 0.12680166810750962,
|
|
"num_tokens": 4253653.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 6.066501617431641,
|
|
"epoch": 0.19407687460617518,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998634233679865,
|
|
"loss": 6.0895,
|
|
"mean_token_accuracy": 0.12311211153864861,
|
|
"num_tokens": 4263305.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 6.049868440628051,
|
|
"epoch": 0.19449695442134005,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499862378115715,
|
|
"loss": 5.983,
|
|
"mean_token_accuracy": 0.13395097106695175,
|
|
"num_tokens": 4272212.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 6.165916633605957,
|
|
"epoch": 0.19491703423650494,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998613288801221,
|
|
"loss": 6.1922,
|
|
"mean_token_accuracy": 0.1247316338121891,
|
|
"num_tokens": 4281445.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 6.179806041717529,
|
|
"epoch": 0.1953371140516698,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004998602756612267,
|
|
"loss": 6.0898,
|
|
"mean_token_accuracy": 0.12693395391106604,
|
|
"num_tokens": 4290938.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 6.070136451721192,
|
|
"epoch": 0.1957571938668347,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998592184590471,
|
|
"loss": 6.1397,
|
|
"mean_token_accuracy": 0.12676772177219392,
|
|
"num_tokens": 4300022.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 6.06673412322998,
|
|
"epoch": 0.19617727368199958,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998581572736024,
|
|
"loss": 6.0179,
|
|
"mean_token_accuracy": 0.13165862262248992,
|
|
"num_tokens": 4308910.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 5.994941234588623,
|
|
"epoch": 0.19659735349716445,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998570921049112,
|
|
"loss": 5.9863,
|
|
"mean_token_accuracy": 0.135918989777565,
|
|
"num_tokens": 4317136.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 6.102301931381225,
|
|
"epoch": 0.19701743331232935,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998560229529924,
|
|
"loss": 6.0425,
|
|
"mean_token_accuracy": 0.13503788635134698,
|
|
"num_tokens": 4326163.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 6.227736186981201,
|
|
"epoch": 0.19743751312749422,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998549498178649,
|
|
"loss": 6.1881,
|
|
"mean_token_accuracy": 0.13264173418283462,
|
|
"num_tokens": 4335837.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 6.1506922245025635,
|
|
"epoch": 0.19785759294265912,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998538726995477,
|
|
"loss": 6.1094,
|
|
"mean_token_accuracy": 0.13223380818963051,
|
|
"num_tokens": 4345108.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 6.144142389297485,
|
|
"epoch": 0.198277672757824,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00049985279159806,
|
|
"loss": 6.1229,
|
|
"mean_token_accuracy": 0.1271647334098816,
|
|
"num_tokens": 4353761.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 6.1053972244262695,
|
|
"epoch": 0.19869775257298886,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998517065134208,
|
|
"loss": 6.0771,
|
|
"mean_token_accuracy": 0.1304875746369362,
|
|
"num_tokens": 4363244.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 6.125473690032959,
|
|
"epoch": 0.19911783238815375,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998506174456494,
|
|
"loss": 6.0856,
|
|
"mean_token_accuracy": 0.1269718214869499,
|
|
"num_tokens": 4373034.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 6.056502437591552,
|
|
"epoch": 0.19953791220331862,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998495243947653,
|
|
"loss": 6.0113,
|
|
"mean_token_accuracy": 0.12611002326011658,
|
|
"num_tokens": 4382554.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 6.116158485412598,
|
|
"epoch": 0.19995799201848352,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998484273607875,
|
|
"loss": 6.0324,
|
|
"mean_token_accuracy": 0.13722692728042601,
|
|
"num_tokens": 4391001.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 5.908738136291504,
|
|
"epoch": 0.2003780718336484,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998473263437356,
|
|
"loss": 5.9468,
|
|
"mean_token_accuracy": 0.1328367456793785,
|
|
"num_tokens": 4400632.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 6.068370723724366,
|
|
"epoch": 0.20079815164881326,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499846221343629,
|
|
"loss": 6.0486,
|
|
"mean_token_accuracy": 0.12969876527786256,
|
|
"num_tokens": 4409565.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 6.078929996490478,
|
|
"epoch": 0.20121823146397816,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998451123604875,
|
|
"loss": 5.9972,
|
|
"mean_token_accuracy": 0.13624220937490464,
|
|
"num_tokens": 4418384.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 6.103708171844483,
|
|
"epoch": 0.20163831127914303,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998439993943306,
|
|
"loss": 6.11,
|
|
"mean_token_accuracy": 0.13608327358961106,
|
|
"num_tokens": 4427581.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 6.2018999576568605,
|
|
"epoch": 0.20205839109430793,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998428824451779,
|
|
"loss": 6.1047,
|
|
"mean_token_accuracy": 0.1272777199745178,
|
|
"num_tokens": 4436572.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 6.056638908386231,
|
|
"epoch": 0.2024784709094728,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998417615130495,
|
|
"loss": 6.1099,
|
|
"mean_token_accuracy": 0.12568870037794114,
|
|
"num_tokens": 4445230.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 6.192966461181641,
|
|
"epoch": 0.2028985507246377,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998406365979649,
|
|
"loss": 6.1712,
|
|
"mean_token_accuracy": 0.12947247475385665,
|
|
"num_tokens": 4454251.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 6.0738544940948485,
|
|
"epoch": 0.20331863053980256,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998395076999443,
|
|
"loss": 6.0246,
|
|
"mean_token_accuracy": 0.1331735722720623,
|
|
"num_tokens": 4463949.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 6.164913845062256,
|
|
"epoch": 0.20373871035496743,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998383748190076,
|
|
"loss": 6.2178,
|
|
"mean_token_accuracy": 0.12642809972167016,
|
|
"num_tokens": 4473373.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 6.169246625900269,
|
|
"epoch": 0.20415879017013233,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998372379551748,
|
|
"loss": 6.0443,
|
|
"mean_token_accuracy": 0.13512365892529488,
|
|
"num_tokens": 4482303.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 6.000651454925537,
|
|
"epoch": 0.2045788699852972,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998360971084663,
|
|
"loss": 6.0248,
|
|
"mean_token_accuracy": 0.1257840245962143,
|
|
"num_tokens": 4491214.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 6.060888242721558,
|
|
"epoch": 0.2049989498004621,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998349522789019,
|
|
"loss": 5.9365,
|
|
"mean_token_accuracy": 0.14086327105760574,
|
|
"num_tokens": 4500099.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 6.020166492462158,
|
|
"epoch": 0.20541902961562697,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998338034665021,
|
|
"loss": 6.0199,
|
|
"mean_token_accuracy": 0.13966668471693994,
|
|
"num_tokens": 4509893.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 6.064390420913696,
|
|
"epoch": 0.20583910943079184,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998326506712872,
|
|
"loss": 5.9974,
|
|
"mean_token_accuracy": 0.13378938734531404,
|
|
"num_tokens": 4518606.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 6.097909021377563,
|
|
"epoch": 0.20625918924595674,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998314938932778,
|
|
"loss": 6.0759,
|
|
"mean_token_accuracy": 0.1298009656369686,
|
|
"num_tokens": 4528392.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 6.1035826206207275,
|
|
"epoch": 0.2066792690611216,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998303331324943,
|
|
"loss": 6.0416,
|
|
"mean_token_accuracy": 0.13463694974780083,
|
|
"num_tokens": 4536983.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 5.9858495712280275,
|
|
"epoch": 0.2070993488762865,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998291683889571,
|
|
"loss": 5.9442,
|
|
"mean_token_accuracy": 0.13662122339010238,
|
|
"num_tokens": 4544967.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 6.056029415130615,
|
|
"epoch": 0.20751942869145137,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000499827999662687,
|
|
"loss": 6.0242,
|
|
"mean_token_accuracy": 0.12964650020003318,
|
|
"num_tokens": 4554646.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 6.118838214874268,
|
|
"epoch": 0.20793950850661624,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998268269537046,
|
|
"loss": 6.0401,
|
|
"mean_token_accuracy": 0.13539641574025155,
|
|
"num_tokens": 4564040.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 6.022972631454468,
|
|
"epoch": 0.20835958832178114,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998256502620308,
|
|
"loss": 6.0624,
|
|
"mean_token_accuracy": 0.13345976546406746,
|
|
"num_tokens": 4573758.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 6.193491125106812,
|
|
"epoch": 0.208779668136946,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998244695876864,
|
|
"loss": 6.0874,
|
|
"mean_token_accuracy": 0.13196430653333663,
|
|
"num_tokens": 4582097.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 6.018001937866211,
|
|
"epoch": 0.2091997479521109,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004998232849306921,
|
|
"loss": 6.064,
|
|
"mean_token_accuracy": 0.1368905283510685,
|
|
"num_tokens": 4590687.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 6.152202367782593,
|
|
"epoch": 0.20961982776727578,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998220962910693,
|
|
"loss": 6.0475,
|
|
"mean_token_accuracy": 0.12533890679478646,
|
|
"num_tokens": 4599497.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 6.059301280975342,
|
|
"epoch": 0.21003990758244068,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998209036688386,
|
|
"loss": 6.0091,
|
|
"mean_token_accuracy": 0.12979092076420784,
|
|
"num_tokens": 4607958.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 6.12682089805603,
|
|
"epoch": 0.21045998739760555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998197070640216,
|
|
"loss": 6.1445,
|
|
"mean_token_accuracy": 0.12323907017707825,
|
|
"num_tokens": 4617515.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 6.13975419998169,
|
|
"epoch": 0.21088006721277042,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998185064766391,
|
|
"loss": 6.028,
|
|
"mean_token_accuracy": 0.13126113414764404,
|
|
"num_tokens": 4627037.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 5.999127197265625,
|
|
"epoch": 0.21130014702793531,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998173019067127,
|
|
"loss": 6.0335,
|
|
"mean_token_accuracy": 0.13387575298547744,
|
|
"num_tokens": 4637393.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 6.049172449111938,
|
|
"epoch": 0.21172022684310018,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998160933542633,
|
|
"loss": 6.0685,
|
|
"mean_token_accuracy": 0.12128801420331001,
|
|
"num_tokens": 4646832.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 6.16112699508667,
|
|
"epoch": 0.21214030665826508,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004998148808193128,
|
|
"loss": 6.095,
|
|
"mean_token_accuracy": 0.1346332848072052,
|
|
"num_tokens": 4655719.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 6.126083850860596,
|
|
"epoch": 0.21256038647342995,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998136643018823,
|
|
"loss": 6.0477,
|
|
"mean_token_accuracy": 0.12910717576742173,
|
|
"num_tokens": 4665364.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 6.087383460998535,
|
|
"epoch": 0.21298046628859482,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998124438019935,
|
|
"loss": 6.0166,
|
|
"mean_token_accuracy": 0.1316668502986431,
|
|
"num_tokens": 4674760.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 5.993421936035157,
|
|
"epoch": 0.21340054610375972,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998112193196681,
|
|
"loss": 5.9488,
|
|
"mean_token_accuracy": 0.13391186147928238,
|
|
"num_tokens": 4683900.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 5.969591331481934,
|
|
"epoch": 0.2138206259189246,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998099908549277,
|
|
"loss": 5.9886,
|
|
"mean_token_accuracy": 0.1273488573729992,
|
|
"num_tokens": 4693915.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 5.9875883102417,
|
|
"epoch": 0.2142407057340895,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499808758407794,
|
|
"loss": 5.8619,
|
|
"mean_token_accuracy": 0.13991126343607901,
|
|
"num_tokens": 4703102.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 6.031775951385498,
|
|
"epoch": 0.21466078554925436,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998075219782889,
|
|
"loss": 6.0787,
|
|
"mean_token_accuracy": 0.1323968604207039,
|
|
"num_tokens": 4712925.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 6.099209594726562,
|
|
"epoch": 0.21508086536441923,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998062815664344,
|
|
"loss": 6.0069,
|
|
"mean_token_accuracy": 0.12949655801057816,
|
|
"num_tokens": 4722641.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 6.046544742584229,
|
|
"epoch": 0.21550094517958412,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998050371722524,
|
|
"loss": 6.0781,
|
|
"mean_token_accuracy": 0.12990766763687134,
|
|
"num_tokens": 4732603.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 5.932075929641724,
|
|
"epoch": 0.215921024994749,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998037887957649,
|
|
"loss": 5.9211,
|
|
"mean_token_accuracy": 0.13785294219851493,
|
|
"num_tokens": 4742644.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 6.21406192779541,
|
|
"epoch": 0.2163411048099139,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998025364369939,
|
|
"loss": 6.2335,
|
|
"mean_token_accuracy": 0.1234040841460228,
|
|
"num_tokens": 4751482.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 6.237205886840821,
|
|
"epoch": 0.21676118462507876,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998012800959619,
|
|
"loss": 6.0891,
|
|
"mean_token_accuracy": 0.12757375389337539,
|
|
"num_tokens": 4760593.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 6.093921661376953,
|
|
"epoch": 0.21718126444024366,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998000197726909,
|
|
"loss": 6.0827,
|
|
"mean_token_accuracy": 0.13335589170455933,
|
|
"num_tokens": 4769294.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 6.031546688079834,
|
|
"epoch": 0.21760134425540853,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004997987554672033,
|
|
"loss": 6.0081,
|
|
"mean_token_accuracy": 0.13305121287703514,
|
|
"num_tokens": 4779239.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 6.059205436706543,
|
|
"epoch": 0.2180214240705734,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004997974871795215,
|
|
"loss": 6.0716,
|
|
"mean_token_accuracy": 0.13057481795549392,
|
|
"num_tokens": 4788211.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 6.109251928329468,
|
|
"epoch": 0.2184415038857383,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499796214909668,
|
|
"loss": 6.0447,
|
|
"mean_token_accuracy": 0.13531798869371414,
|
|
"num_tokens": 4797921.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 6.092241191864014,
|
|
"epoch": 0.21886158370090317,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997949386576653,
|
|
"loss": 6.0378,
|
|
"mean_token_accuracy": 0.13213689997792244,
|
|
"num_tokens": 4807772.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 6.042962265014649,
|
|
"epoch": 0.21928166351606806,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499793658423536,
|
|
"loss": 6.0593,
|
|
"mean_token_accuracy": 0.13149860948324205,
|
|
"num_tokens": 4817999.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 6.057756137847901,
|
|
"epoch": 0.21970174333123293,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997923742073028,
|
|
"loss": 6.0136,
|
|
"mean_token_accuracy": 0.13949006497859956,
|
|
"num_tokens": 4826679.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 5.998235082626342,
|
|
"epoch": 0.2201218231463978,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997910860089884,
|
|
"loss": 6.0157,
|
|
"mean_token_accuracy": 0.13456794619560242,
|
|
"num_tokens": 4834998.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 6.064208889007569,
|
|
"epoch": 0.2205419029615627,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997897938286156,
|
|
"loss": 5.9717,
|
|
"mean_token_accuracy": 0.1337368108332157,
|
|
"num_tokens": 4843635.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 6.085119295120239,
|
|
"epoch": 0.22096198277672757,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997884976662075,
|
|
"loss": 6.0919,
|
|
"mean_token_accuracy": 0.12607687711715698,
|
|
"num_tokens": 4852027.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 6.183318328857422,
|
|
"epoch": 0.22138206259189247,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997871975217868,
|
|
"loss": 6.0165,
|
|
"mean_token_accuracy": 0.1429324761033058,
|
|
"num_tokens": 4861244.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 5.912706756591797,
|
|
"epoch": 0.22180214240705734,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997858933953768,
|
|
"loss": 5.9326,
|
|
"mean_token_accuracy": 0.1404939979314804,
|
|
"num_tokens": 4869902.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 5.963629674911499,
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997845852870004,
|
|
"loss": 5.8982,
|
|
"mean_token_accuracy": 0.14085923954844476,
|
|
"num_tokens": 4878502.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 5.986082458496094,
|
|
"epoch": 0.2226423020373871,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997832731966806,
|
|
"loss": 5.964,
|
|
"mean_token_accuracy": 0.14047276899218558,
|
|
"num_tokens": 4888348.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 6.051373815536499,
|
|
"epoch": 0.22306238185255198,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997819571244411,
|
|
"loss": 6.0172,
|
|
"mean_token_accuracy": 0.13845039829611777,
|
|
"num_tokens": 4897302.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 6.01381549835205,
|
|
"epoch": 0.22348246166771688,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997806370703049,
|
|
"loss": 6.0476,
|
|
"mean_token_accuracy": 0.13289312049746513,
|
|
"num_tokens": 4907078.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 5.983912467956543,
|
|
"epoch": 0.22390254148288175,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997793130342954,
|
|
"loss": 5.8784,
|
|
"mean_token_accuracy": 0.1382697917521,
|
|
"num_tokens": 4917489.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 5.94772891998291,
|
|
"epoch": 0.22432262129804661,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004997779850164363,
|
|
"loss": 5.9836,
|
|
"mean_token_accuracy": 0.13369291126728058,
|
|
"num_tokens": 4927073.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 6.121642923355102,
|
|
"epoch": 0.2247427011132115,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997766530167508,
|
|
"loss": 6.0821,
|
|
"mean_token_accuracy": 0.1270790107548237,
|
|
"num_tokens": 4935464.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 6.221409273147583,
|
|
"epoch": 0.22516278092837638,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997753170352627,
|
|
"loss": 6.1649,
|
|
"mean_token_accuracy": 0.12717002481222153,
|
|
"num_tokens": 4944718.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 6.084948205947876,
|
|
"epoch": 0.22558286074354128,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997739770719955,
|
|
"loss": 6.0396,
|
|
"mean_token_accuracy": 0.1332695096731186,
|
|
"num_tokens": 4954223.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 6.003955984115601,
|
|
"epoch": 0.22600294055870615,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499772633126973,
|
|
"loss": 6.0733,
|
|
"mean_token_accuracy": 0.1317312702536583,
|
|
"num_tokens": 4963371.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 6.013844203948975,
|
|
"epoch": 0.22642302037387105,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997712852002192,
|
|
"loss": 5.9358,
|
|
"mean_token_accuracy": 0.14093514010310174,
|
|
"num_tokens": 4972973.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 6.059261226654053,
|
|
"epoch": 0.22684310018903592,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997699332917578,
|
|
"loss": 6.1739,
|
|
"mean_token_accuracy": 0.12389883399009705,
|
|
"num_tokens": 4982808.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 6.180717802047729,
|
|
"epoch": 0.2272631800042008,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997685774016127,
|
|
"loss": 6.0444,
|
|
"mean_token_accuracy": 0.13330344706773758,
|
|
"num_tokens": 4992427.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 6.1143828392028805,
|
|
"epoch": 0.22768325981936569,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000499767217529808,
|
|
"loss": 6.2262,
|
|
"mean_token_accuracy": 0.12522902861237525,
|
|
"num_tokens": 5003562.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 6.120408248901367,
|
|
"epoch": 0.22810333963453056,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997658536763678,
|
|
"loss": 5.9207,
|
|
"mean_token_accuracy": 0.13713482916355133,
|
|
"num_tokens": 5013429.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 6.080751562118531,
|
|
"epoch": 0.22852341944969545,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004997644858413163,
|
|
"loss": 6.046,
|
|
"mean_token_accuracy": 0.13544052764773368,
|
|
"num_tokens": 5022045.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 5.984566640853882,
|
|
"epoch": 0.22894349926486032,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997631140246775,
|
|
"loss": 5.8853,
|
|
"mean_token_accuracy": 0.14113514721393586,
|
|
"num_tokens": 5032260.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 5.9389331340789795,
|
|
"epoch": 0.2293635790800252,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499761738226476,
|
|
"loss": 5.9276,
|
|
"mean_token_accuracy": 0.13583676218986512,
|
|
"num_tokens": 5041688.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 6.007482099533081,
|
|
"epoch": 0.2297836588951901,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499760358446736,
|
|
"loss": 6.0417,
|
|
"mean_token_accuracy": 0.1291549324989319,
|
|
"num_tokens": 5051005.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 6.1208288192749025,
|
|
"epoch": 0.23020373871035496,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499758974685482,
|
|
"loss": 5.9698,
|
|
"mean_token_accuracy": 0.13492617905139923,
|
|
"num_tokens": 5060084.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 6.010481119155884,
|
|
"epoch": 0.23062381852551986,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004997575869427385,
|
|
"loss": 5.9731,
|
|
"mean_token_accuracy": 0.14254927188158034,
|
|
"num_tokens": 5069081.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 6.021266603469849,
|
|
"epoch": 0.23104389834068473,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00049975619521853,
|
|
"loss": 5.9703,
|
|
"mean_token_accuracy": 0.13409337997436524,
|
|
"num_tokens": 5078597.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 5.943169069290161,
|
|
"epoch": 0.2314639781558496,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997547995128814,
|
|
"loss": 6.0084,
|
|
"mean_token_accuracy": 0.13727526888251304,
|
|
"num_tokens": 5087607.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 6.111000204086304,
|
|
"epoch": 0.2318840579710145,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004997533998258171,
|
|
"loss": 6.0123,
|
|
"mean_token_accuracy": 0.1351937808096409,
|
|
"num_tokens": 5097412.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 6.129235696792603,
|
|
"epoch": 0.23230413778617937,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997519961573622,
|
|
"loss": 6.0735,
|
|
"mean_token_accuracy": 0.1282409645617008,
|
|
"num_tokens": 5105817.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 6.1673665046691895,
|
|
"epoch": 0.23272421760134426,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997505885075414,
|
|
"loss": 6.1269,
|
|
"mean_token_accuracy": 0.12907201573252677,
|
|
"num_tokens": 5114958.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 6.069322109222412,
|
|
"epoch": 0.23314429741650913,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997491768763795,
|
|
"loss": 6.0425,
|
|
"mean_token_accuracy": 0.13409897387027742,
|
|
"num_tokens": 5123728.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 6.003434944152832,
|
|
"epoch": 0.23356437723167403,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997477612639018,
|
|
"loss": 6.0871,
|
|
"mean_token_accuracy": 0.12734304070472718,
|
|
"num_tokens": 5134099.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 6.186435317993164,
|
|
"epoch": 0.2339844570468389,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997463416701332,
|
|
"loss": 6.094,
|
|
"mean_token_accuracy": 0.1274227410554886,
|
|
"num_tokens": 5142934.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 6.043578577041626,
|
|
"epoch": 0.23440453686200377,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997449180950989,
|
|
"loss": 5.9298,
|
|
"mean_token_accuracy": 0.1532392293214798,
|
|
"num_tokens": 5151835.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 5.953121995925903,
|
|
"epoch": 0.23482461667716867,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997434905388241,
|
|
"loss": 5.9842,
|
|
"mean_token_accuracy": 0.1413706734776497,
|
|
"num_tokens": 5161136.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 6.0334107875823975,
|
|
"epoch": 0.23524469649233354,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499742059001334,
|
|
"loss": 5.9191,
|
|
"mean_token_accuracy": 0.1378956101834774,
|
|
"num_tokens": 5170741.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 5.991379880905152,
|
|
"epoch": 0.23566477630749844,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997406234826541,
|
|
"loss": 5.9539,
|
|
"mean_token_accuracy": 0.14059103950858115,
|
|
"num_tokens": 5180549.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 5.995284509658814,
|
|
"epoch": 0.2360848561226633,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004997391839828098,
|
|
"loss": 5.9249,
|
|
"mean_token_accuracy": 0.14390118718147277,
|
|
"num_tokens": 5189486.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 6.030531978607177,
|
|
"epoch": 0.23650493593782818,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997377405018266,
|
|
"loss": 6.0032,
|
|
"mean_token_accuracy": 0.13120983093976973,
|
|
"num_tokens": 5198525.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 6.0725666046142575,
|
|
"epoch": 0.23692501575299307,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00049973629303973,
|
|
"loss": 6.0662,
|
|
"mean_token_accuracy": 0.1294946141541004,
|
|
"num_tokens": 5207124.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 5.958557415008545,
|
|
"epoch": 0.23734509556815794,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997348415965457,
|
|
"loss": 5.878,
|
|
"mean_token_accuracy": 0.13335178643465043,
|
|
"num_tokens": 5216529.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 6.007561159133911,
|
|
"epoch": 0.23776517538332284,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997333861722995,
|
|
"loss": 6.0169,
|
|
"mean_token_accuracy": 0.13635273203253745,
|
|
"num_tokens": 5225796.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 6.125902462005615,
|
|
"epoch": 0.2381852551984877,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499731926767017,
|
|
"loss": 6.0359,
|
|
"mean_token_accuracy": 0.1375264048576355,
|
|
"num_tokens": 5233876.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 5.989985036849975,
|
|
"epoch": 0.23860533501365258,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997304633807242,
|
|
"loss": 6.0396,
|
|
"mean_token_accuracy": 0.12682786211371422,
|
|
"num_tokens": 5244782.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 6.019674825668335,
|
|
"epoch": 0.23902541482881748,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997289960134468,
|
|
"loss": 5.9886,
|
|
"mean_token_accuracy": 0.13695719763636588,
|
|
"num_tokens": 5253453.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 6.0026778221130375,
|
|
"epoch": 0.23944549464398235,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997275246652111,
|
|
"loss": 6.0149,
|
|
"mean_token_accuracy": 0.13926383331418038,
|
|
"num_tokens": 5262355.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 5.99656400680542,
|
|
"epoch": 0.23986557445914725,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499726049336043,
|
|
"loss": 5.9374,
|
|
"mean_token_accuracy": 0.13838583379983901,
|
|
"num_tokens": 5271959.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 6.058608770370483,
|
|
"epoch": 0.24028565427431212,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997245700259686,
|
|
"loss": 5.9673,
|
|
"mean_token_accuracy": 0.1403045229613781,
|
|
"num_tokens": 5281393.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 6.061829471588135,
|
|
"epoch": 0.240705734089477,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997230867350141,
|
|
"loss": 6.0878,
|
|
"mean_token_accuracy": 0.1320396728813648,
|
|
"num_tokens": 5290979.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 6.128190040588379,
|
|
"epoch": 0.24112581390464188,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997215994632059,
|
|
"loss": 6.0392,
|
|
"mean_token_accuracy": 0.13521442338824272,
|
|
"num_tokens": 5300263.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 6.065250301361084,
|
|
"epoch": 0.24154589371980675,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997201082105704,
|
|
"loss": 6.0654,
|
|
"mean_token_accuracy": 0.12793515026569366,
|
|
"num_tokens": 5309522.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 6.059223175048828,
|
|
"epoch": 0.24196597353497165,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997186129771338,
|
|
"loss": 6.0625,
|
|
"mean_token_accuracy": 0.13326726630330085,
|
|
"num_tokens": 5319770.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 6.18207311630249,
|
|
"epoch": 0.24238605335013652,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997171137629226,
|
|
"loss": 6.0695,
|
|
"mean_token_accuracy": 0.13562847971916198,
|
|
"num_tokens": 5328400.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 5.968668270111084,
|
|
"epoch": 0.24280613316530142,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004997156105679636,
|
|
"loss": 5.8716,
|
|
"mean_token_accuracy": 0.14514228701591492,
|
|
"num_tokens": 5336338.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 5.89683952331543,
|
|
"epoch": 0.2432262129804663,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997141033922832,
|
|
"loss": 5.9748,
|
|
"mean_token_accuracy": 0.1309155747294426,
|
|
"num_tokens": 5345391.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 6.103964805603027,
|
|
"epoch": 0.24364629279563116,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997125922359081,
|
|
"loss": 6.0044,
|
|
"mean_token_accuracy": 0.12651756703853606,
|
|
"num_tokens": 5354709.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 6.039173555374146,
|
|
"epoch": 0.24406637261079606,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997110770988652,
|
|
"loss": 5.9187,
|
|
"mean_token_accuracy": 0.13533097133040428,
|
|
"num_tokens": 5363738.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 6.009365177154541,
|
|
"epoch": 0.24448645242596093,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004997095579811813,
|
|
"loss": 6.0492,
|
|
"mean_token_accuracy": 0.13356854170560836,
|
|
"num_tokens": 5373583.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 6.10346941947937,
|
|
"epoch": 0.24490653224112582,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997080348828833,
|
|
"loss": 6.0964,
|
|
"mean_token_accuracy": 0.1329493686556816,
|
|
"num_tokens": 5383486.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 6.022554492950439,
|
|
"epoch": 0.2453266120562907,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997065078039981,
|
|
"loss": 5.995,
|
|
"mean_token_accuracy": 0.1254143126308918,
|
|
"num_tokens": 5391974.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 6.089977025985718,
|
|
"epoch": 0.24574669187145556,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997049767445529,
|
|
"loss": 6.0288,
|
|
"mean_token_accuracy": 0.12984034791588783,
|
|
"num_tokens": 5400882.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 6.110510158538818,
|
|
"epoch": 0.24616677168662046,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997034417045746,
|
|
"loss": 5.9927,
|
|
"mean_token_accuracy": 0.1267140880227089,
|
|
"num_tokens": 5410538.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 5.971307563781738,
|
|
"epoch": 0.24658685150178533,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997019026840907,
|
|
"loss": 5.8743,
|
|
"mean_token_accuracy": 0.13612414821982383,
|
|
"num_tokens": 5419406.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 5.88221755027771,
|
|
"epoch": 0.24700693131695023,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997003596831282,
|
|
"loss": 5.9978,
|
|
"mean_token_accuracy": 0.13463943675160409,
|
|
"num_tokens": 5428817.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 6.0984635829925535,
|
|
"epoch": 0.2474270111321151,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996988127017145,
|
|
"loss": 6.0253,
|
|
"mean_token_accuracy": 0.13181837573647498,
|
|
"num_tokens": 5438277.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 6.0544061183929445,
|
|
"epoch": 0.24784709094728,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004996972617398772,
|
|
"loss": 6.042,
|
|
"mean_token_accuracy": 0.13205936923623085,
|
|
"num_tokens": 5447440.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 6.0680958271026615,
|
|
"epoch": 0.24826717076244487,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996957067976435,
|
|
"loss": 5.9541,
|
|
"mean_token_accuracy": 0.1357963502407074,
|
|
"num_tokens": 5455988.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 6.0058001518249515,
|
|
"epoch": 0.24868725057760974,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004996941478750411,
|
|
"loss": 5.9769,
|
|
"mean_token_accuracy": 0.1373401865363121,
|
|
"num_tokens": 5464996.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 6.083559465408325,
|
|
"epoch": 0.24910733039277463,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996925849720975,
|
|
"loss": 6.1025,
|
|
"mean_token_accuracy": 0.12863337025046348,
|
|
"num_tokens": 5474174.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 6.146986627578736,
|
|
"epoch": 0.2495274102079395,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004996910180888405,
|
|
"loss": 5.9994,
|
|
"mean_token_accuracy": 0.13324794694781303,
|
|
"num_tokens": 5482838.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 6.005090427398682,
|
|
"epoch": 0.2499474900231044,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996894472252977,
|
|
"loss": 6.0195,
|
|
"mean_token_accuracy": 0.13370491713285446,
|
|
"num_tokens": 5491616.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 5.99453763961792,
|
|
"epoch": 0.25036756983826924,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996878723814973,
|
|
"loss": 5.9972,
|
|
"mean_token_accuracy": 0.12933446019887923,
|
|
"num_tokens": 5500942.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 6.035016107559204,
|
|
"epoch": 0.25078764965343414,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996862935574667,
|
|
"loss": 5.9539,
|
|
"mean_token_accuracy": 0.13152176290750503,
|
|
"num_tokens": 5510078.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 5.9494434833526615,
|
|
"epoch": 0.25120772946859904,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996847107532342,
|
|
"loss": 5.9763,
|
|
"mean_token_accuracy": 0.13343006893992423,
|
|
"num_tokens": 5518924.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 6.115957880020142,
|
|
"epoch": 0.25162780928376394,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996831239688277,
|
|
"loss": 5.9896,
|
|
"mean_token_accuracy": 0.12950923070311546,
|
|
"num_tokens": 5527385.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 5.96525821685791,
|
|
"epoch": 0.2520478890989288,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996815332042754,
|
|
"loss": 5.8456,
|
|
"mean_token_accuracy": 0.14307771176099776,
|
|
"num_tokens": 5536781.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.2520478890989288,
|
|
"eval_entropy": 5.826104599310177,
|
|
"eval_loss": 6.01594352722168,
|
|
"eval_mean_token_accuracy": 0.13980411247313787,
|
|
"eval_num_tokens": 5536781.0,
|
|
"eval_runtime": 27.3461,
|
|
"eval_samples_per_second": 1366.412,
|
|
"eval_steps_per_second": 170.811,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 6.008435201644898,
|
|
"epoch": 0.2524679689140937,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996799384596054,
|
|
"loss": 6.0261,
|
|
"mean_token_accuracy": 0.1376914620399475,
|
|
"num_tokens": 5545893.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 6.02188720703125,
|
|
"epoch": 0.2528880487292586,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996783397348461,
|
|
"loss": 5.9762,
|
|
"mean_token_accuracy": 0.1329520359635353,
|
|
"num_tokens": 5555818.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 6.045353794097901,
|
|
"epoch": 0.2533081285444234,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996767370300256,
|
|
"loss": 5.9502,
|
|
"mean_token_accuracy": 0.13486573100090027,
|
|
"num_tokens": 5565331.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 6.056732606887818,
|
|
"epoch": 0.2537282083595883,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996751303451724,
|
|
"loss": 5.9577,
|
|
"mean_token_accuracy": 0.13709068223834037,
|
|
"num_tokens": 5574003.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 5.993344259262085,
|
|
"epoch": 0.2541482881747532,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996735196803149,
|
|
"loss": 5.8551,
|
|
"mean_token_accuracy": 0.1428755633533001,
|
|
"num_tokens": 5582517.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 5.977582693099976,
|
|
"epoch": 0.2545683679899181,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996719050354818,
|
|
"loss": 6.0686,
|
|
"mean_token_accuracy": 0.13471986055374147,
|
|
"num_tokens": 5591952.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 6.0037376403808596,
|
|
"epoch": 0.25498844780508295,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996702864107015,
|
|
"loss": 5.9609,
|
|
"mean_token_accuracy": 0.1396644115447998,
|
|
"num_tokens": 5601460.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 6.176335668563842,
|
|
"epoch": 0.25540852762024785,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996686638060028,
|
|
"loss": 6.0902,
|
|
"mean_token_accuracy": 0.1306911051273346,
|
|
"num_tokens": 5610776.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 5.970763540267944,
|
|
"epoch": 0.25582860743541275,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996670372214144,
|
|
"loss": 5.9871,
|
|
"mean_token_accuracy": 0.13826777338981627,
|
|
"num_tokens": 5619627.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 5.914526128768921,
|
|
"epoch": 0.2562486872505776,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996654066569651,
|
|
"loss": 5.8622,
|
|
"mean_token_accuracy": 0.14179132953286172,
|
|
"num_tokens": 5628969.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 5.981579828262329,
|
|
"epoch": 0.2566687670657425,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996637721126839,
|
|
"loss": 5.9332,
|
|
"mean_token_accuracy": 0.13520999103784562,
|
|
"num_tokens": 5638629.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 6.005596733093261,
|
|
"epoch": 0.2570888468809074,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004996621335885996,
|
|
"loss": 5.9991,
|
|
"mean_token_accuracy": 0.13599340468645096,
|
|
"num_tokens": 5647571.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 6.013420534133911,
|
|
"epoch": 0.2575089266960722,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004996604910847413,
|
|
"loss": 5.916,
|
|
"mean_token_accuracy": 0.14960622489452363,
|
|
"num_tokens": 5656709.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 6.038319206237793,
|
|
"epoch": 0.2579290065112371,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499658844601138,
|
|
"loss": 6.1017,
|
|
"mean_token_accuracy": 0.13502436354756356,
|
|
"num_tokens": 5665714.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 6.07736644744873,
|
|
"epoch": 0.258349086326402,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499657194137819,
|
|
"loss": 6.0546,
|
|
"mean_token_accuracy": 0.13854038044810296,
|
|
"num_tokens": 5675854.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 6.074629402160644,
|
|
"epoch": 0.2587691661415669,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996555396948136,
|
|
"loss": 5.8721,
|
|
"mean_token_accuracy": 0.13419756293296814,
|
|
"num_tokens": 5685690.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 5.940470170974732,
|
|
"epoch": 0.25918924595673176,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996538812721509,
|
|
"loss": 5.9341,
|
|
"mean_token_accuracy": 0.14152218475937844,
|
|
"num_tokens": 5695766.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 6.018071937561035,
|
|
"epoch": 0.25960932577189666,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004996522188698603,
|
|
"loss": 5.9909,
|
|
"mean_token_accuracy": 0.13503170683979987,
|
|
"num_tokens": 5704365.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 6.13015513420105,
|
|
"epoch": 0.26002940558706156,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004996505524879714,
|
|
"loss": 6.0965,
|
|
"mean_token_accuracy": 0.13045159131288528,
|
|
"num_tokens": 5713345.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 6.053025817871093,
|
|
"epoch": 0.2604494854022264,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996488821265137,
|
|
"loss": 5.8921,
|
|
"mean_token_accuracy": 0.14050639048218727,
|
|
"num_tokens": 5722907.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 5.928135585784912,
|
|
"epoch": 0.2608695652173913,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996472077855166,
|
|
"loss": 5.9387,
|
|
"mean_token_accuracy": 0.13793488591909409,
|
|
"num_tokens": 5731589.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 5.923902750015259,
|
|
"epoch": 0.2612896450325562,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00049964552946501,
|
|
"loss": 5.9237,
|
|
"mean_token_accuracy": 0.1389499545097351,
|
|
"num_tokens": 5739922.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 5.905591726303101,
|
|
"epoch": 0.2617097248477211,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996438471650235,
|
|
"loss": 5.8397,
|
|
"mean_token_accuracy": 0.145526784658432,
|
|
"num_tokens": 5749206.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 6.01796875,
|
|
"epoch": 0.26212980466288593,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996421608855869,
|
|
"loss": 5.8992,
|
|
"mean_token_accuracy": 0.1419477328658104,
|
|
"num_tokens": 5758803.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 5.962277746200561,
|
|
"epoch": 0.26254988447805083,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996404706267301,
|
|
"loss": 5.9991,
|
|
"mean_token_accuracy": 0.1301351211965084,
|
|
"num_tokens": 5768368.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 5.935734415054322,
|
|
"epoch": 0.26296996429321573,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499638776388483,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14718177318572997,
|
|
"num_tokens": 5776707.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 5.992966365814209,
|
|
"epoch": 0.26339004410838057,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004996370781708757,
|
|
"loss": 6.0208,
|
|
"mean_token_accuracy": 0.13097626715898514,
|
|
"num_tokens": 5787037.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 6.120069789886474,
|
|
"epoch": 0.26381012392354547,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004996353759739382,
|
|
"loss": 5.9819,
|
|
"mean_token_accuracy": 0.140574112534523,
|
|
"num_tokens": 5796630.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 5.9368353366851805,
|
|
"epoch": 0.26423020373871037,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004996336697977007,
|
|
"loss": 5.978,
|
|
"mean_token_accuracy": 0.13346768617630006,
|
|
"num_tokens": 5806402.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 5.97723422050476,
|
|
"epoch": 0.2646502835538752,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004996319596421933,
|
|
"loss": 5.9278,
|
|
"mean_token_accuracy": 0.13734676092863082,
|
|
"num_tokens": 5815742.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 5.945355033874511,
|
|
"epoch": 0.2650703633690401,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996302455074466,
|
|
"loss": 5.9322,
|
|
"mean_token_accuracy": 0.1382609039545059,
|
|
"num_tokens": 5824915.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 6.0514014720916744,
|
|
"epoch": 0.265490443184205,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004996285273934906,
|
|
"loss": 5.9852,
|
|
"mean_token_accuracy": 0.13715496361255647,
|
|
"num_tokens": 5834978.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 6.052202987670898,
|
|
"epoch": 0.2659105229993699,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499626805300356,
|
|
"loss": 6.1228,
|
|
"mean_token_accuracy": 0.1326017878949642,
|
|
"num_tokens": 5845684.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 6.146022653579712,
|
|
"epoch": 0.26633060281453474,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004996250792280732,
|
|
"loss": 5.9964,
|
|
"mean_token_accuracy": 0.13485243916511536,
|
|
"num_tokens": 5854905.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 6.040951061248779,
|
|
"epoch": 0.26675068262969964,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004996233491766727,
|
|
"loss": 6.0164,
|
|
"mean_token_accuracy": 0.1350037656724453,
|
|
"num_tokens": 5863654.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 6.058253955841065,
|
|
"epoch": 0.26717076244486454,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004996216151461854,
|
|
"loss": 6.0152,
|
|
"mean_token_accuracy": 0.13996267989277839,
|
|
"num_tokens": 5872442.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 6.012804937362671,
|
|
"epoch": 0.2675908422600294,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004996198771366417,
|
|
"loss": 5.9378,
|
|
"mean_token_accuracy": 0.13716716319322586,
|
|
"num_tokens": 5882372.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 5.8219091415405275,
|
|
"epoch": 0.2680109220751943,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996181351480726,
|
|
"loss": 5.7487,
|
|
"mean_token_accuracy": 0.14560527056455613,
|
|
"num_tokens": 5891113.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 5.941916608810425,
|
|
"epoch": 0.2684310018903592,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996163891805089,
|
|
"loss": 5.9892,
|
|
"mean_token_accuracy": 0.14109294563531877,
|
|
"num_tokens": 5899582.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 6.037355852127075,
|
|
"epoch": 0.2688510817055241,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996146392339815,
|
|
"loss": 5.9353,
|
|
"mean_token_accuracy": 0.1392637461423874,
|
|
"num_tokens": 5908938.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 5.9513650894165036,
|
|
"epoch": 0.2692711615206889,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996128853085215,
|
|
"loss": 5.9041,
|
|
"mean_token_accuracy": 0.13895752876996995,
|
|
"num_tokens": 5918055.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 5.997664451599121,
|
|
"epoch": 0.2696912413358538,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996111274041598,
|
|
"loss": 5.8986,
|
|
"mean_token_accuracy": 0.13369553461670874,
|
|
"num_tokens": 5926744.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 5.959716939926148,
|
|
"epoch": 0.2701113211510187,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996093655209277,
|
|
"loss": 5.9958,
|
|
"mean_token_accuracy": 0.1349453993141651,
|
|
"num_tokens": 5936521.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 6.088764905929565,
|
|
"epoch": 0.27053140096618356,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004996075996588563,
|
|
"loss": 6.0616,
|
|
"mean_token_accuracy": 0.13318859413266182,
|
|
"num_tokens": 5945010.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 6.052014112472534,
|
|
"epoch": 0.27095148078134845,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499605829817977,
|
|
"loss": 5.9638,
|
|
"mean_token_accuracy": 0.14223103746771812,
|
|
"num_tokens": 5953766.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 5.979779624938965,
|
|
"epoch": 0.27137156059651335,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499604055998321,
|
|
"loss": 5.875,
|
|
"mean_token_accuracy": 0.13957174718379975,
|
|
"num_tokens": 5962168.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 5.906911420822143,
|
|
"epoch": 0.2717916404116782,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996022781999198,
|
|
"loss": 5.9063,
|
|
"mean_token_accuracy": 0.13852998465299607,
|
|
"num_tokens": 5971627.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 5.9631248950958256,
|
|
"epoch": 0.2722117202268431,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499600496422805,
|
|
"loss": 5.9925,
|
|
"mean_token_accuracy": 0.13308593779802322,
|
|
"num_tokens": 5981775.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 5.993693208694458,
|
|
"epoch": 0.272631800042008,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499598710667008,
|
|
"loss": 5.9061,
|
|
"mean_token_accuracy": 0.1379516489803791,
|
|
"num_tokens": 5991097.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 5.984791469573975,
|
|
"epoch": 0.2730518798571729,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004995969209325604,
|
|
"loss": 5.9693,
|
|
"mean_token_accuracy": 0.13060558065772057,
|
|
"num_tokens": 5999517.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 5.930228567123413,
|
|
"epoch": 0.2734719596723377,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004995951272194941,
|
|
"loss": 5.9479,
|
|
"mean_token_accuracy": 0.12969653084874153,
|
|
"num_tokens": 6008545.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 6.119350004196167,
|
|
"epoch": 0.2738920394875026,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995933295278407,
|
|
"loss": 5.9365,
|
|
"mean_token_accuracy": 0.1350548431277275,
|
|
"num_tokens": 6017366.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 5.9179764747619625,
|
|
"epoch": 0.2743121193026675,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004995915278576321,
|
|
"loss": 5.8875,
|
|
"mean_token_accuracy": 0.14413413256406785,
|
|
"num_tokens": 6025597.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 5.981735897064209,
|
|
"epoch": 0.27473219911783237,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995897222089004,
|
|
"loss": 5.9867,
|
|
"mean_token_accuracy": 0.13929954469203948,
|
|
"num_tokens": 6034239.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 6.11962890625,
|
|
"epoch": 0.27515227893299726,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995879125816772,
|
|
"loss": 6.0068,
|
|
"mean_token_accuracy": 0.13686064183712005,
|
|
"num_tokens": 6043837.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 5.9640697002410885,
|
|
"epoch": 0.27557235874816216,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995860989759949,
|
|
"loss": 5.956,
|
|
"mean_token_accuracy": 0.1416999839246273,
|
|
"num_tokens": 6053217.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 6.0521222114562985,
|
|
"epoch": 0.27599243856332706,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995842813918855,
|
|
"loss": 5.9551,
|
|
"mean_token_accuracy": 0.13722361102700234,
|
|
"num_tokens": 6061553.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 5.9697545051574705,
|
|
"epoch": 0.2764125183784919,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995824598293812,
|
|
"loss": 5.8601,
|
|
"mean_token_accuracy": 0.14069184213876723,
|
|
"num_tokens": 6070080.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 5.995730686187744,
|
|
"epoch": 0.2768325981936568,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995806342885142,
|
|
"loss": 5.9852,
|
|
"mean_token_accuracy": 0.14142092764377595,
|
|
"num_tokens": 6078438.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 6.019344282150269,
|
|
"epoch": 0.2772526780088217,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499578804769317,
|
|
"loss": 5.9771,
|
|
"mean_token_accuracy": 0.13406604304909706,
|
|
"num_tokens": 6087794.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 6.085688066482544,
|
|
"epoch": 0.27767275782398654,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995769712718218,
|
|
"loss": 6.0065,
|
|
"mean_token_accuracy": 0.13597604855895043,
|
|
"num_tokens": 6096709.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 5.9711473941802975,
|
|
"epoch": 0.27809283763915144,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004995751337960613,
|
|
"loss": 5.9269,
|
|
"mean_token_accuracy": 0.13786234930157662,
|
|
"num_tokens": 6105866.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 6.074538946151733,
|
|
"epoch": 0.27851291745431633,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995732923420679,
|
|
"loss": 5.8813,
|
|
"mean_token_accuracy": 0.13884977921843528,
|
|
"num_tokens": 6114882.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 5.857705545425415,
|
|
"epoch": 0.2789329972694812,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004995714469098743,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.13618046417832375,
|
|
"num_tokens": 6123978.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 5.886438226699829,
|
|
"epoch": 0.2793530770846461,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499569597499513,
|
|
"loss": 5.9946,
|
|
"mean_token_accuracy": 0.1375075623393059,
|
|
"num_tokens": 6133246.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 5.993762636184693,
|
|
"epoch": 0.27977315689981097,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995677441110172,
|
|
"loss": 5.8559,
|
|
"mean_token_accuracy": 0.14045721143484116,
|
|
"num_tokens": 6142865.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 6.025714874267578,
|
|
"epoch": 0.28019323671497587,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995658867444192,
|
|
"loss": 5.9512,
|
|
"mean_token_accuracy": 0.13522876128554345,
|
|
"num_tokens": 6152492.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 5.981087923049927,
|
|
"epoch": 0.2806133165301407,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004995640253997523,
|
|
"loss": 5.959,
|
|
"mean_token_accuracy": 0.1329936422407627,
|
|
"num_tokens": 6161953.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 5.841523504257202,
|
|
"epoch": 0.2810333963453056,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995621600770492,
|
|
"loss": 5.8129,
|
|
"mean_token_accuracy": 0.1412846788764,
|
|
"num_tokens": 6171467.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 5.90531325340271,
|
|
"epoch": 0.2814534761604705,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995602907763431,
|
|
"loss": 5.8859,
|
|
"mean_token_accuracy": 0.13736898675560952,
|
|
"num_tokens": 6180646.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 5.981820106506348,
|
|
"epoch": 0.28187355597563535,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004995584174976672,
|
|
"loss": 5.9116,
|
|
"mean_token_accuracy": 0.13150710314512254,
|
|
"num_tokens": 6189832.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 5.980225324630737,
|
|
"epoch": 0.28229363579080025,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995565402410544,
|
|
"loss": 5.7994,
|
|
"mean_token_accuracy": 0.14472294151782988,
|
|
"num_tokens": 6198339.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 5.924914312362671,
|
|
"epoch": 0.28271371560596514,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004995546590065383,
|
|
"loss": 5.8935,
|
|
"mean_token_accuracy": 0.1394026793539524,
|
|
"num_tokens": 6207564.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 5.931164789199829,
|
|
"epoch": 0.28313379542113004,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004995527737941518,
|
|
"loss": 5.9781,
|
|
"mean_token_accuracy": 0.13914698138833045,
|
|
"num_tokens": 6216056.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 5.968091154098511,
|
|
"epoch": 0.2835538752362949,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995508846039287,
|
|
"loss": 5.9114,
|
|
"mean_token_accuracy": 0.13818917274475098,
|
|
"num_tokens": 6225573.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 6.069493198394776,
|
|
"epoch": 0.2839739550514598,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004995489914359023,
|
|
"loss": 6.0417,
|
|
"mean_token_accuracy": 0.13078732788562775,
|
|
"num_tokens": 6235057.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 6.030756092071533,
|
|
"epoch": 0.2843940348666247,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004995470942901061,
|
|
"loss": 5.9557,
|
|
"mean_token_accuracy": 0.13645285964012147,
|
|
"num_tokens": 6244164.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 6.068174362182617,
|
|
"epoch": 0.2848141146817895,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004995451931665738,
|
|
"loss": 5.9588,
|
|
"mean_token_accuracy": 0.13424528315663337,
|
|
"num_tokens": 6253095.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 5.918725109100341,
|
|
"epoch": 0.2852341944969544,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499543288065339,
|
|
"loss": 5.9038,
|
|
"mean_token_accuracy": 0.13533290028572081,
|
|
"num_tokens": 6261134.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 5.926444101333618,
|
|
"epoch": 0.2856542743121193,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004995413789864354,
|
|
"loss": 5.9066,
|
|
"mean_token_accuracy": 0.1413659855723381,
|
|
"num_tokens": 6270384.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 5.974505090713501,
|
|
"epoch": 0.28607435412728416,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995394659298971,
|
|
"loss": 5.842,
|
|
"mean_token_accuracy": 0.14783402383327485,
|
|
"num_tokens": 6279702.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 5.924916839599609,
|
|
"epoch": 0.28649443394244906,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004995375488957576,
|
|
"loss": 5.8871,
|
|
"mean_token_accuracy": 0.1403558671474457,
|
|
"num_tokens": 6288297.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 5.979348230361938,
|
|
"epoch": 0.28691451375761395,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499535627884051,
|
|
"loss": 5.983,
|
|
"mean_token_accuracy": 0.12937102988362312,
|
|
"num_tokens": 6297288.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 6.12882170677185,
|
|
"epoch": 0.28733459357277885,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995337028948115,
|
|
"loss": 6.0094,
|
|
"mean_token_accuracy": 0.13142260611057283,
|
|
"num_tokens": 6306719.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 5.93622145652771,
|
|
"epoch": 0.2877546733879437,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995317739280731,
|
|
"loss": 5.8256,
|
|
"mean_token_accuracy": 0.14748729318380355,
|
|
"num_tokens": 6316639.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 5.951609373092651,
|
|
"epoch": 0.2881747532031086,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004995298409838699,
|
|
"loss": 5.9555,
|
|
"mean_token_accuracy": 0.1391440898180008,
|
|
"num_tokens": 6326879.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 5.9383097171783445,
|
|
"epoch": 0.2885948330182735,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499527904062236,
|
|
"loss": 5.8671,
|
|
"mean_token_accuracy": 0.139659284055233,
|
|
"num_tokens": 6335729.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 5.971969127655029,
|
|
"epoch": 0.28901491283343833,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995259631632061,
|
|
"loss": 5.9185,
|
|
"mean_token_accuracy": 0.1310904636979103,
|
|
"num_tokens": 6345154.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 5.977327823638916,
|
|
"epoch": 0.28943499264860323,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995240182868143,
|
|
"loss": 5.8858,
|
|
"mean_token_accuracy": 0.14063168689608574,
|
|
"num_tokens": 6354309.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 5.8834575653076175,
|
|
"epoch": 0.2898550724637681,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995220694330951,
|
|
"loss": 5.8586,
|
|
"mean_token_accuracy": 0.14082162082195282,
|
|
"num_tokens": 6363389.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 5.92822527885437,
|
|
"epoch": 0.290275152278933,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995201166020832,
|
|
"loss": 5.9065,
|
|
"mean_token_accuracy": 0.13562884032726288,
|
|
"num_tokens": 6372475.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 6.024522161483764,
|
|
"epoch": 0.29069523209409787,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499518159793813,
|
|
"loss": 5.8677,
|
|
"mean_token_accuracy": 0.14305904358625413,
|
|
"num_tokens": 6380906.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 5.884508085250855,
|
|
"epoch": 0.29111531190926276,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499516199008319,
|
|
"loss": 5.8659,
|
|
"mean_token_accuracy": 0.14293192625045775,
|
|
"num_tokens": 6390085.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 6.008301162719727,
|
|
"epoch": 0.29153539172442766,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995142342456364,
|
|
"loss": 5.9391,
|
|
"mean_token_accuracy": 0.13623592853546143,
|
|
"num_tokens": 6399441.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 6.066584539413452,
|
|
"epoch": 0.2919554715395925,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995122655057997,
|
|
"loss": 6.0208,
|
|
"mean_token_accuracy": 0.13953343629837037,
|
|
"num_tokens": 6408995.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 5.888063764572143,
|
|
"epoch": 0.2923755513547574,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995102927888437,
|
|
"loss": 5.7722,
|
|
"mean_token_accuracy": 0.1459358014166355,
|
|
"num_tokens": 6418080.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 5.952468156814575,
|
|
"epoch": 0.2927956311699223,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004995083160948036,
|
|
"loss": 5.9318,
|
|
"mean_token_accuracy": 0.14023924767971038,
|
|
"num_tokens": 6426732.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 5.971553039550781,
|
|
"epoch": 0.29321571098508714,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004995063354237141,
|
|
"loss": 5.9538,
|
|
"mean_token_accuracy": 0.14043337404727935,
|
|
"num_tokens": 6435957.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 5.94589900970459,
|
|
"epoch": 0.29363579080025204,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004995043507756107,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.133124540746212,
|
|
"num_tokens": 6445642.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 5.974902820587158,
|
|
"epoch": 0.29405587061541694,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004995023621505282,
|
|
"loss": 5.9363,
|
|
"mean_token_accuracy": 0.1418766610324383,
|
|
"num_tokens": 6454664.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 5.940143728256226,
|
|
"epoch": 0.29447595043058183,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000499500369548502,
|
|
"loss": 5.8583,
|
|
"mean_token_accuracy": 0.1379205584526062,
|
|
"num_tokens": 6463224.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 6.120481824874878,
|
|
"epoch": 0.2948960302457467,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004994983729695674,
|
|
"loss": 6.0926,
|
|
"mean_token_accuracy": 0.1296972803771496,
|
|
"num_tokens": 6473112.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 5.980841064453125,
|
|
"epoch": 0.2953161100609116,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004994963724137595,
|
|
"loss": 5.9214,
|
|
"mean_token_accuracy": 0.1389226034283638,
|
|
"num_tokens": 6482062.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 5.932737588882446,
|
|
"epoch": 0.29573618987607647,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004994943678811142,
|
|
"loss": 5.9004,
|
|
"mean_token_accuracy": 0.13374803215265274,
|
|
"num_tokens": 6490568.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 5.997820091247559,
|
|
"epoch": 0.2961562696912413,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994923593716667,
|
|
"loss": 5.963,
|
|
"mean_token_accuracy": 0.14052257165312768,
|
|
"num_tokens": 6500815.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 5.916243839263916,
|
|
"epoch": 0.2965763495064062,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004994903468854527,
|
|
"loss": 5.8376,
|
|
"mean_token_accuracy": 0.14926647543907165,
|
|
"num_tokens": 6509529.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 5.922206735610962,
|
|
"epoch": 0.2969964293215711,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994883304225077,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.13852014467120172,
|
|
"num_tokens": 6517934.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 5.9876025199890135,
|
|
"epoch": 0.297416509136736,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994863099828675,
|
|
"loss": 5.8695,
|
|
"mean_token_accuracy": 0.14087166935205458,
|
|
"num_tokens": 6526098.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 5.935700082778931,
|
|
"epoch": 0.29783658895190085,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499484285566568,
|
|
"loss": 5.906,
|
|
"mean_token_accuracy": 0.13566448390483857,
|
|
"num_tokens": 6535831.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 5.939550399780273,
|
|
"epoch": 0.29825666876706575,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004994822571736449,
|
|
"loss": 5.8255,
|
|
"mean_token_accuracy": 0.13489115089178086,
|
|
"num_tokens": 6545704.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 5.947116851806641,
|
|
"epoch": 0.29867674858223064,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994802248041342,
|
|
"loss": 5.8548,
|
|
"mean_token_accuracy": 0.14142827019095422,
|
|
"num_tokens": 6554423.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 5.969081258773803,
|
|
"epoch": 0.2990968283973955,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000499478188458072,
|
|
"loss": 5.9073,
|
|
"mean_token_accuracy": 0.13533755540847778,
|
|
"num_tokens": 6563989.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 5.9689305305480955,
|
|
"epoch": 0.2995169082125604,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004994761481354943,
|
|
"loss": 6.0328,
|
|
"mean_token_accuracy": 0.13800237625837325,
|
|
"num_tokens": 6572745.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 6.133339929580688,
|
|
"epoch": 0.2999369880277253,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004994741038364371,
|
|
"loss": 6.0333,
|
|
"mean_token_accuracy": 0.13616435453295708,
|
|
"num_tokens": 6581723.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 5.896167135238647,
|
|
"epoch": 0.3003570678428901,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004994720555609369,
|
|
"loss": 5.7604,
|
|
"mean_token_accuracy": 0.1434899814426899,
|
|
"num_tokens": 6590342.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 5.878182983398437,
|
|
"epoch": 0.300777147658055,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004994700033090297,
|
|
"loss": 5.8344,
|
|
"mean_token_accuracy": 0.14836035221815108,
|
|
"num_tokens": 6599206.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 6.036917591094971,
|
|
"epoch": 0.3011972274732199,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000499467947080752,
|
|
"loss": 6.1289,
|
|
"mean_token_accuracy": 0.13054108917713164,
|
|
"num_tokens": 6608947.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 6.017320966720581,
|
|
"epoch": 0.3016173072883848,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004994658868761402,
|
|
"loss": 5.9128,
|
|
"mean_token_accuracy": 0.14748418629169463,
|
|
"num_tokens": 6618378.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 5.987727546691895,
|
|
"epoch": 0.30203738710354966,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994638226952307,
|
|
"loss": 5.9681,
|
|
"mean_token_accuracy": 0.13054394274950026,
|
|
"num_tokens": 6627527.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 5.996758890151978,
|
|
"epoch": 0.30245746691871456,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004994617545380604,
|
|
"loss": 5.8919,
|
|
"mean_token_accuracy": 0.13826094195246696,
|
|
"num_tokens": 6636964.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 5.905787420272827,
|
|
"epoch": 0.30287754673387945,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004994596824046656,
|
|
"loss": 5.8569,
|
|
"mean_token_accuracy": 0.141887067258358,
|
|
"num_tokens": 6646074.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 5.99219708442688,
|
|
"epoch": 0.3032976265490443,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000499457606295083,
|
|
"loss": 5.9311,
|
|
"mean_token_accuracy": 0.13836071118712426,
|
|
"num_tokens": 6655027.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 5.7845015048980715,
|
|
"epoch": 0.3037177063642092,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004994555262093495,
|
|
"loss": 5.713,
|
|
"mean_token_accuracy": 0.15755455046892167,
|
|
"num_tokens": 6663747.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 6.036468362808227,
|
|
"epoch": 0.3041377861793741,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000499453442147502,
|
|
"loss": 6.0392,
|
|
"mean_token_accuracy": 0.13115543723106385,
|
|
"num_tokens": 6672922.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 5.979010963439942,
|
|
"epoch": 0.304557865994539,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004994513541095773,
|
|
"loss": 5.8654,
|
|
"mean_token_accuracy": 0.14586904942989348,
|
|
"num_tokens": 6682233.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 5.928103733062744,
|
|
"epoch": 0.30497794580970383,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004994492620956126,
|
|
"loss": 5.9125,
|
|
"mean_token_accuracy": 0.14258120208978653,
|
|
"num_tokens": 6691593.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 5.953917217254639,
|
|
"epoch": 0.30539802562486873,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994471661056445,
|
|
"loss": 5.9125,
|
|
"mean_token_accuracy": 0.14142323583364486,
|
|
"num_tokens": 6701318.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 5.986124277114868,
|
|
"epoch": 0.3058181054400336,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994450661397106,
|
|
"loss": 5.9176,
|
|
"mean_token_accuracy": 0.14466760009527208,
|
|
"num_tokens": 6710059.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 6.110535717010498,
|
|
"epoch": 0.30623818525519847,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499442962197848,
|
|
"loss": 6.0091,
|
|
"mean_token_accuracy": 0.1349786825478077,
|
|
"num_tokens": 6719811.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 5.885643482208252,
|
|
"epoch": 0.30665826507036337,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994408542800937,
|
|
"loss": 5.8848,
|
|
"mean_token_accuracy": 0.13900379538536073,
|
|
"num_tokens": 6728789.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 5.929373550415039,
|
|
"epoch": 0.30707834488552826,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004994387423864855,
|
|
"loss": 5.8632,
|
|
"mean_token_accuracy": 0.1396006353199482,
|
|
"num_tokens": 6737706.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 5.928421974182129,
|
|
"epoch": 0.3074984247006931,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994366265170603,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.1530800625681877,
|
|
"num_tokens": 6746861.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 6.01959867477417,
|
|
"epoch": 0.307918504515858,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994345066718558,
|
|
"loss": 6.0207,
|
|
"mean_token_accuracy": 0.13322951793670654,
|
|
"num_tokens": 6755242.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 6.026466798782349,
|
|
"epoch": 0.3083385843310229,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994323828509098,
|
|
"loss": 5.954,
|
|
"mean_token_accuracy": 0.13347591310739518,
|
|
"num_tokens": 6764549.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 5.915293598175049,
|
|
"epoch": 0.3087586641461878,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004994302550542596,
|
|
"loss": 5.9418,
|
|
"mean_token_accuracy": 0.14316236823797227,
|
|
"num_tokens": 6774123.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 5.850841808319092,
|
|
"epoch": 0.30917874396135264,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000499428123281943,
|
|
"loss": 5.7122,
|
|
"mean_token_accuracy": 0.1474112629890442,
|
|
"num_tokens": 6782922.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 5.9184730052948,
|
|
"epoch": 0.30959882377651754,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994259875339978,
|
|
"loss": 5.9611,
|
|
"mean_token_accuracy": 0.13746373876929283,
|
|
"num_tokens": 6792042.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 6.05865330696106,
|
|
"epoch": 0.31001890359168244,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004994238478104617,
|
|
"loss": 5.9598,
|
|
"mean_token_accuracy": 0.1366279661655426,
|
|
"num_tokens": 6800994.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 5.93690128326416,
|
|
"epoch": 0.3104389834068473,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994217041113727,
|
|
"loss": 5.8868,
|
|
"mean_token_accuracy": 0.14316150173544884,
|
|
"num_tokens": 6809938.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 6.014241790771484,
|
|
"epoch": 0.3108590632220122,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994195564367688,
|
|
"loss": 6.0213,
|
|
"mean_token_accuracy": 0.13116879239678383,
|
|
"num_tokens": 6820289.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 6.002475690841675,
|
|
"epoch": 0.3112791430371771,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994174047866882,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14203700423240662,
|
|
"num_tokens": 6830068.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 5.788861274719238,
|
|
"epoch": 0.3116992228523419,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994152491611686,
|
|
"loss": 5.8813,
|
|
"mean_token_accuracy": 0.13960717990994453,
|
|
"num_tokens": 6838591.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 5.89765567779541,
|
|
"epoch": 0.3121193026675068,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994130895602485,
|
|
"loss": 5.8505,
|
|
"mean_token_accuracy": 0.13729089125990868,
|
|
"num_tokens": 6847796.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 6.010899591445923,
|
|
"epoch": 0.3125393824826717,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499410925983966,
|
|
"loss": 5.941,
|
|
"mean_token_accuracy": 0.13994767293334007,
|
|
"num_tokens": 6856585.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 5.889919090270996,
|
|
"epoch": 0.3129594622978366,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004994087584323596,
|
|
"loss": 5.8502,
|
|
"mean_token_accuracy": 0.14524889141321182,
|
|
"num_tokens": 6865757.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 5.9244975566864015,
|
|
"epoch": 0.31337954211300145,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004994065869054676,
|
|
"loss": 5.9051,
|
|
"mean_token_accuracy": 0.13346855491399764,
|
|
"num_tokens": 6875371.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 5.990236139297485,
|
|
"epoch": 0.31379962192816635,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004994044114033283,
|
|
"loss": 5.9445,
|
|
"mean_token_accuracy": 0.13406403809785844,
|
|
"num_tokens": 6884050.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 6.023118162155152,
|
|
"epoch": 0.31421970174333125,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994022319259806,
|
|
"loss": 5.9236,
|
|
"mean_token_accuracy": 0.1428280971944332,
|
|
"num_tokens": 6893079.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 5.977470397949219,
|
|
"epoch": 0.3146397815584961,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004994000484734629,
|
|
"loss": 6.0157,
|
|
"mean_token_accuracy": 0.14197005555033684,
|
|
"num_tokens": 6903100.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 5.968418455123901,
|
|
"epoch": 0.315059861373661,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004993978610458137,
|
|
"loss": 5.8564,
|
|
"mean_token_accuracy": 0.1436561480164528,
|
|
"num_tokens": 6912164.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 5.8913768291473385,
|
|
"epoch": 0.3154799411888259,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993956696430721,
|
|
"loss": 5.8793,
|
|
"mean_token_accuracy": 0.13736136257648468,
|
|
"num_tokens": 6921183.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 6.017658281326294,
|
|
"epoch": 0.3159000210039908,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993934742652768,
|
|
"loss": 5.9616,
|
|
"mean_token_accuracy": 0.1389385998249054,
|
|
"num_tokens": 6931325.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 6.002210426330566,
|
|
"epoch": 0.3163201008191556,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004993912749124665,
|
|
"loss": 5.8433,
|
|
"mean_token_accuracy": 0.1487124353647232,
|
|
"num_tokens": 6940234.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 5.929537010192871,
|
|
"epoch": 0.3167401806343205,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004993890715846804,
|
|
"loss": 5.9507,
|
|
"mean_token_accuracy": 0.14044182747602463,
|
|
"num_tokens": 6949067.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 5.998405647277832,
|
|
"epoch": 0.3171602604494854,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004993868642819574,
|
|
"loss": 5.9194,
|
|
"mean_token_accuracy": 0.13718469440937042,
|
|
"num_tokens": 6959085.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 5.961022281646729,
|
|
"epoch": 0.31758034026465026,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004993846530043367,
|
|
"loss": 5.9451,
|
|
"mean_token_accuracy": 0.13289572075009345,
|
|
"num_tokens": 6967392.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 5.938811779022217,
|
|
"epoch": 0.31800042007981516,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993824377518574,
|
|
"loss": 5.8794,
|
|
"mean_token_accuracy": 0.14492053985595704,
|
|
"num_tokens": 6976369.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 6.007278203964233,
|
|
"epoch": 0.31842049989498006,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004993802185245587,
|
|
"loss": 5.8979,
|
|
"mean_token_accuracy": 0.14349642321467398,
|
|
"num_tokens": 6985889.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 5.902310371398926,
|
|
"epoch": 0.3188405797101449,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00049937799532248,
|
|
"loss": 5.9155,
|
|
"mean_token_accuracy": 0.13254671469330787,
|
|
"num_tokens": 6995396.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 6.108139371871948,
|
|
"epoch": 0.3192606595253098,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993757681456607,
|
|
"loss": 5.974,
|
|
"mean_token_accuracy": 0.13683522641658782,
|
|
"num_tokens": 7004666.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 5.993764448165893,
|
|
"epoch": 0.3196807393404747,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004993735369941401,
|
|
"loss": 6.0094,
|
|
"mean_token_accuracy": 0.13341464176774026,
|
|
"num_tokens": 7014608.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 5.958604240417481,
|
|
"epoch": 0.3201008191556396,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993713018679579,
|
|
"loss": 5.866,
|
|
"mean_token_accuracy": 0.14026129618287086,
|
|
"num_tokens": 7023671.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 5.995219898223877,
|
|
"epoch": 0.32052089897080444,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993690627671536,
|
|
"loss": 5.9253,
|
|
"mean_token_accuracy": 0.13401568681001663,
|
|
"num_tokens": 7033786.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 5.926336812973022,
|
|
"epoch": 0.32094097878596933,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004993668196917669,
|
|
"loss": 5.8311,
|
|
"mean_token_accuracy": 0.14573807418346404,
|
|
"num_tokens": 7042162.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 5.96917757987976,
|
|
"epoch": 0.32136105860113423,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993645726418375,
|
|
"loss": 5.981,
|
|
"mean_token_accuracy": 0.13832971975207328,
|
|
"num_tokens": 7051903.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 5.879901790618897,
|
|
"epoch": 0.3217811384162991,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993623216174053,
|
|
"loss": 5.8013,
|
|
"mean_token_accuracy": 0.15186585038900374,
|
|
"num_tokens": 7060229.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 5.918556547164917,
|
|
"epoch": 0.32220121823146397,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00049936006661851,
|
|
"loss": 5.8909,
|
|
"mean_token_accuracy": 0.13876768276095391,
|
|
"num_tokens": 7069040.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 5.9392224788665775,
|
|
"epoch": 0.32262129804662887,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993578076451917,
|
|
"loss": 5.7726,
|
|
"mean_token_accuracy": 0.14143876731395721,
|
|
"num_tokens": 7078409.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 5.779048347473145,
|
|
"epoch": 0.32304137786179377,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004993555446974903,
|
|
"loss": 5.8733,
|
|
"mean_token_accuracy": 0.13716461956501008,
|
|
"num_tokens": 7087983.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 5.941289329528809,
|
|
"epoch": 0.3234614576769586,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499353277775446,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.14281788170337678,
|
|
"num_tokens": 7097277.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 5.894749402999878,
|
|
"epoch": 0.3238815374921235,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004993510068790989,
|
|
"loss": 5.7164,
|
|
"mean_token_accuracy": 0.15665216147899627,
|
|
"num_tokens": 7105918.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 5.773345851898194,
|
|
"epoch": 0.3243016173072884,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004993487320084892,
|
|
"loss": 5.7838,
|
|
"mean_token_accuracy": 0.15064965635538102,
|
|
"num_tokens": 7115049.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 5.944450235366821,
|
|
"epoch": 0.32472169712245325,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993464531636573,
|
|
"loss": 5.8883,
|
|
"mean_token_accuracy": 0.13874924927949905,
|
|
"num_tokens": 7124862.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 5.947724437713623,
|
|
"epoch": 0.32514177693761814,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993441703446435,
|
|
"loss": 5.7816,
|
|
"mean_token_accuracy": 0.1445206731557846,
|
|
"num_tokens": 7133280.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 6.020012712478637,
|
|
"epoch": 0.32556185675278304,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004993418835514882,
|
|
"loss": 5.9743,
|
|
"mean_token_accuracy": 0.1368774726986885,
|
|
"num_tokens": 7142446.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 5.944014692306519,
|
|
"epoch": 0.3259819365679479,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004993395927842321,
|
|
"loss": 5.8824,
|
|
"mean_token_accuracy": 0.1359010323882103,
|
|
"num_tokens": 7152143.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 5.993379163742065,
|
|
"epoch": 0.3264020163831128,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004993372980429155,
|
|
"loss": 5.9617,
|
|
"mean_token_accuracy": 0.13282209262251854,
|
|
"num_tokens": 7162046.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 5.989493370056152,
|
|
"epoch": 0.3268220961982777,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004993349993275792,
|
|
"loss": 5.8488,
|
|
"mean_token_accuracy": 0.14026510193943978,
|
|
"num_tokens": 7171557.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 5.754479789733887,
|
|
"epoch": 0.3272421760134426,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004993326966382639,
|
|
"loss": 5.7423,
|
|
"mean_token_accuracy": 0.14871106296777725,
|
|
"num_tokens": 7180927.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 5.8972282886505125,
|
|
"epoch": 0.3276622558286074,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004993303899750104,
|
|
"loss": 5.8311,
|
|
"mean_token_accuracy": 0.1395234152674675,
|
|
"num_tokens": 7189552.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 6.021924352645874,
|
|
"epoch": 0.3280823356437723,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004993280793378595,
|
|
"loss": 5.8549,
|
|
"mean_token_accuracy": 0.13788855373859404,
|
|
"num_tokens": 7197857.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 5.914785861968994,
|
|
"epoch": 0.3285024154589372,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004993257647268522,
|
|
"loss": 5.8281,
|
|
"mean_token_accuracy": 0.14489276185631753,
|
|
"num_tokens": 7206785.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 5.945201826095581,
|
|
"epoch": 0.32892249527410206,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004993234461420295,
|
|
"loss": 5.9003,
|
|
"mean_token_accuracy": 0.1415283761918545,
|
|
"num_tokens": 7216360.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 5.844962692260742,
|
|
"epoch": 0.32934257508926695,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004993211235834326,
|
|
"loss": 5.7122,
|
|
"mean_token_accuracy": 0.15939737260341644,
|
|
"num_tokens": 7224890.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 5.77975697517395,
|
|
"epoch": 0.32976265490443185,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004993187970511023,
|
|
"loss": 5.7707,
|
|
"mean_token_accuracy": 0.16336829960346222,
|
|
"num_tokens": 7234442.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 5.964393234252929,
|
|
"epoch": 0.33018273471959675,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004993164665450801,
|
|
"loss": 5.9279,
|
|
"mean_token_accuracy": 0.1439814858138561,
|
|
"num_tokens": 7244023.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 5.916021871566772,
|
|
"epoch": 0.3306028145347616,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993141320654072,
|
|
"loss": 5.7793,
|
|
"mean_token_accuracy": 0.14671456664800644,
|
|
"num_tokens": 7253548.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 5.898174810409546,
|
|
"epoch": 0.3310228943499265,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000499311793612125,
|
|
"loss": 5.8402,
|
|
"mean_token_accuracy": 0.1421785496175289,
|
|
"num_tokens": 7262962.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 5.964570426940918,
|
|
"epoch": 0.3314429741650914,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993094511852748,
|
|
"loss": 5.863,
|
|
"mean_token_accuracy": 0.14184453189373017,
|
|
"num_tokens": 7272234.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 5.929952716827392,
|
|
"epoch": 0.33186305398025623,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993071047848983,
|
|
"loss": 5.8493,
|
|
"mean_token_accuracy": 0.1383821338415146,
|
|
"num_tokens": 7281524.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 5.838898372650147,
|
|
"epoch": 0.3322831337954211,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004993047544110368,
|
|
"loss": 5.7384,
|
|
"mean_token_accuracy": 0.14712240919470787,
|
|
"num_tokens": 7289601.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 5.791057062149048,
|
|
"epoch": 0.332703213610586,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004993024000637321,
|
|
"loss": 5.7137,
|
|
"mean_token_accuracy": 0.15096415132284163,
|
|
"num_tokens": 7298508.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 5.892502069473267,
|
|
"epoch": 0.33312329342575087,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993000417430259,
|
|
"loss": 5.9339,
|
|
"mean_token_accuracy": 0.1390118695795536,
|
|
"num_tokens": 7309065.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 6.066646718978882,
|
|
"epoch": 0.33354337324091576,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00049929767944896,
|
|
"loss": 5.953,
|
|
"mean_token_accuracy": 0.1411003813147545,
|
|
"num_tokens": 7319669.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 6.000399112701416,
|
|
"epoch": 0.33396345305608066,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004992953131815761,
|
|
"loss": 5.9022,
|
|
"mean_token_accuracy": 0.1418354742228985,
|
|
"num_tokens": 7328425.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 5.8749700546264645,
|
|
"epoch": 0.33438353287124556,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992929429409164,
|
|
"loss": 5.775,
|
|
"mean_token_accuracy": 0.1469979852437973,
|
|
"num_tokens": 7337369.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 5.913109064102173,
|
|
"epoch": 0.3348036126864104,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004992905687270225,
|
|
"loss": 5.8411,
|
|
"mean_token_accuracy": 0.1466023862361908,
|
|
"num_tokens": 7346829.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 5.973616456985473,
|
|
"epoch": 0.3352236925015753,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992881905399368,
|
|
"loss": 5.9044,
|
|
"mean_token_accuracy": 0.14303565323352813,
|
|
"num_tokens": 7355976.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 5.9362890243530275,
|
|
"epoch": 0.3356437723167402,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004992858083797013,
|
|
"loss": 5.8555,
|
|
"mean_token_accuracy": 0.13833607137203216,
|
|
"num_tokens": 7365210.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 5.910732650756836,
|
|
"epoch": 0.33606385213190504,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992834222463581,
|
|
"loss": 5.9097,
|
|
"mean_token_accuracy": 0.13066598325967788,
|
|
"num_tokens": 7374175.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 6.022627830505371,
|
|
"epoch": 0.33648393194706994,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992810321399496,
|
|
"loss": 5.936,
|
|
"mean_token_accuracy": 0.13869498372077943,
|
|
"num_tokens": 7383302.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 6.006158876419067,
|
|
"epoch": 0.33690401176223483,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004992786380605182,
|
|
"loss": 5.9162,
|
|
"mean_token_accuracy": 0.13912810906767845,
|
|
"num_tokens": 7392746.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 5.839102506637573,
|
|
"epoch": 0.33732409157739973,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004992762400081062,
|
|
"loss": 5.7562,
|
|
"mean_token_accuracy": 0.1469271421432495,
|
|
"num_tokens": 7401604.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 5.856449317932129,
|
|
"epoch": 0.3377441713925646,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004992738379827559,
|
|
"loss": 5.8677,
|
|
"mean_token_accuracy": 0.13804834261536597,
|
|
"num_tokens": 7410594.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 5.922429132461548,
|
|
"epoch": 0.33816425120772947,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004992714319845101,
|
|
"loss": 5.7704,
|
|
"mean_token_accuracy": 0.15343396067619325,
|
|
"num_tokens": 7418831.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.8475088596344,
|
|
"epoch": 0.33858433102289437,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992690220134116,
|
|
"loss": 5.8188,
|
|
"mean_token_accuracy": 0.144370898604393,
|
|
"num_tokens": 7427731.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 6.030502510070801,
|
|
"epoch": 0.3390044108380592,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992666080695027,
|
|
"loss": 5.9373,
|
|
"mean_token_accuracy": 0.13586149737238884,
|
|
"num_tokens": 7436447.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 5.901221179962159,
|
|
"epoch": 0.3394244906532241,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992641901528262,
|
|
"loss": 5.8156,
|
|
"mean_token_accuracy": 0.14270046576857567,
|
|
"num_tokens": 7445352.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 5.946398782730102,
|
|
"epoch": 0.339844570468389,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992617682634252,
|
|
"loss": 5.8858,
|
|
"mean_token_accuracy": 0.1441212549805641,
|
|
"num_tokens": 7454298.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 5.920703315734864,
|
|
"epoch": 0.34026465028355385,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004992593424013424,
|
|
"loss": 5.8948,
|
|
"mean_token_accuracy": 0.13869627565145493,
|
|
"num_tokens": 7463543.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 5.9791840553283695,
|
|
"epoch": 0.34068473009871875,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992569125666209,
|
|
"loss": 5.9195,
|
|
"mean_token_accuracy": 0.14178480133414267,
|
|
"num_tokens": 7472701.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 6.054230260848999,
|
|
"epoch": 0.34110480991388364,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992544787593037,
|
|
"loss": 5.9062,
|
|
"mean_token_accuracy": 0.13785406127572059,
|
|
"num_tokens": 7481123.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 5.989615964889526,
|
|
"epoch": 0.34152488972904854,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004992520409794338,
|
|
"loss": 5.9555,
|
|
"mean_token_accuracy": 0.14264528974890708,
|
|
"num_tokens": 7490439.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 5.894261217117309,
|
|
"epoch": 0.3419449695442134,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992495992270544,
|
|
"loss": 5.8444,
|
|
"mean_token_accuracy": 0.1425054393708706,
|
|
"num_tokens": 7499326.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 5.95070858001709,
|
|
"epoch": 0.3423650493593783,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004992471535022089,
|
|
"loss": 5.8947,
|
|
"mean_token_accuracy": 0.14209673926234245,
|
|
"num_tokens": 7509407.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 5.978242111206055,
|
|
"epoch": 0.3427851291745432,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992447038049405,
|
|
"loss": 5.9368,
|
|
"mean_token_accuracy": 0.1432798534631729,
|
|
"num_tokens": 7518443.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 5.854420137405396,
|
|
"epoch": 0.343205208989708,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004992422501352927,
|
|
"loss": 5.7979,
|
|
"mean_token_accuracy": 0.15148040205240249,
|
|
"num_tokens": 7527609.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 5.958763885498047,
|
|
"epoch": 0.3436252888048729,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992397924933089,
|
|
"loss": 5.8829,
|
|
"mean_token_accuracy": 0.14002160280942916,
|
|
"num_tokens": 7536890.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 5.984218978881836,
|
|
"epoch": 0.3440453686200378,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992373308790325,
|
|
"loss": 5.8445,
|
|
"mean_token_accuracy": 0.14879057705402374,
|
|
"num_tokens": 7546509.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 5.8121418952941895,
|
|
"epoch": 0.3444654484352027,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992348652925074,
|
|
"loss": 5.8814,
|
|
"mean_token_accuracy": 0.13877593278884887,
|
|
"num_tokens": 7555336.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 5.959460878372193,
|
|
"epoch": 0.34488552825036756,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992323957337771,
|
|
"loss": 5.8217,
|
|
"mean_token_accuracy": 0.14075680449604988,
|
|
"num_tokens": 7565210.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 5.997728681564331,
|
|
"epoch": 0.34530560806553245,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992299222028855,
|
|
"loss": 5.9177,
|
|
"mean_token_accuracy": 0.14632946625351906,
|
|
"num_tokens": 7574516.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 5.837478542327881,
|
|
"epoch": 0.34572568788069735,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992274446998761,
|
|
"loss": 5.7701,
|
|
"mean_token_accuracy": 0.14613791555166245,
|
|
"num_tokens": 7583219.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 5.990570783615112,
|
|
"epoch": 0.3461457676958622,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992249632247929,
|
|
"loss": 5.9898,
|
|
"mean_token_accuracy": 0.13541294783353805,
|
|
"num_tokens": 7592050.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 6.017976236343384,
|
|
"epoch": 0.3465658475110271,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992224777776802,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.1406927302479744,
|
|
"num_tokens": 7600718.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 5.928384780883789,
|
|
"epoch": 0.346985927326192,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004992199883585816,
|
|
"loss": 5.8623,
|
|
"mean_token_accuracy": 0.14485160112380982,
|
|
"num_tokens": 7609191.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 5.958423805236817,
|
|
"epoch": 0.34740600714135683,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004992174949675413,
|
|
"loss": 5.8819,
|
|
"mean_token_accuracy": 0.14174177944660188,
|
|
"num_tokens": 7618509.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 5.890047216415406,
|
|
"epoch": 0.34782608695652173,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004992149976046037,
|
|
"loss": 5.8117,
|
|
"mean_token_accuracy": 0.14391598626971244,
|
|
"num_tokens": 7627851.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 5.892529726028442,
|
|
"epoch": 0.3482461667716866,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992124962698128,
|
|
"loss": 5.8894,
|
|
"mean_token_accuracy": 0.13846235871315002,
|
|
"num_tokens": 7636748.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 5.952128744125366,
|
|
"epoch": 0.3486662465868515,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499209990963213,
|
|
"loss": 5.7996,
|
|
"mean_token_accuracy": 0.14363356158137322,
|
|
"num_tokens": 7645436.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 5.9340009689331055,
|
|
"epoch": 0.34908632640201637,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004992074816848487,
|
|
"loss": 5.9287,
|
|
"mean_token_accuracy": 0.13951508998870848,
|
|
"num_tokens": 7655414.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 5.832207345962525,
|
|
"epoch": 0.34950640621718126,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992049684347642,
|
|
"loss": 5.7094,
|
|
"mean_token_accuracy": 0.14780430346727372,
|
|
"num_tokens": 7664295.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 5.929846525192261,
|
|
"epoch": 0.34992648603234616,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004992024512130042,
|
|
"loss": 5.8569,
|
|
"mean_token_accuracy": 0.14193690866231917,
|
|
"num_tokens": 7673295.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 5.905185222625732,
|
|
"epoch": 0.350346565847511,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991999300196132,
|
|
"loss": 5.8475,
|
|
"mean_token_accuracy": 0.13919475451111793,
|
|
"num_tokens": 7682932.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 6.005189561843872,
|
|
"epoch": 0.3507666456626759,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004991974048546359,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.13765867426991463,
|
|
"num_tokens": 7692105.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 5.873351955413819,
|
|
"epoch": 0.3511867254778408,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499194875718117,
|
|
"loss": 5.859,
|
|
"mean_token_accuracy": 0.1459092453122139,
|
|
"num_tokens": 7701294.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 5.976405239105224,
|
|
"epoch": 0.3516068052930057,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991923426101013,
|
|
"loss": 5.8556,
|
|
"mean_token_accuracy": 0.14097452014684678,
|
|
"num_tokens": 7710964.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 5.988002777099609,
|
|
"epoch": 0.35202688510817054,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004991898055306337,
|
|
"loss": 5.9768,
|
|
"mean_token_accuracy": 0.13131897300481796,
|
|
"num_tokens": 7719938.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 5.942753410339355,
|
|
"epoch": 0.35244696492333544,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004991872644797591,
|
|
"loss": 5.8921,
|
|
"mean_token_accuracy": 0.13939437940716742,
|
|
"num_tokens": 7729129.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 5.955871152877807,
|
|
"epoch": 0.35286704473850034,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004991847194575226,
|
|
"loss": 5.8881,
|
|
"mean_token_accuracy": 0.13834249898791312,
|
|
"num_tokens": 7738506.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 6.041079711914063,
|
|
"epoch": 0.3532871245536652,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991821704639693,
|
|
"loss": 5.9968,
|
|
"mean_token_accuracy": 0.13867756947875023,
|
|
"num_tokens": 7749320.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 6.0422234535217285,
|
|
"epoch": 0.3537072043688301,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991796174991443,
|
|
"loss": 5.8516,
|
|
"mean_token_accuracy": 0.14419358000159263,
|
|
"num_tokens": 7758735.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 5.810104942321777,
|
|
"epoch": 0.354127284183995,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004991770605630927,
|
|
"loss": 5.8115,
|
|
"mean_token_accuracy": 0.14199010655283928,
|
|
"num_tokens": 7767556.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 5.862843370437622,
|
|
"epoch": 0.3545473639991598,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004991744996558599,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.14548772126436232,
|
|
"num_tokens": 7776615.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 5.955168771743774,
|
|
"epoch": 0.3549674438143247,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004991719347774913,
|
|
"loss": 5.8885,
|
|
"mean_token_accuracy": 0.14509620741009713,
|
|
"num_tokens": 7785288.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 5.897441482543945,
|
|
"epoch": 0.3553875236294896,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004991693659280324,
|
|
"loss": 5.7878,
|
|
"mean_token_accuracy": 0.1456679493188858,
|
|
"num_tokens": 7794381.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 5.895413112640381,
|
|
"epoch": 0.3558076034446545,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004991667931075284,
|
|
"loss": 5.7548,
|
|
"mean_token_accuracy": 0.14165765419602394,
|
|
"num_tokens": 7803265.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 5.8606267929077145,
|
|
"epoch": 0.35622768325981935,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991642163160252,
|
|
"loss": 5.8796,
|
|
"mean_token_accuracy": 0.13830938637256623,
|
|
"num_tokens": 7812445.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 5.941714191436768,
|
|
"epoch": 0.35664776307498425,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991616355535684,
|
|
"loss": 5.8695,
|
|
"mean_token_accuracy": 0.1441208615899086,
|
|
"num_tokens": 7822073.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 6.004122114181518,
|
|
"epoch": 0.35706784289014915,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004991590508202036,
|
|
"loss": 5.8472,
|
|
"mean_token_accuracy": 0.13856493979692458,
|
|
"num_tokens": 7831193.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 5.952021503448487,
|
|
"epoch": 0.357487922705314,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004991564621159766,
|
|
"loss": 5.8909,
|
|
"mean_token_accuracy": 0.1399833530187607,
|
|
"num_tokens": 7840311.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 5.902349615097046,
|
|
"epoch": 0.3579080025204789,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004991538694409334,
|
|
"loss": 5.8981,
|
|
"mean_token_accuracy": 0.13640205860137938,
|
|
"num_tokens": 7849622.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 5.93274884223938,
|
|
"epoch": 0.3583280823356438,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004991512727951198,
|
|
"loss": 5.8639,
|
|
"mean_token_accuracy": 0.1423584371805191,
|
|
"num_tokens": 7859494.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 6.066871976852417,
|
|
"epoch": 0.3587481621508087,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991486721785818,
|
|
"loss": 5.9611,
|
|
"mean_token_accuracy": 0.13798293545842172,
|
|
"num_tokens": 7868526.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 5.916080617904663,
|
|
"epoch": 0.3591682419659735,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991460675913655,
|
|
"loss": 5.7946,
|
|
"mean_token_accuracy": 0.1431095890700817,
|
|
"num_tokens": 7877631.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 5.9288982391357425,
|
|
"epoch": 0.3595883217811384,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000499143459033517,
|
|
"loss": 5.8525,
|
|
"mean_token_accuracy": 0.14929330348968506,
|
|
"num_tokens": 7886814.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 5.835088777542114,
|
|
"epoch": 0.3600084015963033,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004991408465050825,
|
|
"loss": 5.6819,
|
|
"mean_token_accuracy": 0.15145567432045937,
|
|
"num_tokens": 7896337.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 5.841267919540405,
|
|
"epoch": 0.36042848141146816,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004991382300061084,
|
|
"loss": 5.9429,
|
|
"mean_token_accuracy": 0.13477055355906487,
|
|
"num_tokens": 7906071.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 6.013036108016967,
|
|
"epoch": 0.36084856122663306,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004991356095366409,
|
|
"loss": 5.9236,
|
|
"mean_token_accuracy": 0.14087440073490143,
|
|
"num_tokens": 7915003.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 5.964684629440308,
|
|
"epoch": 0.36126864104179796,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991329850967266,
|
|
"loss": 5.7748,
|
|
"mean_token_accuracy": 0.14612130969762802,
|
|
"num_tokens": 7924408.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 5.857362222671509,
|
|
"epoch": 0.3616887208569628,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991303566864118,
|
|
"loss": 5.752,
|
|
"mean_token_accuracy": 0.14585833102464676,
|
|
"num_tokens": 7934717.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 5.800111103057861,
|
|
"epoch": 0.3621088006721277,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004991277243057431,
|
|
"loss": 5.8176,
|
|
"mean_token_accuracy": 0.14245440661907197,
|
|
"num_tokens": 7944278.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 5.853901958465576,
|
|
"epoch": 0.3625288804872926,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004991250879547673,
|
|
"loss": 5.8345,
|
|
"mean_token_accuracy": 0.14364267513155937,
|
|
"num_tokens": 7953344.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 5.9053857803344725,
|
|
"epoch": 0.3629489603024575,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991224476335309,
|
|
"loss": 5.8601,
|
|
"mean_token_accuracy": 0.1401130437850952,
|
|
"num_tokens": 7962869.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 5.988316392898559,
|
|
"epoch": 0.36336904011762233,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004991198033420807,
|
|
"loss": 5.8527,
|
|
"mean_token_accuracy": 0.14232899993658066,
|
|
"num_tokens": 7971981.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 5.870962715148925,
|
|
"epoch": 0.36378911993278723,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004991171550804636,
|
|
"loss": 5.8073,
|
|
"mean_token_accuracy": 0.139846058934927,
|
|
"num_tokens": 7980979.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 5.898285436630249,
|
|
"epoch": 0.36420919974795213,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004991145028487266,
|
|
"loss": 5.8963,
|
|
"mean_token_accuracy": 0.14070027470588684,
|
|
"num_tokens": 7989607.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 5.864823675155639,
|
|
"epoch": 0.36462927956311697,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991118466469165,
|
|
"loss": 5.713,
|
|
"mean_token_accuracy": 0.14677212983369828,
|
|
"num_tokens": 7998356.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 5.8904320240020756,
|
|
"epoch": 0.36504935937828187,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004991091864750805,
|
|
"loss": 5.818,
|
|
"mean_token_accuracy": 0.14362581819295883,
|
|
"num_tokens": 8007596.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 5.893006706237793,
|
|
"epoch": 0.36546943919344677,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004991065223332655,
|
|
"loss": 5.8754,
|
|
"mean_token_accuracy": 0.13881655633449555,
|
|
"num_tokens": 8016493.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 5.957713174819946,
|
|
"epoch": 0.36588951900861166,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991038542215191,
|
|
"loss": 5.8451,
|
|
"mean_token_accuracy": 0.1374589078128338,
|
|
"num_tokens": 8025867.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 5.831826066970825,
|
|
"epoch": 0.3663095988237765,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004991011821398882,
|
|
"loss": 5.8861,
|
|
"mean_token_accuracy": 0.1465972438454628,
|
|
"num_tokens": 8036251.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 6.003261423110962,
|
|
"epoch": 0.3667296786389414,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990985060884202,
|
|
"loss": 5.8444,
|
|
"mean_token_accuracy": 0.1452535480260849,
|
|
"num_tokens": 8045647.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 5.943668365478516,
|
|
"epoch": 0.3671497584541063,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004990958260671627,
|
|
"loss": 5.8987,
|
|
"mean_token_accuracy": 0.13597789257764817,
|
|
"num_tokens": 8056025.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 5.898333263397217,
|
|
"epoch": 0.36756983826927114,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004990931420761629,
|
|
"loss": 5.8364,
|
|
"mean_token_accuracy": 0.14677493423223495,
|
|
"num_tokens": 8065029.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 5.953028678894043,
|
|
"epoch": 0.36798991808443604,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004990904541154685,
|
|
"loss": 5.7841,
|
|
"mean_token_accuracy": 0.15241612046957015,
|
|
"num_tokens": 8073249.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 5.914327716827392,
|
|
"epoch": 0.36840999789960094,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004990877621851271,
|
|
"loss": 5.9274,
|
|
"mean_token_accuracy": 0.13789283782243728,
|
|
"num_tokens": 8082039.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 5.818746089935303,
|
|
"epoch": 0.3688300777147658,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004990850662851863,
|
|
"loss": 5.7546,
|
|
"mean_token_accuracy": 0.14923306405544282,
|
|
"num_tokens": 8090011.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 5.97280101776123,
|
|
"epoch": 0.3692501575299307,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990823664156941,
|
|
"loss": 5.8789,
|
|
"mean_token_accuracy": 0.1489357531070709,
|
|
"num_tokens": 8099934.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 5.970620107650757,
|
|
"epoch": 0.3696702373450956,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004990796625766981,
|
|
"loss": 5.8822,
|
|
"mean_token_accuracy": 0.13866196647286416,
|
|
"num_tokens": 8108969.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 5.857716226577759,
|
|
"epoch": 0.3700903171602605,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004990769547682462,
|
|
"loss": 5.798,
|
|
"mean_token_accuracy": 0.14401047080755233,
|
|
"num_tokens": 8117372.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 6.015813732147217,
|
|
"epoch": 0.3705103969754253,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004990742429903866,
|
|
"loss": 5.9812,
|
|
"mean_token_accuracy": 0.13605612963438035,
|
|
"num_tokens": 8127108.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 6.0110640048980715,
|
|
"epoch": 0.3709304767905902,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000499071527243167,
|
|
"loss": 5.9774,
|
|
"mean_token_accuracy": 0.13931988626718522,
|
|
"num_tokens": 8137392.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 5.916806697845459,
|
|
"epoch": 0.3713505566057551,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004990688075266357,
|
|
"loss": 5.8172,
|
|
"mean_token_accuracy": 0.14630230888724327,
|
|
"num_tokens": 8146257.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 5.90497236251831,
|
|
"epoch": 0.37177063642091995,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004990660838408409,
|
|
"loss": 5.7894,
|
|
"mean_token_accuracy": 0.14007715433835982,
|
|
"num_tokens": 8154952.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 5.948085355758667,
|
|
"epoch": 0.37219071623608485,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004990633561858308,
|
|
"loss": 5.8263,
|
|
"mean_token_accuracy": 0.14142653867602348,
|
|
"num_tokens": 8164365.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 5.9057210922241214,
|
|
"epoch": 0.37261079605124975,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990606245616537,
|
|
"loss": 5.8405,
|
|
"mean_token_accuracy": 0.13960912972688674,
|
|
"num_tokens": 8172614.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 6.0053239345550535,
|
|
"epoch": 0.37303087586641465,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990578889683579,
|
|
"loss": 5.8993,
|
|
"mean_token_accuracy": 0.13672763109207153,
|
|
"num_tokens": 8182445.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 5.912483501434326,
|
|
"epoch": 0.3734509556815795,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990551494059921,
|
|
"loss": 5.7912,
|
|
"mean_token_accuracy": 0.14882408380508422,
|
|
"num_tokens": 8191871.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 5.91331787109375,
|
|
"epoch": 0.3738710354967444,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004990524058746047,
|
|
"loss": 5.9292,
|
|
"mean_token_accuracy": 0.14731585383415222,
|
|
"num_tokens": 8200658.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 5.922462463378906,
|
|
"epoch": 0.3742911153119093,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004990496583742443,
|
|
"loss": 5.8609,
|
|
"mean_token_accuracy": 0.13896840661764145,
|
|
"num_tokens": 8209776.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 5.8580132007598875,
|
|
"epoch": 0.3747111951270741,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004990469069049596,
|
|
"loss": 5.7933,
|
|
"mean_token_accuracy": 0.14876351952552797,
|
|
"num_tokens": 8219401.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 5.9017116069793705,
|
|
"epoch": 0.375131274942239,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004990441514667993,
|
|
"loss": 5.8399,
|
|
"mean_token_accuracy": 0.1457892268896103,
|
|
"num_tokens": 8228762.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 5.960052967071533,
|
|
"epoch": 0.3755513547574039,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990413920598121,
|
|
"loss": 5.8364,
|
|
"mean_token_accuracy": 0.1444413885474205,
|
|
"num_tokens": 8236612.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 5.957969760894775,
|
|
"epoch": 0.37597143457256876,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004990386286840471,
|
|
"loss": 5.8452,
|
|
"mean_token_accuracy": 0.14290711134672165,
|
|
"num_tokens": 8245043.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 6.0023870944976805,
|
|
"epoch": 0.37639151438773366,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990358613395532,
|
|
"loss": 5.9381,
|
|
"mean_token_accuracy": 0.13609616905450822,
|
|
"num_tokens": 8255270.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 5.976658725738526,
|
|
"epoch": 0.37681159420289856,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004990330900263792,
|
|
"loss": 5.896,
|
|
"mean_token_accuracy": 0.13675653785467148,
|
|
"num_tokens": 8264761.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 5.991942405700684,
|
|
"epoch": 0.37723167401806346,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990303147445745,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.14412947744131088,
|
|
"num_tokens": 8274308.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 5.831737422943116,
|
|
"epoch": 0.3776517538332283,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990275354941881,
|
|
"loss": 5.751,
|
|
"mean_token_accuracy": 0.15253113806247712,
|
|
"num_tokens": 8283323.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 5.965500402450561,
|
|
"epoch": 0.3780718336483932,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990247522752694,
|
|
"loss": 6.0719,
|
|
"mean_token_accuracy": 0.12804851979017257,
|
|
"num_tokens": 8293452.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 5.9973039627075195,
|
|
"epoch": 0.3784919134635581,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990219650878674,
|
|
"loss": 5.7459,
|
|
"mean_token_accuracy": 0.14813876897096634,
|
|
"num_tokens": 8302941.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 5.840318632125855,
|
|
"epoch": 0.37891199327872294,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004990191739320318,
|
|
"loss": 5.7706,
|
|
"mean_token_accuracy": 0.15119873285293578,
|
|
"num_tokens": 8311811.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 5.808368587493897,
|
|
"epoch": 0.37933207309388783,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004990163788078117,
|
|
"loss": 5.6889,
|
|
"mean_token_accuracy": 0.1518329106271267,
|
|
"num_tokens": 8321130.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 5.834763097763061,
|
|
"epoch": 0.37975215290905273,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990135797152569,
|
|
"loss": 5.7997,
|
|
"mean_token_accuracy": 0.14402930140495301,
|
|
"num_tokens": 8330233.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 5.881337881088257,
|
|
"epoch": 0.3801722327242176,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004990107766544169,
|
|
"loss": 5.7852,
|
|
"mean_token_accuracy": 0.144415046274662,
|
|
"num_tokens": 8338585.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 5.83257737159729,
|
|
"epoch": 0.38059231253938247,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990079696253413,
|
|
"loss": 5.8118,
|
|
"mean_token_accuracy": 0.14888912737369536,
|
|
"num_tokens": 8346618.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 5.908400917053223,
|
|
"epoch": 0.38101239235454737,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990051586280799,
|
|
"loss": 5.7942,
|
|
"mean_token_accuracy": 0.14552049711346626,
|
|
"num_tokens": 8356273.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 5.918098402023316,
|
|
"epoch": 0.38143247216971227,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004990023436626824,
|
|
"loss": 5.7951,
|
|
"mean_token_accuracy": 0.14602155163884162,
|
|
"num_tokens": 8366668.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 5.982459354400635,
|
|
"epoch": 0.3818525519848771,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004989995247291988,
|
|
"loss": 5.9163,
|
|
"mean_token_accuracy": 0.14120357036590575,
|
|
"num_tokens": 8375610.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 5.895563316345215,
|
|
"epoch": 0.382272631800042,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004989967018276789,
|
|
"loss": 5.774,
|
|
"mean_token_accuracy": 0.15064741671085358,
|
|
"num_tokens": 8384455.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 5.79692234992981,
|
|
"epoch": 0.3826927116152069,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004989938749581727,
|
|
"loss": 5.8123,
|
|
"mean_token_accuracy": 0.14297219812870027,
|
|
"num_tokens": 8393868.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 5.923454284667969,
|
|
"epoch": 0.38311279143037175,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004989910441207305,
|
|
"loss": 5.8328,
|
|
"mean_token_accuracy": 0.1404195971786976,
|
|
"num_tokens": 8402916.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 5.898684453964234,
|
|
"epoch": 0.38353287124553664,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004989882093154023,
|
|
"loss": 5.7638,
|
|
"mean_token_accuracy": 0.14875229001045226,
|
|
"num_tokens": 8411649.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 5.880671072006225,
|
|
"epoch": 0.38395295106070154,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004989853705422381,
|
|
"loss": 5.8801,
|
|
"mean_token_accuracy": 0.13631365299224854,
|
|
"num_tokens": 8420393.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 5.883023405075074,
|
|
"epoch": 0.38437303087586644,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004989825278012886,
|
|
"loss": 5.7743,
|
|
"mean_token_accuracy": 0.14661871045827865,
|
|
"num_tokens": 8429404.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 5.882754182815551,
|
|
"epoch": 0.3847931106910313,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000498979681092604,
|
|
"loss": 5.8106,
|
|
"mean_token_accuracy": 0.14257726520299913,
|
|
"num_tokens": 8438299.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 5.837142848968506,
|
|
"epoch": 0.3852131905061962,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004989768304162345,
|
|
"loss": 5.7554,
|
|
"mean_token_accuracy": 0.14974153488874437,
|
|
"num_tokens": 8447392.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 5.9916746616363525,
|
|
"epoch": 0.3856332703213611,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004989739757722308,
|
|
"loss": 5.8625,
|
|
"mean_token_accuracy": 0.13722902536392212,
|
|
"num_tokens": 8456361.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 5.905898475646973,
|
|
"epoch": 0.3860533501365259,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004989711171606436,
|
|
"loss": 5.7858,
|
|
"mean_token_accuracy": 0.14541147351264955,
|
|
"num_tokens": 8465548.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 5.921667671203613,
|
|
"epoch": 0.3864734299516908,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004989682545815232,
|
|
"loss": 5.8109,
|
|
"mean_token_accuracy": 0.1411545142531395,
|
|
"num_tokens": 8474454.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 5.837777233123779,
|
|
"epoch": 0.3868935097668557,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004989653880349207,
|
|
"loss": 5.7277,
|
|
"mean_token_accuracy": 0.14593051224946976,
|
|
"num_tokens": 8482694.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 5.864150905609131,
|
|
"epoch": 0.38731358958202056,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004989625175208864,
|
|
"loss": 5.8308,
|
|
"mean_token_accuracy": 0.14381687343120575,
|
|
"num_tokens": 8491162.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 5.819499731063843,
|
|
"epoch": 0.38773366939718545,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004989596430394717,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.1608663707971573,
|
|
"num_tokens": 8500716.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 5.8265057563781735,
|
|
"epoch": 0.38815374921235035,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000498956764590727,
|
|
"loss": 5.7384,
|
|
"mean_token_accuracy": 0.14157627001404763,
|
|
"num_tokens": 8508871.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 5.979275703430176,
|
|
"epoch": 0.38857382902751525,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004989538821747037,
|
|
"loss": 5.9482,
|
|
"mean_token_accuracy": 0.1420240134000778,
|
|
"num_tokens": 8518450.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 5.9397321224212645,
|
|
"epoch": 0.3889939088426801,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004989509957914527,
|
|
"loss": 5.8528,
|
|
"mean_token_accuracy": 0.1380702592432499,
|
|
"num_tokens": 8528238.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 5.852479600906372,
|
|
"epoch": 0.389413988657845,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989481054410251,
|
|
"loss": 5.7431,
|
|
"mean_token_accuracy": 0.14131385385990142,
|
|
"num_tokens": 8537587.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 5.9004875183105465,
|
|
"epoch": 0.3898340684730099,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004989452111234721,
|
|
"loss": 5.854,
|
|
"mean_token_accuracy": 0.14011769965291024,
|
|
"num_tokens": 8547703.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 5.860686302185059,
|
|
"epoch": 0.39025414828817473,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000498942312838845,
|
|
"loss": 5.7958,
|
|
"mean_token_accuracy": 0.14458008110523224,
|
|
"num_tokens": 8557001.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 5.8804422378540036,
|
|
"epoch": 0.3906742281033396,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989394105871952,
|
|
"loss": 5.692,
|
|
"mean_token_accuracy": 0.15489965081214904,
|
|
"num_tokens": 8565638.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 5.966875410079956,
|
|
"epoch": 0.3910943079185045,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.000498936504368574,
|
|
"loss": 5.866,
|
|
"mean_token_accuracy": 0.14225341156125068,
|
|
"num_tokens": 8574428.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 5.759807777404785,
|
|
"epoch": 0.3915143877336694,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004989335941830329,
|
|
"loss": 5.816,
|
|
"mean_token_accuracy": 0.14541401863098144,
|
|
"num_tokens": 8583157.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 5.834117889404297,
|
|
"epoch": 0.39193446754883426,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004989306800306236,
|
|
"loss": 5.7781,
|
|
"mean_token_accuracy": 0.14344885647296907,
|
|
"num_tokens": 8592382.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 5.8663976192474365,
|
|
"epoch": 0.39235454736399916,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004989277619113975,
|
|
"loss": 5.7604,
|
|
"mean_token_accuracy": 0.15097892433404922,
|
|
"num_tokens": 8601058.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 5.956953763961792,
|
|
"epoch": 0.39277462717916406,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004989248398254065,
|
|
"loss": 5.8591,
|
|
"mean_token_accuracy": 0.1437965750694275,
|
|
"num_tokens": 8609479.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 5.92048830986023,
|
|
"epoch": 0.3931947069943289,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004989219137727021,
|
|
"loss": 5.8058,
|
|
"mean_token_accuracy": 0.14700522273778915,
|
|
"num_tokens": 8618860.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 5.8700724124908445,
|
|
"epoch": 0.3936147868094938,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989189837533365,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14664537757635115,
|
|
"num_tokens": 8627462.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 5.981065273284912,
|
|
"epoch": 0.3940348666246587,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989160497673613,
|
|
"loss": 5.9387,
|
|
"mean_token_accuracy": 0.13696896955370902,
|
|
"num_tokens": 8637569.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 5.918409252166748,
|
|
"epoch": 0.39445494643982354,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004989131118148286,
|
|
"loss": 5.7353,
|
|
"mean_token_accuracy": 0.14450196400284768,
|
|
"num_tokens": 8645440.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 5.836373901367187,
|
|
"epoch": 0.39487502625498844,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004989101698957904,
|
|
"loss": 5.9023,
|
|
"mean_token_accuracy": 0.14248489439487458,
|
|
"num_tokens": 8655077.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 5.941747808456421,
|
|
"epoch": 0.39529510607015333,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004989072240102988,
|
|
"loss": 5.8142,
|
|
"mean_token_accuracy": 0.14740578532218934,
|
|
"num_tokens": 8663126.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 5.973061513900757,
|
|
"epoch": 0.39571518588531823,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989042741584061,
|
|
"loss": 5.7952,
|
|
"mean_token_accuracy": 0.14338430240750313,
|
|
"num_tokens": 8672386.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 5.720412731170654,
|
|
"epoch": 0.3961352657004831,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004989013203401645,
|
|
"loss": 5.7388,
|
|
"mean_token_accuracy": 0.1476906917989254,
|
|
"num_tokens": 8681930.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 5.883289384841919,
|
|
"epoch": 0.396555345515648,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004988983625556264,
|
|
"loss": 5.7919,
|
|
"mean_token_accuracy": 0.14368573501706122,
|
|
"num_tokens": 8690993.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 5.890859937667846,
|
|
"epoch": 0.39697542533081287,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004988954008048438,
|
|
"loss": 5.7809,
|
|
"mean_token_accuracy": 0.14698703289031984,
|
|
"num_tokens": 8699497.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 6.004160451889038,
|
|
"epoch": 0.3973955051459777,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988924350878697,
|
|
"loss": 5.986,
|
|
"mean_token_accuracy": 0.1333600528538227,
|
|
"num_tokens": 8709274.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 5.947705507278442,
|
|
"epoch": 0.3978155849611426,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988894654047563,
|
|
"loss": 5.8378,
|
|
"mean_token_accuracy": 0.13920372053980828,
|
|
"num_tokens": 8718158.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 5.82051944732666,
|
|
"epoch": 0.3982356647763075,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004988864917555562,
|
|
"loss": 5.7239,
|
|
"mean_token_accuracy": 0.14391618072986603,
|
|
"num_tokens": 8727459.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 5.940366458892822,
|
|
"epoch": 0.3986557445914724,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004988835141403224,
|
|
"loss": 5.8538,
|
|
"mean_token_accuracy": 0.14721113741397857,
|
|
"num_tokens": 8737614.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 5.819404935836792,
|
|
"epoch": 0.39907582440663725,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004988805325591073,
|
|
"loss": 5.6874,
|
|
"mean_token_accuracy": 0.14453882575035096,
|
|
"num_tokens": 8746799.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 5.84985032081604,
|
|
"epoch": 0.39949590422180214,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004988775470119639,
|
|
"loss": 5.8628,
|
|
"mean_token_accuracy": 0.14014028683304786,
|
|
"num_tokens": 8756555.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 5.867576169967651,
|
|
"epoch": 0.39991598403696704,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004988745574989451,
|
|
"loss": 5.8851,
|
|
"mean_token_accuracy": 0.1480340264737606,
|
|
"num_tokens": 8765849.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 6.094280099868774,
|
|
"epoch": 0.4003360638521319,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988715640201036,
|
|
"loss": 5.954,
|
|
"mean_token_accuracy": 0.13378295823931693,
|
|
"num_tokens": 8775713.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 5.884061288833618,
|
|
"epoch": 0.4007561436672968,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004988685665754928,
|
|
"loss": 5.7775,
|
|
"mean_token_accuracy": 0.14666623920202254,
|
|
"num_tokens": 8784717.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 5.8814960479736325,
|
|
"epoch": 0.4011762234824617,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988655651651656,
|
|
"loss": 5.7911,
|
|
"mean_token_accuracy": 0.14413672238588332,
|
|
"num_tokens": 8794388.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 5.836367225646972,
|
|
"epoch": 0.4015963032976265,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004988625597891751,
|
|
"loss": 5.8093,
|
|
"mean_token_accuracy": 0.14697518199682236,
|
|
"num_tokens": 8802436.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 5.912711811065674,
|
|
"epoch": 0.4020163831127914,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988595504475746,
|
|
"loss": 5.7636,
|
|
"mean_token_accuracy": 0.1465681880712509,
|
|
"num_tokens": 8811184.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 5.9507347583770756,
|
|
"epoch": 0.4024364629279563,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004988565371404175,
|
|
"loss": 5.8423,
|
|
"mean_token_accuracy": 0.14505148231983184,
|
|
"num_tokens": 8820525.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 5.830136728286743,
|
|
"epoch": 0.4028565427431212,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004988535198677571,
|
|
"loss": 5.7011,
|
|
"mean_token_accuracy": 0.153212571144104,
|
|
"num_tokens": 8828928.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 5.90922179222107,
|
|
"epoch": 0.40327662255828606,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004988504986296469,
|
|
"loss": 5.907,
|
|
"mean_token_accuracy": 0.1371180810034275,
|
|
"num_tokens": 8838615.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 5.942590522766113,
|
|
"epoch": 0.40369670237345096,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004988474734261404,
|
|
"loss": 5.9047,
|
|
"mean_token_accuracy": 0.13416762948036193,
|
|
"num_tokens": 8848709.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 5.973557710647583,
|
|
"epoch": 0.40411678218861585,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004988444442572911,
|
|
"loss": 5.8479,
|
|
"mean_token_accuracy": 0.1310623273253441,
|
|
"num_tokens": 8858277.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 5.891769552230835,
|
|
"epoch": 0.4045368620037807,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988414111231528,
|
|
"loss": 5.8161,
|
|
"mean_token_accuracy": 0.14670211374759673,
|
|
"num_tokens": 8868436.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 5.925015592575074,
|
|
"epoch": 0.4049569418189456,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000498838374023779,
|
|
"loss": 5.7888,
|
|
"mean_token_accuracy": 0.13960602283477783,
|
|
"num_tokens": 8877740.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 5.908780908584594,
|
|
"epoch": 0.4053770216341105,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004988353329592239,
|
|
"loss": 5.7761,
|
|
"mean_token_accuracy": 0.14475535228848457,
|
|
"num_tokens": 8887408.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 5.893645095825195,
|
|
"epoch": 0.4057971014492754,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004988322879295409,
|
|
"loss": 5.929,
|
|
"mean_token_accuracy": 0.13994188457727433,
|
|
"num_tokens": 8897141.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 5.865872049331665,
|
|
"epoch": 0.40621718126444023,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004988292389347844,
|
|
"loss": 5.7105,
|
|
"mean_token_accuracy": 0.15417256727814674,
|
|
"num_tokens": 8905747.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 5.965148115158081,
|
|
"epoch": 0.40663726107960513,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000498826185975008,
|
|
"loss": 5.8673,
|
|
"mean_token_accuracy": 0.14333693608641623,
|
|
"num_tokens": 8914926.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 5.872843933105469,
|
|
"epoch": 0.40705734089477,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004988231290502662,
|
|
"loss": 5.8806,
|
|
"mean_token_accuracy": 0.14108002185821533,
|
|
"num_tokens": 8923956.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 5.925130224227905,
|
|
"epoch": 0.40747742070993487,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004988200681606127,
|
|
"loss": 5.7542,
|
|
"mean_token_accuracy": 0.1388688787817955,
|
|
"num_tokens": 8932654.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 5.9108325958251955,
|
|
"epoch": 0.40789750052509977,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000498817003306102,
|
|
"loss": 5.7364,
|
|
"mean_token_accuracy": 0.1501722030341625,
|
|
"num_tokens": 8941716.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 5.846788120269776,
|
|
"epoch": 0.40831758034026466,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004988139344867884,
|
|
"loss": 5.8122,
|
|
"mean_token_accuracy": 0.14448407515883446,
|
|
"num_tokens": 8950377.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 5.848782968521118,
|
|
"epoch": 0.4087376601554295,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004988108617027261,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.14761658608913422,
|
|
"num_tokens": 8959857.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 5.834667444229126,
|
|
"epoch": 0.4091577399705944,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988077849539698,
|
|
"loss": 5.7183,
|
|
"mean_token_accuracy": 0.1485067203640938,
|
|
"num_tokens": 8968272.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 5.923686075210571,
|
|
"epoch": 0.4095778197857593,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004988047042405736,
|
|
"loss": 5.7969,
|
|
"mean_token_accuracy": 0.14762237221002578,
|
|
"num_tokens": 8977445.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 5.964400959014893,
|
|
"epoch": 0.4099978996009242,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988016195625924,
|
|
"loss": 5.8644,
|
|
"mean_token_accuracy": 0.13916484266519547,
|
|
"num_tokens": 8987315.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 5.8641290187835695,
|
|
"epoch": 0.41041797941608904,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004987985309200807,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.1417423367500305,
|
|
"num_tokens": 8998119.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 5.7576408863067625,
|
|
"epoch": 0.41083805923125394,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004987954383130934,
|
|
"loss": 5.7477,
|
|
"mean_token_accuracy": 0.1535985603928566,
|
|
"num_tokens": 9007167.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 5.866803312301636,
|
|
"epoch": 0.41125813904641884,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000498792341741685,
|
|
"loss": 5.8006,
|
|
"mean_token_accuracy": 0.13756236732006072,
|
|
"num_tokens": 9016690.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 5.996728754043579,
|
|
"epoch": 0.4116782188615837,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004987892412059106,
|
|
"loss": 5.8881,
|
|
"mean_token_accuracy": 0.1421562008559704,
|
|
"num_tokens": 9026117.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 5.823458862304688,
|
|
"epoch": 0.4120982986767486,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004987861367058251,
|
|
"loss": 5.7583,
|
|
"mean_token_accuracy": 0.1456121936440468,
|
|
"num_tokens": 9035754.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 5.91724009513855,
|
|
"epoch": 0.4125183784919135,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004987830282414833,
|
|
"loss": 5.7614,
|
|
"mean_token_accuracy": 0.15125717446208,
|
|
"num_tokens": 9045453.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 5.882875871658325,
|
|
"epoch": 0.41293845830707837,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004987799158129404,
|
|
"loss": 5.8736,
|
|
"mean_token_accuracy": 0.14322762489318847,
|
|
"num_tokens": 9056045.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 5.822021722793579,
|
|
"epoch": 0.4133585381222432,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004987767994202516,
|
|
"loss": 5.7652,
|
|
"mean_token_accuracy": 0.14132684618234634,
|
|
"num_tokens": 9065728.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 5.874257898330688,
|
|
"epoch": 0.4137786179374081,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987736790634719,
|
|
"loss": 5.7867,
|
|
"mean_token_accuracy": 0.14259056150913238,
|
|
"num_tokens": 9075522.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 5.868446731567383,
|
|
"epoch": 0.414198697752573,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004987705547426568,
|
|
"loss": 5.7633,
|
|
"mean_token_accuracy": 0.14451717659831048,
|
|
"num_tokens": 9084412.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 5.86938099861145,
|
|
"epoch": 0.41461877756773785,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004987674264578615,
|
|
"loss": 5.8382,
|
|
"mean_token_accuracy": 0.1410167396068573,
|
|
"num_tokens": 9094289.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 5.902176809310913,
|
|
"epoch": 0.41503885738290275,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004987642942091414,
|
|
"loss": 5.7413,
|
|
"mean_token_accuracy": 0.14698186367750168,
|
|
"num_tokens": 9103124.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 5.898521900177002,
|
|
"epoch": 0.41545893719806765,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004987611579965523,
|
|
"loss": 5.6945,
|
|
"mean_token_accuracy": 0.1453884869813919,
|
|
"num_tokens": 9112794.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 5.867249441146851,
|
|
"epoch": 0.4158790170132325,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004987580178201492,
|
|
"loss": 5.8508,
|
|
"mean_token_accuracy": 0.15215325057506562,
|
|
"num_tokens": 9122718.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 5.877714014053344,
|
|
"epoch": 0.4162990968283974,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004987548736799882,
|
|
"loss": 5.8851,
|
|
"mean_token_accuracy": 0.13938734084367752,
|
|
"num_tokens": 9131855.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 5.866538429260254,
|
|
"epoch": 0.4167191766435623,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004987517255761248,
|
|
"loss": 5.7248,
|
|
"mean_token_accuracy": 0.14940666258335114,
|
|
"num_tokens": 9141102.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 5.806973934173584,
|
|
"epoch": 0.4171392564587272,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004987485735086148,
|
|
"loss": 5.8043,
|
|
"mean_token_accuracy": 0.14497776329517365,
|
|
"num_tokens": 9150552.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 5.940771627426147,
|
|
"epoch": 0.417559336273892,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000498745417477514,
|
|
"loss": 5.7927,
|
|
"mean_token_accuracy": 0.14460284858942032,
|
|
"num_tokens": 9160105.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 5.864925670623779,
|
|
"epoch": 0.4179794160890569,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004987422574828784,
|
|
"loss": 5.7728,
|
|
"mean_token_accuracy": 0.14519683197140693,
|
|
"num_tokens": 9169367.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 5.846901607513428,
|
|
"epoch": 0.4183994959042218,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004987390935247639,
|
|
"loss": 5.6568,
|
|
"mean_token_accuracy": 0.15195999220013617,
|
|
"num_tokens": 9177872.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 5.892278623580933,
|
|
"epoch": 0.41881957571938666,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004987359256032265,
|
|
"loss": 5.8728,
|
|
"mean_token_accuracy": 0.1392049200832844,
|
|
"num_tokens": 9187879.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 5.834523773193359,
|
|
"epoch": 0.41923965553455156,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004987327537183225,
|
|
"loss": 5.7865,
|
|
"mean_token_accuracy": 0.14359964653849602,
|
|
"num_tokens": 9198281.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 5.898417997360229,
|
|
"epoch": 0.41965973534971646,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004987295778701078,
|
|
"loss": 5.7784,
|
|
"mean_token_accuracy": 0.1480983316898346,
|
|
"num_tokens": 9207670.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 5.903277587890625,
|
|
"epoch": 0.42007981516488135,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000498726398058639,
|
|
"loss": 5.7986,
|
|
"mean_token_accuracy": 0.1475730612874031,
|
|
"num_tokens": 9216995.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 5.920054292678833,
|
|
"epoch": 0.4204998949800462,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004987232142839723,
|
|
"loss": 5.8785,
|
|
"mean_token_accuracy": 0.13731264397501947,
|
|
"num_tokens": 9227330.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 5.861970615386963,
|
|
"epoch": 0.4209199747952111,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987200265461638,
|
|
"loss": 5.7885,
|
|
"mean_token_accuracy": 0.15134866386651993,
|
|
"num_tokens": 9236666.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 5.934697484970092,
|
|
"epoch": 0.421340054610376,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987168348452705,
|
|
"loss": 5.7864,
|
|
"mean_token_accuracy": 0.144124399125576,
|
|
"num_tokens": 9246388.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 5.8499044418334964,
|
|
"epoch": 0.42176013442554083,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004987136391813485,
|
|
"loss": 5.7404,
|
|
"mean_token_accuracy": 0.15391666144132615,
|
|
"num_tokens": 9255239.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 5.773643350601196,
|
|
"epoch": 0.42218021424070573,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987104395544547,
|
|
"loss": 5.7252,
|
|
"mean_token_accuracy": 0.14332954734563827,
|
|
"num_tokens": 9264468.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 5.859898376464844,
|
|
"epoch": 0.42260029405587063,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987072359646455,
|
|
"loss": 5.7927,
|
|
"mean_token_accuracy": 0.15058641731739045,
|
|
"num_tokens": 9274140.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 5.917972660064697,
|
|
"epoch": 0.42302037387103547,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987040284119778,
|
|
"loss": 5.7586,
|
|
"mean_token_accuracy": 0.1428128033876419,
|
|
"num_tokens": 9283539.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 5.781129264831543,
|
|
"epoch": 0.42344045368620037,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004987008168965087,
|
|
"loss": 5.7728,
|
|
"mean_token_accuracy": 0.14332580342888832,
|
|
"num_tokens": 9292664.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 5.946068525314331,
|
|
"epoch": 0.42386053350136527,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004986976014182946,
|
|
"loss": 5.8657,
|
|
"mean_token_accuracy": 0.14432715028524398,
|
|
"num_tokens": 9302814.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 5.980961608886719,
|
|
"epoch": 0.42428061331653016,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986943819773927,
|
|
"loss": 5.858,
|
|
"mean_token_accuracy": 0.14330325573682784,
|
|
"num_tokens": 9312654.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 5.9505743980407715,
|
|
"epoch": 0.424700693131695,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00049869115857386,
|
|
"loss": 5.8737,
|
|
"mean_token_accuracy": 0.13669376373291015,
|
|
"num_tokens": 9322271.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 5.951388359069824,
|
|
"epoch": 0.4251207729468599,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004986879312077536,
|
|
"loss": 5.8193,
|
|
"mean_token_accuracy": 0.14102528542280196,
|
|
"num_tokens": 9331341.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 5.834031820297241,
|
|
"epoch": 0.4255408527620248,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004986846998791308,
|
|
"loss": 5.7561,
|
|
"mean_token_accuracy": 0.1436670668423176,
|
|
"num_tokens": 9339863.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 5.811039066314697,
|
|
"epoch": 0.42596093257718964,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986814645880485,
|
|
"loss": 5.7236,
|
|
"mean_token_accuracy": 0.14669884666800498,
|
|
"num_tokens": 9349488.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 5.830924463272095,
|
|
"epoch": 0.42638101239235454,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986782253345645,
|
|
"loss": 5.7333,
|
|
"mean_token_accuracy": 0.14323149994015694,
|
|
"num_tokens": 9357977.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 5.839050388336181,
|
|
"epoch": 0.42680109220751944,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004986749821187358,
|
|
"loss": 5.8394,
|
|
"mean_token_accuracy": 0.14253177791833876,
|
|
"num_tokens": 9367449.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 5.939317226409912,
|
|
"epoch": 0.42722117202268434,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00049867173494062,
|
|
"loss": 5.8681,
|
|
"mean_token_accuracy": 0.14768607616424562,
|
|
"num_tokens": 9377070.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 5.813904285430908,
|
|
"epoch": 0.4276412518378492,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986684838002744,
|
|
"loss": 5.6526,
|
|
"mean_token_accuracy": 0.14204483926296235,
|
|
"num_tokens": 9385881.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 5.823819637298584,
|
|
"epoch": 0.4280613316530141,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004986652286977569,
|
|
"loss": 5.7905,
|
|
"mean_token_accuracy": 0.14255458265542983,
|
|
"num_tokens": 9395159.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 5.877113628387451,
|
|
"epoch": 0.428481411468179,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004986619696331252,
|
|
"loss": 5.7486,
|
|
"mean_token_accuracy": 0.14601895585656166,
|
|
"num_tokens": 9404590.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 5.856746768951416,
|
|
"epoch": 0.4289014912833438,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004986587066064367,
|
|
"loss": 5.7708,
|
|
"mean_token_accuracy": 0.1473971426486969,
|
|
"num_tokens": 9414452.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 5.868241453170777,
|
|
"epoch": 0.4293215710985087,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004986554396177494,
|
|
"loss": 5.894,
|
|
"mean_token_accuracy": 0.1396991342306137,
|
|
"num_tokens": 9424004.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 5.933579587936402,
|
|
"epoch": 0.4297416509136736,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004986521686671212,
|
|
"loss": 5.7713,
|
|
"mean_token_accuracy": 0.1551983118057251,
|
|
"num_tokens": 9433487.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 5.856822824478149,
|
|
"epoch": 0.43016173072883845,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00049864889375461,
|
|
"loss": 5.8359,
|
|
"mean_token_accuracy": 0.13958305045962333,
|
|
"num_tokens": 9442742.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 5.880755043029785,
|
|
"epoch": 0.43058181054400335,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004986456148802738,
|
|
"loss": 5.8957,
|
|
"mean_token_accuracy": 0.14121335968375207,
|
|
"num_tokens": 9452550.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 6.039326620101929,
|
|
"epoch": 0.43100189035916825,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004986423320441707,
|
|
"loss": 5.8546,
|
|
"mean_token_accuracy": 0.13762183710932732,
|
|
"num_tokens": 9461920.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 5.904562616348267,
|
|
"epoch": 0.43142197017433315,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004986390452463588,
|
|
"loss": 5.7682,
|
|
"mean_token_accuracy": 0.14276604056358339,
|
|
"num_tokens": 9470817.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 5.710296773910523,
|
|
"epoch": 0.431842049989498,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986357544868964,
|
|
"loss": 5.7258,
|
|
"mean_token_accuracy": 0.15019231289625168,
|
|
"num_tokens": 9479936.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 5.892205905914307,
|
|
"epoch": 0.4322621298046629,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004986324597658418,
|
|
"loss": 5.7581,
|
|
"mean_token_accuracy": 0.15196042209863664,
|
|
"num_tokens": 9489818.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 5.733763742446899,
|
|
"epoch": 0.4326822096198278,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004986291610832533,
|
|
"loss": 5.7455,
|
|
"mean_token_accuracy": 0.14281522929668428,
|
|
"num_tokens": 9499688.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 5.960237169265747,
|
|
"epoch": 0.4331022894349926,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004986258584391892,
|
|
"loss": 5.8063,
|
|
"mean_token_accuracy": 0.14208860471844673,
|
|
"num_tokens": 9509581.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 6.0035475730896,
|
|
"epoch": 0.4335223692501575,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004986225518337084,
|
|
"loss": 5.89,
|
|
"mean_token_accuracy": 0.143732051551342,
|
|
"num_tokens": 9518556.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 5.81024432182312,
|
|
"epoch": 0.4339424490653224,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004986192412668692,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.14318298548460007,
|
|
"num_tokens": 9527612.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 5.847835922241211,
|
|
"epoch": 0.4343625288804873,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004986159267387302,
|
|
"loss": 5.6856,
|
|
"mean_token_accuracy": 0.1560652643442154,
|
|
"num_tokens": 9535882.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 5.862061595916748,
|
|
"epoch": 0.43478260869565216,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004986126082493502,
|
|
"loss": 5.7914,
|
|
"mean_token_accuracy": 0.14822041988372803,
|
|
"num_tokens": 9544799.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 5.794046545028687,
|
|
"epoch": 0.43520268851081706,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004986092857987881,
|
|
"loss": 5.6968,
|
|
"mean_token_accuracy": 0.15352533906698226,
|
|
"num_tokens": 9553805.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 5.832414722442627,
|
|
"epoch": 0.43562276832598196,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004986059593871026,
|
|
"loss": 5.7414,
|
|
"mean_token_accuracy": 0.14509093537926673,
|
|
"num_tokens": 9563493.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 5.899970149993896,
|
|
"epoch": 0.4360428481411468,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004986026290143527,
|
|
"loss": 5.8201,
|
|
"mean_token_accuracy": 0.14310061410069466,
|
|
"num_tokens": 9572297.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 5.985169315338135,
|
|
"epoch": 0.4364629279563117,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004985992946805973,
|
|
"loss": 5.9499,
|
|
"mean_token_accuracy": 0.1373360723257065,
|
|
"num_tokens": 9581967.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 5.853709316253662,
|
|
"epoch": 0.4368830077714766,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004985959563858955,
|
|
"loss": 5.8611,
|
|
"mean_token_accuracy": 0.14648908525705337,
|
|
"num_tokens": 9590885.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 5.920672750473022,
|
|
"epoch": 0.43730308758664144,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004985926141303066,
|
|
"loss": 5.7766,
|
|
"mean_token_accuracy": 0.14383909106254578,
|
|
"num_tokens": 9599247.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 5.823170852661133,
|
|
"epoch": 0.43772316740180633,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004985892679138896,
|
|
"loss": 5.709,
|
|
"mean_token_accuracy": 0.15263715162873268,
|
|
"num_tokens": 9608296.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 5.922242307662964,
|
|
"epoch": 0.43814324721697123,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985859177367038,
|
|
"loss": 5.7539,
|
|
"mean_token_accuracy": 0.14295759946107864,
|
|
"num_tokens": 9616734.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 5.933417272567749,
|
|
"epoch": 0.43856332703213613,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004985825635988087,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.14136623740196227,
|
|
"num_tokens": 9626246.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 5.840227174758911,
|
|
"epoch": 0.43898340684730097,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004985792055002635,
|
|
"loss": 5.7156,
|
|
"mean_token_accuracy": 0.1447908401489258,
|
|
"num_tokens": 9634963.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 5.864311695098877,
|
|
"epoch": 0.43940348666246587,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004985758434411278,
|
|
"loss": 5.7954,
|
|
"mean_token_accuracy": 0.1492132991552353,
|
|
"num_tokens": 9643615.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.824445819854736,
|
|
"epoch": 0.43982356647763077,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004985724774214613,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14679911136627197,
|
|
"num_tokens": 9653306.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 5.8889368057250975,
|
|
"epoch": 0.4402436462927956,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004985691074413233,
|
|
"loss": 5.7966,
|
|
"mean_token_accuracy": 0.1408935308456421,
|
|
"num_tokens": 9662389.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 5.806066703796387,
|
|
"epoch": 0.4406637261079605,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004985657335007739,
|
|
"loss": 5.7659,
|
|
"mean_token_accuracy": 0.14551339596509932,
|
|
"num_tokens": 9671183.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 5.852633047103882,
|
|
"epoch": 0.4410838059231254,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004985623555998725,
|
|
"loss": 5.778,
|
|
"mean_token_accuracy": 0.1539351999759674,
|
|
"num_tokens": 9680544.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 5.867886209487915,
|
|
"epoch": 0.4415038857382903,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985589737386791,
|
|
"loss": 5.8053,
|
|
"mean_token_accuracy": 0.1449089080095291,
|
|
"num_tokens": 9690137.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 5.847021532058716,
|
|
"epoch": 0.44192396555345514,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004985555879172535,
|
|
"loss": 5.7433,
|
|
"mean_token_accuracy": 0.14687602072954178,
|
|
"num_tokens": 9699149.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.898943853378296,
|
|
"epoch": 0.44234404536862004,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000498552198135656,
|
|
"loss": 5.8097,
|
|
"mean_token_accuracy": 0.15019679218530654,
|
|
"num_tokens": 9709308.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 5.844637632369995,
|
|
"epoch": 0.44276412518378494,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004985488043939462,
|
|
"loss": 5.7573,
|
|
"mean_token_accuracy": 0.1442711167037487,
|
|
"num_tokens": 9718462.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 5.853937387466431,
|
|
"epoch": 0.4431842049989498,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004985454066921846,
|
|
"loss": 5.6905,
|
|
"mean_token_accuracy": 0.1537187710404396,
|
|
"num_tokens": 9727626.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 5.747472763061523,
|
|
"epoch": 0.4436042848141147,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004985420050304312,
|
|
"loss": 5.7068,
|
|
"mean_token_accuracy": 0.1498991407454014,
|
|
"num_tokens": 9737091.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 5.846937942504883,
|
|
"epoch": 0.4440243646292796,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004985385994087462,
|
|
"loss": 5.7867,
|
|
"mean_token_accuracy": 0.14585647359490395,
|
|
"num_tokens": 9746135.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 5.949729108810425,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004985351898271901,
|
|
"loss": 5.719,
|
|
"mean_token_accuracy": 0.1520434781908989,
|
|
"num_tokens": 9754549.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 5.887947463989258,
|
|
"epoch": 0.4448645242596093,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004985317762858231,
|
|
"loss": 5.8567,
|
|
"mean_token_accuracy": 0.14025997146964073,
|
|
"num_tokens": 9764219.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 5.871951913833618,
|
|
"epoch": 0.4452846040747742,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000498528358784706,
|
|
"loss": 5.6972,
|
|
"mean_token_accuracy": 0.15001460164785385,
|
|
"num_tokens": 9772234.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 5.811316633224488,
|
|
"epoch": 0.4457046838899391,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000498524937323899,
|
|
"loss": 5.7622,
|
|
"mean_token_accuracy": 0.15125853270292283,
|
|
"num_tokens": 9781417.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 5.981836175918579,
|
|
"epoch": 0.44612476370510395,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004985215119034628,
|
|
"loss": 5.8763,
|
|
"mean_token_accuracy": 0.13692381381988525,
|
|
"num_tokens": 9791286.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 5.866169118881226,
|
|
"epoch": 0.44654484352026885,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985180825234582,
|
|
"loss": 5.8755,
|
|
"mean_token_accuracy": 0.13873762115836144,
|
|
"num_tokens": 9802157.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 5.981353807449341,
|
|
"epoch": 0.44696492333543375,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004985146491839459,
|
|
"loss": 5.8547,
|
|
"mean_token_accuracy": 0.1320488214492798,
|
|
"num_tokens": 9812646.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 5.9978625774383545,
|
|
"epoch": 0.4473850031505986,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004985112118849865,
|
|
"loss": 5.8664,
|
|
"mean_token_accuracy": 0.13918881937861444,
|
|
"num_tokens": 9822274.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 5.781670093536377,
|
|
"epoch": 0.4478050829657635,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004985077706266412,
|
|
"loss": 5.6507,
|
|
"mean_token_accuracy": 0.14431787207722663,
|
|
"num_tokens": 9831337.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 5.797645950317383,
|
|
"epoch": 0.4482251627809284,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004985043254089708,
|
|
"loss": 5.8111,
|
|
"mean_token_accuracy": 0.13542471826076508,
|
|
"num_tokens": 9840798.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 5.871469783782959,
|
|
"epoch": 0.44864524259609323,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004985008762320364,
|
|
"loss": 5.7666,
|
|
"mean_token_accuracy": 0.14363950192928315,
|
|
"num_tokens": 9850117.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 5.885560655593872,
|
|
"epoch": 0.4490653224112581,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000498497423095899,
|
|
"loss": 5.7176,
|
|
"mean_token_accuracy": 0.15319354236125945,
|
|
"num_tokens": 9858227.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 5.810570764541626,
|
|
"epoch": 0.449485402226423,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004984939660006199,
|
|
"loss": 5.8079,
|
|
"mean_token_accuracy": 0.14338937029242516,
|
|
"num_tokens": 9867157.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 5.811974906921387,
|
|
"epoch": 0.4499054820415879,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004984905049462602,
|
|
"loss": 5.7349,
|
|
"mean_token_accuracy": 0.144259013235569,
|
|
"num_tokens": 9877045.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 5.959705638885498,
|
|
"epoch": 0.45032556185675277,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004984870399328814,
|
|
"loss": 5.8617,
|
|
"mean_token_accuracy": 0.14245471283793448,
|
|
"num_tokens": 9886637.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 5.816979646682739,
|
|
"epoch": 0.45074564167191766,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004984835709605446,
|
|
"loss": 5.7271,
|
|
"mean_token_accuracy": 0.15511318892240525,
|
|
"num_tokens": 9895601.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 5.86139702796936,
|
|
"epoch": 0.45116572148708256,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004984800980293116,
|
|
"loss": 5.8807,
|
|
"mean_token_accuracy": 0.14196527227759362,
|
|
"num_tokens": 9904775.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.883301210403443,
|
|
"epoch": 0.4515858013022474,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004984766211392435,
|
|
"loss": 5.8184,
|
|
"mean_token_accuracy": 0.13878512308001517,
|
|
"num_tokens": 9913795.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 5.856382942199707,
|
|
"epoch": 0.4520058811174123,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004984731402904024,
|
|
"loss": 5.6546,
|
|
"mean_token_accuracy": 0.15193988084793092,
|
|
"num_tokens": 9922576.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 5.768913459777832,
|
|
"epoch": 0.4524259609325772,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004984696554828496,
|
|
"loss": 5.6446,
|
|
"mean_token_accuracy": 0.15225213021039963,
|
|
"num_tokens": 9930971.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 5.856381464004516,
|
|
"epoch": 0.4528460407477421,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004984661667166468,
|
|
"loss": 5.7606,
|
|
"mean_token_accuracy": 0.1514030024409294,
|
|
"num_tokens": 9939628.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 5.887900066375733,
|
|
"epoch": 0.45326612056290694,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004984626739918561,
|
|
"loss": 5.7294,
|
|
"mean_token_accuracy": 0.15370103269815444,
|
|
"num_tokens": 9948397.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 5.8639452934265135,
|
|
"epoch": 0.45368620037807184,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004984591773085391,
|
|
"loss": 5.8108,
|
|
"mean_token_accuracy": 0.14718640744686126,
|
|
"num_tokens": 9957683.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 5.911360502243042,
|
|
"epoch": 0.45410628019323673,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984556766667578,
|
|
"loss": 5.7938,
|
|
"mean_token_accuracy": 0.14773029685020447,
|
|
"num_tokens": 9966756.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 5.876928043365479,
|
|
"epoch": 0.4545263600084016,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004984521720665743,
|
|
"loss": 5.7996,
|
|
"mean_token_accuracy": 0.1499388188123703,
|
|
"num_tokens": 9976000.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 5.9389279842376705,
|
|
"epoch": 0.4549464398235665,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984486635080507,
|
|
"loss": 5.7922,
|
|
"mean_token_accuracy": 0.146384534239769,
|
|
"num_tokens": 9985509.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 5.7951904296875,
|
|
"epoch": 0.45536651963873137,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004984451509912489,
|
|
"loss": 5.744,
|
|
"mean_token_accuracy": 0.1474005714058876,
|
|
"num_tokens": 9994342.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 5.838972473144532,
|
|
"epoch": 0.4557865994538962,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004984416345162315,
|
|
"loss": 5.7889,
|
|
"mean_token_accuracy": 0.14537926837801934,
|
|
"num_tokens": 10004249.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.8457417488098145,
|
|
"epoch": 0.4562066792690611,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984381140830605,
|
|
"loss": 5.7485,
|
|
"mean_token_accuracy": 0.14723600521683694,
|
|
"num_tokens": 10012430.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 5.878772354125976,
|
|
"epoch": 0.456626759084226,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004984345896917984,
|
|
"loss": 5.7605,
|
|
"mean_token_accuracy": 0.14340553283691407,
|
|
"num_tokens": 10021434.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 5.859716320037842,
|
|
"epoch": 0.4570468388993909,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984310613425076,
|
|
"loss": 5.7662,
|
|
"mean_token_accuracy": 0.1505170688033104,
|
|
"num_tokens": 10030473.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.890053796768188,
|
|
"epoch": 0.45746691871455575,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004984275290352506,
|
|
"loss": 5.7347,
|
|
"mean_token_accuracy": 0.1503530338406563,
|
|
"num_tokens": 10039057.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 5.906252813339234,
|
|
"epoch": 0.45788699852972065,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004984239927700899,
|
|
"loss": 5.8309,
|
|
"mean_token_accuracy": 0.14800925105810164,
|
|
"num_tokens": 10047998.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 5.96235499382019,
|
|
"epoch": 0.45830707834488554,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004984204525470883,
|
|
"loss": 5.7626,
|
|
"mean_token_accuracy": 0.14305243864655495,
|
|
"num_tokens": 10057479.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 5.773991537094116,
|
|
"epoch": 0.4587271581600504,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004984169083663084,
|
|
"loss": 5.7318,
|
|
"mean_token_accuracy": 0.14002140685915948,
|
|
"num_tokens": 10067754.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.805001163482666,
|
|
"epoch": 0.4591472379752153,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004984133602278129,
|
|
"loss": 5.8253,
|
|
"mean_token_accuracy": 0.1421283006668091,
|
|
"num_tokens": 10076815.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 6.033328580856323,
|
|
"epoch": 0.4595673177903802,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000498409808131665,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.14671371206641198,
|
|
"num_tokens": 10086300.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 5.823101377487182,
|
|
"epoch": 0.4599873976055451,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984062520779272,
|
|
"loss": 5.7259,
|
|
"mean_token_accuracy": 0.1552243560552597,
|
|
"num_tokens": 10095383.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 5.773621034622193,
|
|
"epoch": 0.4604074774207099,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984026920666628,
|
|
"loss": 5.7019,
|
|
"mean_token_accuracy": 0.1514463573694229,
|
|
"num_tokens": 10103971.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 5.798014068603516,
|
|
"epoch": 0.4608275572358748,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004983991280979347,
|
|
"loss": 5.6971,
|
|
"mean_token_accuracy": 0.1502104952931404,
|
|
"num_tokens": 10113028.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.823189973831177,
|
|
"epoch": 0.4612476370510397,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004983955601718061,
|
|
"loss": 5.6819,
|
|
"mean_token_accuracy": 0.14814986884593964,
|
|
"num_tokens": 10121890.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 5.896232748031617,
|
|
"epoch": 0.46166771686620456,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983919882883401,
|
|
"loss": 5.8089,
|
|
"mean_token_accuracy": 0.1452305495738983,
|
|
"num_tokens": 10131655.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 5.876237583160401,
|
|
"epoch": 0.46208779668136946,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004983884124476,
|
|
"loss": 5.8051,
|
|
"mean_token_accuracy": 0.14433109760284424,
|
|
"num_tokens": 10140778.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 5.897982120513916,
|
|
"epoch": 0.46250787649653435,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983848326496494,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.1391661711037159,
|
|
"num_tokens": 10150229.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 5.943829345703125,
|
|
"epoch": 0.4629279563116992,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983812488945513,
|
|
"loss": 5.7502,
|
|
"mean_token_accuracy": 0.14314467534422876,
|
|
"num_tokens": 10158939.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 5.819750833511352,
|
|
"epoch": 0.4633480361268641,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004983776611823696,
|
|
"loss": 5.7489,
|
|
"mean_token_accuracy": 0.14325918182730674,
|
|
"num_tokens": 10168383.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 5.7525170803070065,
|
|
"epoch": 0.463768115942029,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004983740695131676,
|
|
"loss": 5.7483,
|
|
"mean_token_accuracy": 0.1506567046046257,
|
|
"num_tokens": 10178678.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 5.8393933296203615,
|
|
"epoch": 0.4641881957571939,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000498370473887009,
|
|
"loss": 5.7404,
|
|
"mean_token_accuracy": 0.1451387256383896,
|
|
"num_tokens": 10188964.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 5.9242652416229244,
|
|
"epoch": 0.46460827557235873,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004983668743039573,
|
|
"loss": 5.7722,
|
|
"mean_token_accuracy": 0.15323825627565385,
|
|
"num_tokens": 10198333.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 5.789677238464355,
|
|
"epoch": 0.46502835538752363,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983632707640766,
|
|
"loss": 5.7876,
|
|
"mean_token_accuracy": 0.14813560321927072,
|
|
"num_tokens": 10207876.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 5.812788200378418,
|
|
"epoch": 0.4654484352026885,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004983596632674306,
|
|
"loss": 5.7229,
|
|
"mean_token_accuracy": 0.14903474599123,
|
|
"num_tokens": 10216822.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 5.883552932739258,
|
|
"epoch": 0.46586851501785337,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983560518140831,
|
|
"loss": 5.8344,
|
|
"mean_token_accuracy": 0.139993616938591,
|
|
"num_tokens": 10226887.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 5.850424337387085,
|
|
"epoch": 0.46628859483301827,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004983524364040982,
|
|
"loss": 5.7004,
|
|
"mean_token_accuracy": 0.1548854097723961,
|
|
"num_tokens": 10235935.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 5.844246101379395,
|
|
"epoch": 0.46670867464818316,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983488170375399,
|
|
"loss": 5.6405,
|
|
"mean_token_accuracy": 0.1503463476896286,
|
|
"num_tokens": 10245590.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 5.735381555557251,
|
|
"epoch": 0.46712875446334806,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983451937144723,
|
|
"loss": 5.7345,
|
|
"mean_token_accuracy": 0.1456381857395172,
|
|
"num_tokens": 10255104.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 5.7118124008178714,
|
|
"epoch": 0.4675488342785129,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004983415664349595,
|
|
"loss": 5.6004,
|
|
"mean_token_accuracy": 0.16290194243192674,
|
|
"num_tokens": 10264236.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 5.817228507995606,
|
|
"epoch": 0.4679689140936778,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004983379351990659,
|
|
"loss": 5.7056,
|
|
"mean_token_accuracy": 0.1503439575433731,
|
|
"num_tokens": 10273335.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 5.7475629329681395,
|
|
"epoch": 0.4683889939088427,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004983343000068559,
|
|
"loss": 5.6682,
|
|
"mean_token_accuracy": 0.1495598793029785,
|
|
"num_tokens": 10282206.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 5.688462829589843,
|
|
"epoch": 0.46880907372400754,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004983306608583937,
|
|
"loss": 5.6189,
|
|
"mean_token_accuracy": 0.16340474039316177,
|
|
"num_tokens": 10290056.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 5.7730052947998045,
|
|
"epoch": 0.46922915353917244,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004983270177537438,
|
|
"loss": 5.7028,
|
|
"mean_token_accuracy": 0.14809525161981582,
|
|
"num_tokens": 10299726.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 5.84525089263916,
|
|
"epoch": 0.46964923335433734,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983233706929708,
|
|
"loss": 5.7725,
|
|
"mean_token_accuracy": 0.1471342384815216,
|
|
"num_tokens": 10308696.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 5.880400562286377,
|
|
"epoch": 0.4700693131695022,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004983197196761392,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.14054280817508696,
|
|
"num_tokens": 10317845.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 5.84756875038147,
|
|
"epoch": 0.4704893929846671,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004983160647033139,
|
|
"loss": 5.737,
|
|
"mean_token_accuracy": 0.150573068857193,
|
|
"num_tokens": 10326563.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 5.826395320892334,
|
|
"epoch": 0.470909472799832,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004983124057745595,
|
|
"loss": 5.7235,
|
|
"mean_token_accuracy": 0.14374103918671607,
|
|
"num_tokens": 10335931.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 5.76983675956726,
|
|
"epoch": 0.47132955261499687,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004983087428899408,
|
|
"loss": 5.7216,
|
|
"mean_token_accuracy": 0.1377339854836464,
|
|
"num_tokens": 10344984.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 5.842723369598389,
|
|
"epoch": 0.4717496324301617,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004983050760495227,
|
|
"loss": 5.7638,
|
|
"mean_token_accuracy": 0.14885966181755067,
|
|
"num_tokens": 10353522.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 5.915482044219971,
|
|
"epoch": 0.4721697122453266,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983014052533702,
|
|
"loss": 5.7678,
|
|
"mean_token_accuracy": 0.14949656873941422,
|
|
"num_tokens": 10363527.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.765365362167358,
|
|
"epoch": 0.4725897920604915,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004982977305015481,
|
|
"loss": 5.6942,
|
|
"mean_token_accuracy": 0.1467475950717926,
|
|
"num_tokens": 10372040.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 5.808851623535157,
|
|
"epoch": 0.47300987187565635,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004982940517941219,
|
|
"loss": 5.6732,
|
|
"mean_token_accuracy": 0.14801965281367302,
|
|
"num_tokens": 10381279.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 5.891337108612061,
|
|
"epoch": 0.47342995169082125,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004982903691311564,
|
|
"loss": 5.8457,
|
|
"mean_token_accuracy": 0.1401650868356228,
|
|
"num_tokens": 10390608.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 5.811560487747192,
|
|
"epoch": 0.47385003150598615,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004982866825127172,
|
|
"loss": 5.6437,
|
|
"mean_token_accuracy": 0.1533919870853424,
|
|
"num_tokens": 10399851.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 5.952455997467041,
|
|
"epoch": 0.47427011132115104,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004982829919388692,
|
|
"loss": 5.9303,
|
|
"mean_token_accuracy": 0.1413193352520466,
|
|
"num_tokens": 10410425.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.829264545440674,
|
|
"epoch": 0.4746901911363159,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004982792974096781,
|
|
"loss": 5.6844,
|
|
"mean_token_accuracy": 0.15058013647794724,
|
|
"num_tokens": 10418783.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 5.883219861984253,
|
|
"epoch": 0.4751102709514808,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000498275598925209,
|
|
"loss": 5.8575,
|
|
"mean_token_accuracy": 0.14019499495625495,
|
|
"num_tokens": 10427360.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 5.982011365890503,
|
|
"epoch": 0.4755303507666457,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004982718964855277,
|
|
"loss": 5.8116,
|
|
"mean_token_accuracy": 0.14399669840931892,
|
|
"num_tokens": 10436613.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 5.872733783721924,
|
|
"epoch": 0.4759504305818105,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982681900907,
|
|
"loss": 5.8526,
|
|
"mean_token_accuracy": 0.1458025962114334,
|
|
"num_tokens": 10445055.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.826623582839966,
|
|
"epoch": 0.4763705103969754,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000498264479740791,
|
|
"loss": 5.6666,
|
|
"mean_token_accuracy": 0.15394981056451798,
|
|
"num_tokens": 10454516.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 5.948064708709717,
|
|
"epoch": 0.4767905902121403,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004982607654358668,
|
|
"loss": 5.8096,
|
|
"mean_token_accuracy": 0.147859063744545,
|
|
"num_tokens": 10463771.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 5.835044527053833,
|
|
"epoch": 0.47721067002730516,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000498257047175993,
|
|
"loss": 5.7488,
|
|
"mean_token_accuracy": 0.142615008354187,
|
|
"num_tokens": 10473783.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 5.83440375328064,
|
|
"epoch": 0.47763074984247006,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004982533249612357,
|
|
"loss": 5.6997,
|
|
"mean_token_accuracy": 0.14993957430124283,
|
|
"num_tokens": 10483424.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 5.763900947570801,
|
|
"epoch": 0.47805082965763496,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004982495987916607,
|
|
"loss": 5.6455,
|
|
"mean_token_accuracy": 0.15347654670476912,
|
|
"num_tokens": 10492536.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 5.8370520114898685,
|
|
"epoch": 0.47847090947279985,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004982458686673339,
|
|
"loss": 5.7578,
|
|
"mean_token_accuracy": 0.14936625212430954,
|
|
"num_tokens": 10501616.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 5.956824541091919,
|
|
"epoch": 0.4788909892879647,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004982421345883217,
|
|
"loss": 5.8031,
|
|
"mean_token_accuracy": 0.14071496576070786,
|
|
"num_tokens": 10511190.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 5.793789196014404,
|
|
"epoch": 0.4793110691031296,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004982383965546898,
|
|
"loss": 5.7381,
|
|
"mean_token_accuracy": 0.144473847001791,
|
|
"num_tokens": 10520310.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 5.833015632629395,
|
|
"epoch": 0.4797311489182945,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004982346545665048,
|
|
"loss": 5.6941,
|
|
"mean_token_accuracy": 0.1467716298997402,
|
|
"num_tokens": 10528711.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 5.8455291271209715,
|
|
"epoch": 0.48015122873345933,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004982309086238328,
|
|
"loss": 5.8016,
|
|
"mean_token_accuracy": 0.14259516224265098,
|
|
"num_tokens": 10538484.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 5.898940181732177,
|
|
"epoch": 0.48057130854862423,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004982271587267403,
|
|
"loss": 5.747,
|
|
"mean_token_accuracy": 0.14794613867998124,
|
|
"num_tokens": 10547623.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 5.868904733657837,
|
|
"epoch": 0.48099138836378913,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004982234048752935,
|
|
"loss": 5.6997,
|
|
"mean_token_accuracy": 0.14849727526307105,
|
|
"num_tokens": 10556234.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 5.9389198303222654,
|
|
"epoch": 0.481411468178954,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000498219647069559,
|
|
"loss": 5.9273,
|
|
"mean_token_accuracy": 0.13982586190104485,
|
|
"num_tokens": 10566308.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 5.836957883834839,
|
|
"epoch": 0.48183154799411887,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004982158853096035,
|
|
"loss": 5.8519,
|
|
"mean_token_accuracy": 0.1417085811495781,
|
|
"num_tokens": 10575212.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.8836267471313475,
|
|
"epoch": 0.48225162780928377,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982121195954935,
|
|
"loss": 5.6287,
|
|
"mean_token_accuracy": 0.15638786405324936,
|
|
"num_tokens": 10584590.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 5.817459297180176,
|
|
"epoch": 0.48267170762444866,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004982083499272957,
|
|
"loss": 5.7007,
|
|
"mean_token_accuracy": 0.14900539070367813,
|
|
"num_tokens": 10593997.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 5.799760389328003,
|
|
"epoch": 0.4830917874396135,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004982045763050768,
|
|
"loss": 5.8291,
|
|
"mean_token_accuracy": 0.1467505380511284,
|
|
"num_tokens": 10603299.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 5.825570392608642,
|
|
"epoch": 0.4835118672547784,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004982007987289041,
|
|
"loss": 5.7641,
|
|
"mean_token_accuracy": 0.14574431553483008,
|
|
"num_tokens": 10613546.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 5.833213567733765,
|
|
"epoch": 0.4839319470699433,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004981970171988439,
|
|
"loss": 5.7267,
|
|
"mean_token_accuracy": 0.15680563673377038,
|
|
"num_tokens": 10622966.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 5.918120956420898,
|
|
"epoch": 0.48435202688510814,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004981932317149636,
|
|
"loss": 5.8074,
|
|
"mean_token_accuracy": 0.14230270087718963,
|
|
"num_tokens": 10633441.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 5.926499748229981,
|
|
"epoch": 0.48477210670027304,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00049818944227733,
|
|
"loss": 5.7829,
|
|
"mean_token_accuracy": 0.145944182574749,
|
|
"num_tokens": 10643124.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 5.8368360042572025,
|
|
"epoch": 0.48519218651543794,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004981856488860105,
|
|
"loss": 5.75,
|
|
"mean_token_accuracy": 0.14405592083930968,
|
|
"num_tokens": 10652517.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 5.827040672302246,
|
|
"epoch": 0.48561226633060284,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004981818515410721,
|
|
"loss": 5.8018,
|
|
"mean_token_accuracy": 0.14195797815918923,
|
|
"num_tokens": 10663352.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 5.911312675476074,
|
|
"epoch": 0.4860323461457677,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004981780502425821,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.14514586478471755,
|
|
"num_tokens": 10672430.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 5.858085298538208,
|
|
"epoch": 0.4864524259609326,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004981742449906079,
|
|
"loss": 5.7778,
|
|
"mean_token_accuracy": 0.15105650201439857,
|
|
"num_tokens": 10681908.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 5.876479959487915,
|
|
"epoch": 0.4868725057760975,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004981704357852168,
|
|
"loss": 5.7501,
|
|
"mean_token_accuracy": 0.1459008663892746,
|
|
"num_tokens": 10691259.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 5.803030967712402,
|
|
"epoch": 0.4872925855912623,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004981666226264764,
|
|
"loss": 5.6514,
|
|
"mean_token_accuracy": 0.14785986095666886,
|
|
"num_tokens": 10699668.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 5.827937030792237,
|
|
"epoch": 0.4877126654064272,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004981628055144542,
|
|
"loss": 5.7065,
|
|
"mean_token_accuracy": 0.15127545595169067,
|
|
"num_tokens": 10709146.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 5.876874828338623,
|
|
"epoch": 0.4881327452215921,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004981589844492177,
|
|
"loss": 5.8008,
|
|
"mean_token_accuracy": 0.13951031863689423,
|
|
"num_tokens": 10718724.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 5.814950895309448,
|
|
"epoch": 0.488552825036757,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004981551594308349,
|
|
"loss": 5.7424,
|
|
"mean_token_accuracy": 0.14747670367360116,
|
|
"num_tokens": 10728101.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 5.938137483596802,
|
|
"epoch": 0.48897290485192185,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004981513304593733,
|
|
"loss": 5.7721,
|
|
"mean_token_accuracy": 0.15057093650102615,
|
|
"num_tokens": 10736750.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 5.9004603862762455,
|
|
"epoch": 0.48939298466708675,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004981474975349006,
|
|
"loss": 5.9573,
|
|
"mean_token_accuracy": 0.143083293735981,
|
|
"num_tokens": 10746914.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 5.944899702072144,
|
|
"epoch": 0.48981306448225165,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000498143660657485,
|
|
"loss": 5.7841,
|
|
"mean_token_accuracy": 0.14469311460852624,
|
|
"num_tokens": 10755786.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 5.719291877746582,
|
|
"epoch": 0.4902331442974165,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004981398198271944,
|
|
"loss": 5.6544,
|
|
"mean_token_accuracy": 0.15054057389497758,
|
|
"num_tokens": 10764821.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 5.821346855163574,
|
|
"epoch": 0.4906532241125814,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004981359750440968,
|
|
"loss": 5.7381,
|
|
"mean_token_accuracy": 0.14619418531656264,
|
|
"num_tokens": 10773569.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 5.812557601928711,
|
|
"epoch": 0.4910733039277463,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004981321263082603,
|
|
"loss": 5.7233,
|
|
"mean_token_accuracy": 0.14379709362983703,
|
|
"num_tokens": 10782298.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 5.7633030891418455,
|
|
"epoch": 0.4914933837429111,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000498128273619753,
|
|
"loss": 5.6964,
|
|
"mean_token_accuracy": 0.15067172646522523,
|
|
"num_tokens": 10792087.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.826433086395264,
|
|
"epoch": 0.491913463558076,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004981244169786433,
|
|
"loss": 5.7863,
|
|
"mean_token_accuracy": 0.14527801647782326,
|
|
"num_tokens": 10801641.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 5.962628364562988,
|
|
"epoch": 0.4923335433732409,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004981205563849994,
|
|
"loss": 5.8636,
|
|
"mean_token_accuracy": 0.1445979543030262,
|
|
"num_tokens": 10811612.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 5.84666166305542,
|
|
"epoch": 0.4927536231884058,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004981166918388897,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.1496157467365265,
|
|
"num_tokens": 10821608.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.758074522018433,
|
|
"epoch": 0.49317370300357066,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004981128233403828,
|
|
"loss": 5.6341,
|
|
"mean_token_accuracy": 0.15541895031929015,
|
|
"num_tokens": 10830679.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.810383653640747,
|
|
"epoch": 0.49359378281873556,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000498108950889547,
|
|
"loss": 5.7028,
|
|
"mean_token_accuracy": 0.15059976279735565,
|
|
"num_tokens": 10839669.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 5.813056564331054,
|
|
"epoch": 0.49401386263390046,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004981050744864512,
|
|
"loss": 5.6876,
|
|
"mean_token_accuracy": 0.14685238003730774,
|
|
"num_tokens": 10849666.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 5.78202338218689,
|
|
"epoch": 0.4944339424490653,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004981011941311638,
|
|
"loss": 5.6093,
|
|
"mean_token_accuracy": 0.1536119759082794,
|
|
"num_tokens": 10858225.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 5.7550591945648195,
|
|
"epoch": 0.4948540222642302,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004980973098237535,
|
|
"loss": 5.7246,
|
|
"mean_token_accuracy": 0.14252085834741593,
|
|
"num_tokens": 10867466.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.849875020980835,
|
|
"epoch": 0.4952741020793951,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004980934215642894,
|
|
"loss": 5.7463,
|
|
"mean_token_accuracy": 0.151506906747818,
|
|
"num_tokens": 10875850.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.780202579498291,
|
|
"epoch": 0.49569418189456,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00049808952935284,
|
|
"loss": 5.6809,
|
|
"mean_token_accuracy": 0.15422153174877168,
|
|
"num_tokens": 10885154.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 5.7728334903717045,
|
|
"epoch": 0.49611426170972484,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004980856331894747,
|
|
"loss": 5.7714,
|
|
"mean_token_accuracy": 0.14351727366447448,
|
|
"num_tokens": 10894080.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 5.794958066940308,
|
|
"epoch": 0.49653434152488973,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004980817330742621,
|
|
"loss": 5.7728,
|
|
"mean_token_accuracy": 0.1406318761408329,
|
|
"num_tokens": 10903248.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 5.890414190292359,
|
|
"epoch": 0.49695442134005463,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004980778290072716,
|
|
"loss": 5.7344,
|
|
"mean_token_accuracy": 0.1520361930131912,
|
|
"num_tokens": 10912939.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 5.844255971908569,
|
|
"epoch": 0.4973745011552195,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004980739209885722,
|
|
"loss": 5.7519,
|
|
"mean_token_accuracy": 0.14798953309655188,
|
|
"num_tokens": 10921505.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 5.894140291213989,
|
|
"epoch": 0.49779458097038437,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004980700090182331,
|
|
"loss": 5.8334,
|
|
"mean_token_accuracy": 0.14881108254194259,
|
|
"num_tokens": 10931861.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 5.870219659805298,
|
|
"epoch": 0.49821466078554927,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004980660930963238,
|
|
"loss": 5.7625,
|
|
"mean_token_accuracy": 0.14495279788970947,
|
|
"num_tokens": 10940810.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 5.808070087432862,
|
|
"epoch": 0.4986347406007141,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004980621732229133,
|
|
"loss": 5.6263,
|
|
"mean_token_accuracy": 0.15171189308166505,
|
|
"num_tokens": 10949514.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 5.853536224365234,
|
|
"epoch": 0.499054820415879,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004980582493980714,
|
|
"loss": 5.8402,
|
|
"mean_token_accuracy": 0.13668815642595292,
|
|
"num_tokens": 10959161.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 5.811306715011597,
|
|
"epoch": 0.4994749002310439,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004980543216218674,
|
|
"loss": 5.7084,
|
|
"mean_token_accuracy": 0.1605042815208435,
|
|
"num_tokens": 10968983.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.838724660873413,
|
|
"epoch": 0.4998949800462088,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004980503898943711,
|
|
"loss": 5.8486,
|
|
"mean_token_accuracy": 0.14541933685541153,
|
|
"num_tokens": 10978044.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 5.919149684906006,
|
|
"epoch": 0.5003150598613737,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004980464542156519,
|
|
"loss": 5.7474,
|
|
"mean_token_accuracy": 0.15162651985883713,
|
|
"num_tokens": 10986980.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 5.8385172367095945,
|
|
"epoch": 0.5007351396765385,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004980425145857796,
|
|
"loss": 5.6939,
|
|
"mean_token_accuracy": 0.15786231756210328,
|
|
"num_tokens": 10995163.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 5.755066156387329,
|
|
"epoch": 0.5011552194917034,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000498038571004824,
|
|
"loss": 5.6211,
|
|
"mean_token_accuracy": 0.159263913333416,
|
|
"num_tokens": 11003722.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 5.732334613800049,
|
|
"epoch": 0.5015752993068683,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004980346234728549,
|
|
"loss": 5.6829,
|
|
"mean_token_accuracy": 0.15636452287435532,
|
|
"num_tokens": 11013176.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 5.856866264343262,
|
|
"epoch": 0.5019953791220332,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004980306719899424,
|
|
"loss": 5.7417,
|
|
"mean_token_accuracy": 0.1482336312532425,
|
|
"num_tokens": 11022636.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 5.81472544670105,
|
|
"epoch": 0.5024154589371981,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004980267165561564,
|
|
"loss": 5.6994,
|
|
"mean_token_accuracy": 0.15061589032411576,
|
|
"num_tokens": 11031896.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 5.8317889213562015,
|
|
"epoch": 0.502835538752363,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004980227571715669,
|
|
"loss": 5.7442,
|
|
"mean_token_accuracy": 0.14868111461400985,
|
|
"num_tokens": 11040802.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 5.817817497253418,
|
|
"epoch": 0.5032556185675279,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004980187938362441,
|
|
"loss": 5.6616,
|
|
"mean_token_accuracy": 0.14449788331985475,
|
|
"num_tokens": 11049701.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 5.8403524398803714,
|
|
"epoch": 0.5036756983826927,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004980148265502581,
|
|
"loss": 5.8553,
|
|
"mean_token_accuracy": 0.1392398163676262,
|
|
"num_tokens": 11059555.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 5.883025121688843,
|
|
"epoch": 0.5040957781978576,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004980108553136795,
|
|
"loss": 5.7762,
|
|
"mean_token_accuracy": 0.14863402545452117,
|
|
"num_tokens": 11068940.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5040957781978576,
|
|
"eval_entropy": 5.732787127158954,
|
|
"eval_loss": 5.7686614990234375,
|
|
"eval_mean_token_accuracy": 0.15331337192289018,
|
|
"eval_num_tokens": 11068940.0,
|
|
"eval_runtime": 27.3892,
|
|
"eval_samples_per_second": 1364.261,
|
|
"eval_steps_per_second": 170.542,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 5.908424186706543,
|
|
"epoch": 0.5045158580130225,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004980068801265783,
|
|
"loss": 5.7414,
|
|
"mean_token_accuracy": 0.14692858532071112,
|
|
"num_tokens": 11079014.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"entropy": 5.866373205184937,
|
|
"epoch": 0.5049359378281874,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004980029009890251,
|
|
"loss": 5.8378,
|
|
"mean_token_accuracy": 0.1466228261590004,
|
|
"num_tokens": 11089526.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 5.839123296737671,
|
|
"epoch": 0.5053560176433523,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004979989179010904,
|
|
"loss": 5.7197,
|
|
"mean_token_accuracy": 0.15178524404764177,
|
|
"num_tokens": 11099156.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"entropy": 5.760820007324218,
|
|
"epoch": 0.5057760974585171,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004979949308628445,
|
|
"loss": 5.7078,
|
|
"mean_token_accuracy": 0.15017148554325105,
|
|
"num_tokens": 11108242.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 5.7764500141143795,
|
|
"epoch": 0.506196177273682,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004979909398743584,
|
|
"loss": 5.7066,
|
|
"mean_token_accuracy": 0.15099107772111892,
|
|
"num_tokens": 11118076.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"entropy": 5.893146562576294,
|
|
"epoch": 0.5066162570888468,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004979869449357026,
|
|
"loss": 5.7766,
|
|
"mean_token_accuracy": 0.15781906694173814,
|
|
"num_tokens": 11127265.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 5.810907363891602,
|
|
"epoch": 0.5070363369040117,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004979829460469478,
|
|
"loss": 5.6965,
|
|
"mean_token_accuracy": 0.1483650103211403,
|
|
"num_tokens": 11136429.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"entropy": 5.813454437255859,
|
|
"epoch": 0.5074564167191766,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979789432081649,
|
|
"loss": 5.7139,
|
|
"mean_token_accuracy": 0.1487409368157387,
|
|
"num_tokens": 11146201.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 5.864733123779297,
|
|
"epoch": 0.5078764965343415,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000497974936419425,
|
|
"loss": 5.7222,
|
|
"mean_token_accuracy": 0.15236361622810363,
|
|
"num_tokens": 11154867.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"entropy": 5.746392869949341,
|
|
"epoch": 0.5082965763495064,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004979709256807989,
|
|
"loss": 5.758,
|
|
"mean_token_accuracy": 0.1480425164103508,
|
|
"num_tokens": 11164092.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 5.840289688110351,
|
|
"epoch": 0.5087166561646713,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004979669109923575,
|
|
"loss": 5.7754,
|
|
"mean_token_accuracy": 0.14666769057512283,
|
|
"num_tokens": 11173176.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"entropy": 5.953520202636719,
|
|
"epoch": 0.5091367359798362,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004979628923541721,
|
|
"loss": 5.7491,
|
|
"mean_token_accuracy": 0.1458544984459877,
|
|
"num_tokens": 11182397.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 5.871777105331421,
|
|
"epoch": 0.509556815795001,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497958869766314,
|
|
"loss": 5.7938,
|
|
"mean_token_accuracy": 0.14472762495279312,
|
|
"num_tokens": 11191790.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"entropy": 5.785938310623169,
|
|
"epoch": 0.5099768956101659,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979548432288543,
|
|
"loss": 5.7104,
|
|
"mean_token_accuracy": 0.1533594697713852,
|
|
"num_tokens": 11201104.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 5.850540256500244,
|
|
"epoch": 0.5103969754253308,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004979508127418643,
|
|
"loss": 5.7179,
|
|
"mean_token_accuracy": 0.1509293831884861,
|
|
"num_tokens": 11209578.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"entropy": 5.824426078796387,
|
|
"epoch": 0.5108170552404957,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004979467783054155,
|
|
"loss": 5.6559,
|
|
"mean_token_accuracy": 0.15454075038433074,
|
|
"num_tokens": 11218380.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 5.734690237045288,
|
|
"epoch": 0.5112371350556606,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979427399195793,
|
|
"loss": 5.6795,
|
|
"mean_token_accuracy": 0.1466882646083832,
|
|
"num_tokens": 11227810.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"entropy": 5.784052991867066,
|
|
"epoch": 0.5116572148708255,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004979386975844274,
|
|
"loss": 5.6925,
|
|
"mean_token_accuracy": 0.1516873687505722,
|
|
"num_tokens": 11236631.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 5.811602210998535,
|
|
"epoch": 0.5120772946859904,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004979346513000311,
|
|
"loss": 5.7643,
|
|
"mean_token_accuracy": 0.14228157997131347,
|
|
"num_tokens": 11247418.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"entropy": 5.801711654663086,
|
|
"epoch": 0.5124973745011552,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004979306010664623,
|
|
"loss": 5.6482,
|
|
"mean_token_accuracy": 0.15656405985355376,
|
|
"num_tokens": 11256246.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 5.709601259231567,
|
|
"epoch": 0.5129174543163201,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004979265468837927,
|
|
"loss": 5.6377,
|
|
"mean_token_accuracy": 0.15466838777065278,
|
|
"num_tokens": 11265980.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"entropy": 5.778408575057983,
|
|
"epoch": 0.513337534131485,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000497922488752094,
|
|
"loss": 5.6873,
|
|
"mean_token_accuracy": 0.1463077425956726,
|
|
"num_tokens": 11276158.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 5.757645797729492,
|
|
"epoch": 0.5137576139466499,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004979184266714383,
|
|
"loss": 5.6121,
|
|
"mean_token_accuracy": 0.1554221287369728,
|
|
"num_tokens": 11284957.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"entropy": 5.694925689697266,
|
|
"epoch": 0.5141776937618148,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004979143606418974,
|
|
"loss": 5.6283,
|
|
"mean_token_accuracy": 0.1562877871096134,
|
|
"num_tokens": 11294340.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 5.903133296966553,
|
|
"epoch": 0.5145977735769797,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004979102906635435,
|
|
"loss": 5.8808,
|
|
"mean_token_accuracy": 0.14421921372413635,
|
|
"num_tokens": 11303344.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"entropy": 5.9017737865447994,
|
|
"epoch": 0.5150178533921445,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004979062167364486,
|
|
"loss": 5.7468,
|
|
"mean_token_accuracy": 0.15465227216482164,
|
|
"num_tokens": 11311338.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 5.760764503479004,
|
|
"epoch": 0.5154379332073094,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004979021388606847,
|
|
"loss": 5.5793,
|
|
"mean_token_accuracy": 0.16053801253437996,
|
|
"num_tokens": 11320194.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"entropy": 5.783118629455567,
|
|
"epoch": 0.5158580130224742,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004978980570363243,
|
|
"loss": 5.7606,
|
|
"mean_token_accuracy": 0.15072498917579652,
|
|
"num_tokens": 11329952.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 5.807923793792725,
|
|
"epoch": 0.5162780928376391,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004978939712634396,
|
|
"loss": 5.7097,
|
|
"mean_token_accuracy": 0.1485825777053833,
|
|
"num_tokens": 11339384.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"entropy": 5.927007532119751,
|
|
"epoch": 0.516698172652804,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004978898815421029,
|
|
"loss": 5.882,
|
|
"mean_token_accuracy": 0.14463590383529662,
|
|
"num_tokens": 11348409.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 5.948485612869263,
|
|
"epoch": 0.5171182524679689,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004978857878723867,
|
|
"loss": 5.7826,
|
|
"mean_token_accuracy": 0.1465214103460312,
|
|
"num_tokens": 11357478.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"entropy": 5.871764278411865,
|
|
"epoch": 0.5175383322831338,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004978816902543636,
|
|
"loss": 5.7924,
|
|
"mean_token_accuracy": 0.14824822992086412,
|
|
"num_tokens": 11366379.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 5.857372522354126,
|
|
"epoch": 0.5179584120982986,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978775886881062,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.144633187353611,
|
|
"num_tokens": 11376357.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"entropy": 5.790678644180298,
|
|
"epoch": 0.5183784919134635,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000497873483173687,
|
|
"loss": 5.682,
|
|
"mean_token_accuracy": 0.1550826385617256,
|
|
"num_tokens": 11384995.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 5.803675746917724,
|
|
"epoch": 0.5187985717286284,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978693737111787,
|
|
"loss": 5.691,
|
|
"mean_token_accuracy": 0.14901078641414642,
|
|
"num_tokens": 11395363.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"entropy": 5.773939752578736,
|
|
"epoch": 0.5192186515437933,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978652603006543,
|
|
"loss": 5.6785,
|
|
"mean_token_accuracy": 0.14922358542680741,
|
|
"num_tokens": 11404511.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 5.83831205368042,
|
|
"epoch": 0.5196387313589582,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004978611429421866,
|
|
"loss": 5.7376,
|
|
"mean_token_accuracy": 0.14898759126663208,
|
|
"num_tokens": 11413400.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"entropy": 5.867534255981445,
|
|
"epoch": 0.5200588111741231,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004978570216358485,
|
|
"loss": 5.7719,
|
|
"mean_token_accuracy": 0.14096312299370767,
|
|
"num_tokens": 11423693.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 5.85771164894104,
|
|
"epoch": 0.520478890989288,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000497852896381713,
|
|
"loss": 5.7317,
|
|
"mean_token_accuracy": 0.14528233110904692,
|
|
"num_tokens": 11433195.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"entropy": 5.8870384216308596,
|
|
"epoch": 0.5208989708044528,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004978487671798531,
|
|
"loss": 5.8604,
|
|
"mean_token_accuracy": 0.13629197254776954,
|
|
"num_tokens": 11443416.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 5.938678550720215,
|
|
"epoch": 0.5213190506196177,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004978446340303422,
|
|
"loss": 5.7271,
|
|
"mean_token_accuracy": 0.15116187259554864,
|
|
"num_tokens": 11452487.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"entropy": 5.809211301803589,
|
|
"epoch": 0.5217391304347826,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004978404969332533,
|
|
"loss": 5.7517,
|
|
"mean_token_accuracy": 0.15704237520694733,
|
|
"num_tokens": 11461893.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 5.73575005531311,
|
|
"epoch": 0.5221592102499475,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004978363558886597,
|
|
"loss": 5.6754,
|
|
"mean_token_accuracy": 0.14295373037457465,
|
|
"num_tokens": 11471238.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"entropy": 5.850252771377564,
|
|
"epoch": 0.5225792900651124,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004978322108966348,
|
|
"loss": 5.7739,
|
|
"mean_token_accuracy": 0.14141838401556014,
|
|
"num_tokens": 11480571.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 5.817096996307373,
|
|
"epoch": 0.5229993698802773,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004978280619572521,
|
|
"loss": 5.7567,
|
|
"mean_token_accuracy": 0.14793166518211365,
|
|
"num_tokens": 11489552.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"entropy": 5.864131927490234,
|
|
"epoch": 0.5234194496954422,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497823909070585,
|
|
"loss": 5.8087,
|
|
"mean_token_accuracy": 0.1432569444179535,
|
|
"num_tokens": 11498715.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 5.847290849685669,
|
|
"epoch": 0.523839529510607,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978197522367071,
|
|
"loss": 5.7472,
|
|
"mean_token_accuracy": 0.14424416646361352,
|
|
"num_tokens": 11508472.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"entropy": 5.939693546295166,
|
|
"epoch": 0.5242596093257719,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004978155914556919,
|
|
"loss": 5.6864,
|
|
"mean_token_accuracy": 0.15637651830911636,
|
|
"num_tokens": 11517620.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 5.744783592224121,
|
|
"epoch": 0.5246796891409368,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004978114267276134,
|
|
"loss": 5.7336,
|
|
"mean_token_accuracy": 0.14782111793756486,
|
|
"num_tokens": 11526106.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"entropy": 5.853097581863404,
|
|
"epoch": 0.5250997689561017,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978072580525451,
|
|
"loss": 5.7751,
|
|
"mean_token_accuracy": 0.14963556379079818,
|
|
"num_tokens": 11535840.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 5.883814191818237,
|
|
"epoch": 0.5255198487712666,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000497803085430561,
|
|
"loss": 5.7622,
|
|
"mean_token_accuracy": 0.15003612414002418,
|
|
"num_tokens": 11545110.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"entropy": 5.879300594329834,
|
|
"epoch": 0.5259399285864315,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004977989088617349,
|
|
"loss": 5.7805,
|
|
"mean_token_accuracy": 0.1432628057897091,
|
|
"num_tokens": 11554382.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 5.77400393486023,
|
|
"epoch": 0.5263600084015964,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000497794728346141,
|
|
"loss": 5.632,
|
|
"mean_token_accuracy": 0.1552414707839489,
|
|
"num_tokens": 11562821.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"entropy": 5.952142190933228,
|
|
"epoch": 0.5267800882167611,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004977905438838531,
|
|
"loss": 5.8474,
|
|
"mean_token_accuracy": 0.14172168597579002,
|
|
"num_tokens": 11571705.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 5.71492829322815,
|
|
"epoch": 0.527200168031926,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004977863554749453,
|
|
"loss": 5.6778,
|
|
"mean_token_accuracy": 0.14525432735681534,
|
|
"num_tokens": 11580692.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"entropy": 5.727636861801147,
|
|
"epoch": 0.5276202478470909,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004977821631194922,
|
|
"loss": 5.686,
|
|
"mean_token_accuracy": 0.14509947448968888,
|
|
"num_tokens": 11589966.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 5.8679040431976315,
|
|
"epoch": 0.5280403276622558,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004977779668175677,
|
|
"loss": 5.7627,
|
|
"mean_token_accuracy": 0.1469483494758606,
|
|
"num_tokens": 11599627.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"entropy": 5.856904077529907,
|
|
"epoch": 0.5284604074774207,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004977737665692461,
|
|
"loss": 5.7366,
|
|
"mean_token_accuracy": 0.15558115839958192,
|
|
"num_tokens": 11608431.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 5.841502332687378,
|
|
"epoch": 0.5288804872925856,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004977695623746021,
|
|
"loss": 5.6142,
|
|
"mean_token_accuracy": 0.14905260503292084,
|
|
"num_tokens": 11617552.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"entropy": 5.712338972091675,
|
|
"epoch": 0.5293005671077504,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004977653542337099,
|
|
"loss": 5.6645,
|
|
"mean_token_accuracy": 0.15581920593976975,
|
|
"num_tokens": 11626828.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 5.804640913009644,
|
|
"epoch": 0.5297206469229153,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004977611421466443,
|
|
"loss": 5.746,
|
|
"mean_token_accuracy": 0.14610961824655533,
|
|
"num_tokens": 11635867.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"entropy": 5.886562156677246,
|
|
"epoch": 0.5301407267380802,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004977569261134797,
|
|
"loss": 5.6601,
|
|
"mean_token_accuracy": 0.15055324360728264,
|
|
"num_tokens": 11644711.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 5.830437183380127,
|
|
"epoch": 0.5305608065532451,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004977527061342908,
|
|
"loss": 5.7385,
|
|
"mean_token_accuracy": 0.15071533769369125,
|
|
"num_tokens": 11653320.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"entropy": 5.832324886322022,
|
|
"epoch": 0.53098088636841,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004977484822091524,
|
|
"loss": 5.703,
|
|
"mean_token_accuracy": 0.15310411900281906,
|
|
"num_tokens": 11662753.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 5.879701805114746,
|
|
"epoch": 0.5314009661835749,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004977442543381394,
|
|
"loss": 5.7395,
|
|
"mean_token_accuracy": 0.1498982183635235,
|
|
"num_tokens": 11671622.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"entropy": 5.854084539413452,
|
|
"epoch": 0.5318210459987398,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004977400225213266,
|
|
"loss": 5.7196,
|
|
"mean_token_accuracy": 0.14721598774194716,
|
|
"num_tokens": 11679964.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 5.763905620574951,
|
|
"epoch": 0.5322411258139046,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497735786758789,
|
|
"loss": 5.6842,
|
|
"mean_token_accuracy": 0.1521085247397423,
|
|
"num_tokens": 11688700.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"entropy": 5.846723842620849,
|
|
"epoch": 0.5326612056290695,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004977315470506016,
|
|
"loss": 5.8056,
|
|
"mean_token_accuracy": 0.14883239492774009,
|
|
"num_tokens": 11698425.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 5.966537141799927,
|
|
"epoch": 0.5330812854442344,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004977273033968397,
|
|
"loss": 5.791,
|
|
"mean_token_accuracy": 0.13928466588258742,
|
|
"num_tokens": 11707705.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"entropy": 5.8435125827789305,
|
|
"epoch": 0.5335013652593993,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004977230557975782,
|
|
"loss": 5.6783,
|
|
"mean_token_accuracy": 0.1494770586490631,
|
|
"num_tokens": 11717079.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 5.791642379760742,
|
|
"epoch": 0.5339214450745642,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004977188042528923,
|
|
"loss": 5.6678,
|
|
"mean_token_accuracy": 0.14970564991235732,
|
|
"num_tokens": 11725504.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"entropy": 5.847938060760498,
|
|
"epoch": 0.5343415248897291,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004977145487628576,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14778463244438172,
|
|
"num_tokens": 11735282.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 5.854086971282959,
|
|
"epoch": 0.534761604704894,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004977102893275494,
|
|
"loss": 5.7377,
|
|
"mean_token_accuracy": 0.14616001397371292,
|
|
"num_tokens": 11744827.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"entropy": 5.835380983352661,
|
|
"epoch": 0.5351816845200588,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000497706025947043,
|
|
"loss": 5.7012,
|
|
"mean_token_accuracy": 0.14849554300308226,
|
|
"num_tokens": 11753066.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 5.829690742492676,
|
|
"epoch": 0.5356017643352237,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004977017586214142,
|
|
"loss": 5.7175,
|
|
"mean_token_accuracy": 0.14658187404274942,
|
|
"num_tokens": 11761190.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"entropy": 5.845994329452514,
|
|
"epoch": 0.5360218441503886,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004976974873507382,
|
|
"loss": 5.6947,
|
|
"mean_token_accuracy": 0.15390099734067916,
|
|
"num_tokens": 11770321.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 5.7918110370635985,
|
|
"epoch": 0.5364419239655535,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000497693212135091,
|
|
"loss": 5.7547,
|
|
"mean_token_accuracy": 0.14563888013362886,
|
|
"num_tokens": 11778388.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"entropy": 5.857013368606568,
|
|
"epoch": 0.5368620037807184,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004976889329745482,
|
|
"loss": 5.6164,
|
|
"mean_token_accuracy": 0.15133741348981858,
|
|
"num_tokens": 11786250.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 5.720251989364624,
|
|
"epoch": 0.5372820835958833,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004976846498691857,
|
|
"loss": 5.579,
|
|
"mean_token_accuracy": 0.15662760883569718,
|
|
"num_tokens": 11794831.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"entropy": 5.777666759490967,
|
|
"epoch": 0.5377021634110482,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004976803628190792,
|
|
"loss": 5.6537,
|
|
"mean_token_accuracy": 0.15591528862714768,
|
|
"num_tokens": 11803550.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 5.767534923553467,
|
|
"epoch": 0.5381222432262129,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004976760718243047,
|
|
"loss": 5.7165,
|
|
"mean_token_accuracy": 0.14894714206457138,
|
|
"num_tokens": 11812478.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"entropy": 5.8361043453216555,
|
|
"epoch": 0.5385423230413778,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004976717768849383,
|
|
"loss": 5.6892,
|
|
"mean_token_accuracy": 0.14339745715260505,
|
|
"num_tokens": 11822463.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 5.79760046005249,
|
|
"epoch": 0.5389624028565427,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.0004976674780010561,
|
|
"loss": 5.7244,
|
|
"mean_token_accuracy": 0.13902894631028176,
|
|
"num_tokens": 11831853.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"entropy": 5.824806070327758,
|
|
"epoch": 0.5393824826717076,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000497663175172734,
|
|
"loss": 5.7457,
|
|
"mean_token_accuracy": 0.1442998580634594,
|
|
"num_tokens": 11841574.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 5.9099555015563965,
|
|
"epoch": 0.5398025624868725,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004976588684000486,
|
|
"loss": 5.8432,
|
|
"mean_token_accuracy": 0.13176233023405076,
|
|
"num_tokens": 11852489.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"entropy": 5.846707534790039,
|
|
"epoch": 0.5402226423020374,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004976545576830759,
|
|
"loss": 5.6999,
|
|
"mean_token_accuracy": 0.1471443608403206,
|
|
"num_tokens": 11861499.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 5.810786867141724,
|
|
"epoch": 0.5406427221172023,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004976502430218924,
|
|
"loss": 5.776,
|
|
"mean_token_accuracy": 0.14316292852163315,
|
|
"num_tokens": 11871685.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"entropy": 5.8063677787780765,
|
|
"epoch": 0.5410628019323671,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004976459244165744,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.14863400161266327,
|
|
"num_tokens": 11881340.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 5.772097444534301,
|
|
"epoch": 0.541482881747532,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004976416018671986,
|
|
"loss": 5.7131,
|
|
"mean_token_accuracy": 0.14742937684059143,
|
|
"num_tokens": 11890700.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"entropy": 5.814801359176636,
|
|
"epoch": 0.5419029615626969,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004976372753738415,
|
|
"loss": 5.7129,
|
|
"mean_token_accuracy": 0.14111651703715325,
|
|
"num_tokens": 11900329.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 5.9360603332519535,
|
|
"epoch": 0.5423230413778618,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004976329449365795,
|
|
"loss": 5.754,
|
|
"mean_token_accuracy": 0.1429471679031849,
|
|
"num_tokens": 11909915.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"entropy": 5.787397623062134,
|
|
"epoch": 0.5427431211930267,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004976286105554897,
|
|
"loss": 5.7645,
|
|
"mean_token_accuracy": 0.14958669245243073,
|
|
"num_tokens": 11918302.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 5.77375168800354,
|
|
"epoch": 0.5431632010081916,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004976242722306487,
|
|
"loss": 5.7198,
|
|
"mean_token_accuracy": 0.14630756974220277,
|
|
"num_tokens": 11927794.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"entropy": 5.919241952896118,
|
|
"epoch": 0.5435832808233564,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004976199299621333,
|
|
"loss": 5.747,
|
|
"mean_token_accuracy": 0.14924167543649675,
|
|
"num_tokens": 11937701.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 5.725202035903931,
|
|
"epoch": 0.5440033606385213,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004976155837500205,
|
|
"loss": 5.6509,
|
|
"mean_token_accuracy": 0.15285194665193558,
|
|
"num_tokens": 11946106.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"entropy": 5.793752574920655,
|
|
"epoch": 0.5444234404536862,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004976112335943872,
|
|
"loss": 5.5899,
|
|
"mean_token_accuracy": 0.15264788568019866,
|
|
"num_tokens": 11954604.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 5.727561092376709,
|
|
"epoch": 0.5448435202688511,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004976068794953106,
|
|
"loss": 5.655,
|
|
"mean_token_accuracy": 0.15496142357587814,
|
|
"num_tokens": 11963664.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"entropy": 5.800908708572388,
|
|
"epoch": 0.545263600084016,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004976025214528677,
|
|
"loss": 5.6569,
|
|
"mean_token_accuracy": 0.15130768865346908,
|
|
"num_tokens": 11973426.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 5.773944950103759,
|
|
"epoch": 0.5456836798991809,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004975981594671359,
|
|
"loss": 5.6981,
|
|
"mean_token_accuracy": 0.14681158736348152,
|
|
"num_tokens": 11982339.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"entropy": 5.846315574645996,
|
|
"epoch": 0.5461037597143458,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004975937935381921,
|
|
"loss": 5.7408,
|
|
"mean_token_accuracy": 0.15329586565494538,
|
|
"num_tokens": 11992016.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 5.7528393268585205,
|
|
"epoch": 0.5465238395295106,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000497589423666114,
|
|
"loss": 5.7341,
|
|
"mean_token_accuracy": 0.1440807357430458,
|
|
"num_tokens": 12000616.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"entropy": 5.6946946144104,
|
|
"epoch": 0.5469439193446755,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004975850498509789,
|
|
"loss": 5.6253,
|
|
"mean_token_accuracy": 0.15553901046514512,
|
|
"num_tokens": 12009717.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 5.767681360244751,
|
|
"epoch": 0.5473639991598404,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004975806720928642,
|
|
"loss": 5.713,
|
|
"mean_token_accuracy": 0.1479937508702278,
|
|
"num_tokens": 12018020.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"entropy": 5.797775173187256,
|
|
"epoch": 0.5477840789750053,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004975762903918475,
|
|
"loss": 5.7163,
|
|
"mean_token_accuracy": 0.14613735526800156,
|
|
"num_tokens": 12027119.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 5.875396728515625,
|
|
"epoch": 0.5482041587901701,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975719047480064,
|
|
"loss": 5.6829,
|
|
"mean_token_accuracy": 0.15304642170667648,
|
|
"num_tokens": 12035566.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"entropy": 5.761675643920898,
|
|
"epoch": 0.548624238605335,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004975675151614187,
|
|
"loss": 5.6105,
|
|
"mean_token_accuracy": 0.15602717846632003,
|
|
"num_tokens": 12044505.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 5.709016609191894,
|
|
"epoch": 0.5490443184204999,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000497563121632162,
|
|
"loss": 5.6827,
|
|
"mean_token_accuracy": 0.15345038324594498,
|
|
"num_tokens": 12053338.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"entropy": 5.784457445144653,
|
|
"epoch": 0.5494643982356647,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004975587241603142,
|
|
"loss": 5.676,
|
|
"mean_token_accuracy": 0.14854272603988647,
|
|
"num_tokens": 12063235.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 5.909809684753418,
|
|
"epoch": 0.5498844780508296,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004975543227459533,
|
|
"loss": 5.7491,
|
|
"mean_token_accuracy": 0.1429952785372734,
|
|
"num_tokens": 12072490.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"entropy": 5.8736042976379395,
|
|
"epoch": 0.5503045578659945,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004975499173891571,
|
|
"loss": 5.818,
|
|
"mean_token_accuracy": 0.14217820167541503,
|
|
"num_tokens": 12081474.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 5.804098796844483,
|
|
"epoch": 0.5507246376811594,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975455080900037,
|
|
"loss": 5.6739,
|
|
"mean_token_accuracy": 0.15498915761709214,
|
|
"num_tokens": 12090963.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"entropy": 5.811689233779907,
|
|
"epoch": 0.5511447174963243,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004975410948485713,
|
|
"loss": 5.6853,
|
|
"mean_token_accuracy": 0.1526065543293953,
|
|
"num_tokens": 12099786.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 5.74642539024353,
|
|
"epoch": 0.5515647973114892,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004975366776649379,
|
|
"loss": 5.695,
|
|
"mean_token_accuracy": 0.14672838300466537,
|
|
"num_tokens": 12108469.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"entropy": 5.774152183532715,
|
|
"epoch": 0.5519848771266541,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004975322565391818,
|
|
"loss": 5.6804,
|
|
"mean_token_accuracy": 0.1517785020172596,
|
|
"num_tokens": 12118287.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 5.879052972793579,
|
|
"epoch": 0.5524049569418189,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004975278314713814,
|
|
"loss": 5.8381,
|
|
"mean_token_accuracy": 0.14230698868632316,
|
|
"num_tokens": 12127122.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"entropy": 5.914984178543091,
|
|
"epoch": 0.5528250367569838,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004975234024616152,
|
|
"loss": 5.731,
|
|
"mean_token_accuracy": 0.15133389160037042,
|
|
"num_tokens": 12136395.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 5.734422016143799,
|
|
"epoch": 0.5532451165721487,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004975189695099613,
|
|
"loss": 5.6943,
|
|
"mean_token_accuracy": 0.15051371306180955,
|
|
"num_tokens": 12145025.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"entropy": 5.800812196731568,
|
|
"epoch": 0.5536651963873136,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004975145326164985,
|
|
"loss": 5.7429,
|
|
"mean_token_accuracy": 0.1447499178349972,
|
|
"num_tokens": 12154352.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 5.8064220428466795,
|
|
"epoch": 0.5540852762024785,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004975100917813055,
|
|
"loss": 5.6588,
|
|
"mean_token_accuracy": 0.15041681826114656,
|
|
"num_tokens": 12163802.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"entropy": 5.750297594070434,
|
|
"epoch": 0.5545053560176434,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004975056470044606,
|
|
"loss": 5.682,
|
|
"mean_token_accuracy": 0.14631521701812744,
|
|
"num_tokens": 12173111.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 5.8171515464782715,
|
|
"epoch": 0.5549254358328082,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004975011982860428,
|
|
"loss": 5.7383,
|
|
"mean_token_accuracy": 0.14391349628567696,
|
|
"num_tokens": 12182048.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"entropy": 5.812657642364502,
|
|
"epoch": 0.5553455156479731,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004974967456261309,
|
|
"loss": 5.7159,
|
|
"mean_token_accuracy": 0.15039578825235367,
|
|
"num_tokens": 12191501.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 5.857609844207763,
|
|
"epoch": 0.555765595463138,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004974922890248036,
|
|
"loss": 5.7249,
|
|
"mean_token_accuracy": 0.15451397448778154,
|
|
"num_tokens": 12201132.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"entropy": 5.899567031860352,
|
|
"epoch": 0.5561856752783029,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00049748782848214,
|
|
"loss": 5.8549,
|
|
"mean_token_accuracy": 0.14553611800074578,
|
|
"num_tokens": 12211082.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 5.807045125961304,
|
|
"epoch": 0.5566057550934678,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004974833639982192,
|
|
"loss": 5.6909,
|
|
"mean_token_accuracy": 0.15329068303108215,
|
|
"num_tokens": 12219946.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"entropy": 5.925949478149414,
|
|
"epoch": 0.5570258349086327,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00049747889557312,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.14512094482779503,
|
|
"num_tokens": 12229668.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 5.886264276504517,
|
|
"epoch": 0.5574459147237976,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004974744232069219,
|
|
"loss": 5.7574,
|
|
"mean_token_accuracy": 0.14679303765296936,
|
|
"num_tokens": 12238750.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"entropy": 5.809984493255615,
|
|
"epoch": 0.5578659945389624,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004974699468997038,
|
|
"loss": 5.7017,
|
|
"mean_token_accuracy": 0.14905162900686264,
|
|
"num_tokens": 12246825.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 5.811229848861695,
|
|
"epoch": 0.5582860743541272,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004974654666515452,
|
|
"loss": 5.6602,
|
|
"mean_token_accuracy": 0.14834603071212768,
|
|
"num_tokens": 12256413.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"entropy": 5.882418012619018,
|
|
"epoch": 0.5587061541692921,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004974609824625254,
|
|
"loss": 5.6729,
|
|
"mean_token_accuracy": 0.1607891857624054,
|
|
"num_tokens": 12265458.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 5.649556875228882,
|
|
"epoch": 0.559126233984457,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004974564943327239,
|
|
"loss": 5.6227,
|
|
"mean_token_accuracy": 0.15252939462661744,
|
|
"num_tokens": 12274124.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"entropy": 5.668555736541748,
|
|
"epoch": 0.5595463137996219,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00049745200226222,
|
|
"loss": 5.5888,
|
|
"mean_token_accuracy": 0.16476203203201295,
|
|
"num_tokens": 12283513.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 5.861951494216919,
|
|
"epoch": 0.5599663936147868,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004974475062510936,
|
|
"loss": 5.7171,
|
|
"mean_token_accuracy": 0.15322822630405425,
|
|
"num_tokens": 12292396.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"entropy": 5.834360265731812,
|
|
"epoch": 0.5603864734299517,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004974430062994242,
|
|
"loss": 5.754,
|
|
"mean_token_accuracy": 0.1490551695227623,
|
|
"num_tokens": 12301604.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 5.901991987228394,
|
|
"epoch": 0.5608065532451165,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004974385024072912,
|
|
"loss": 5.7881,
|
|
"mean_token_accuracy": 0.14175782203674317,
|
|
"num_tokens": 12310458.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"entropy": 5.967726707458496,
|
|
"epoch": 0.5612266330602814,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000497433994574775,
|
|
"loss": 5.7835,
|
|
"mean_token_accuracy": 0.1453966811299324,
|
|
"num_tokens": 12319620.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 5.85808310508728,
|
|
"epoch": 0.5616467128754463,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000497429482801955,
|
|
"loss": 5.8356,
|
|
"mean_token_accuracy": 0.1476121611893177,
|
|
"num_tokens": 12329518.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"entropy": 5.773319292068481,
|
|
"epoch": 0.5620667926906112,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004974249670889111,
|
|
"loss": 5.6512,
|
|
"mean_token_accuracy": 0.15055545866489412,
|
|
"num_tokens": 12338244.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 5.965986871719361,
|
|
"epoch": 0.5624868725057761,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004974204474357237,
|
|
"loss": 5.8233,
|
|
"mean_token_accuracy": 0.14185196608304979,
|
|
"num_tokens": 12347962.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"entropy": 5.896701097488403,
|
|
"epoch": 0.562906952320941,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004974159238424723,
|
|
"loss": 5.7434,
|
|
"mean_token_accuracy": 0.14349103569984437,
|
|
"num_tokens": 12357020.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 5.812654113769531,
|
|
"epoch": 0.5633270321361059,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004974113963092376,
|
|
"loss": 5.7151,
|
|
"mean_token_accuracy": 0.1478872776031494,
|
|
"num_tokens": 12366108.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"entropy": 5.879363203048706,
|
|
"epoch": 0.5637471119512707,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004974068648360995,
|
|
"loss": 5.646,
|
|
"mean_token_accuracy": 0.15770871341228485,
|
|
"num_tokens": 12374508.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 5.793216609954834,
|
|
"epoch": 0.5641671917664356,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004974023294231383,
|
|
"loss": 5.652,
|
|
"mean_token_accuracy": 0.15676265954971313,
|
|
"num_tokens": 12383555.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"entropy": 5.762006092071533,
|
|
"epoch": 0.5645872715816005,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004973977900704342,
|
|
"loss": 5.7612,
|
|
"mean_token_accuracy": 0.1457872360944748,
|
|
"num_tokens": 12392680.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 5.872710561752319,
|
|
"epoch": 0.5650073513967654,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004973932467780679,
|
|
"loss": 5.7963,
|
|
"mean_token_accuracy": 0.14350106567144394,
|
|
"num_tokens": 12401881.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"entropy": 5.897738790512085,
|
|
"epoch": 0.5654274312119303,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004973886995461197,
|
|
"loss": 5.7755,
|
|
"mean_token_accuracy": 0.14316605031490326,
|
|
"num_tokens": 12411487.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 5.799207353591919,
|
|
"epoch": 0.5658475110270952,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004973841483746703,
|
|
"loss": 5.594,
|
|
"mean_token_accuracy": 0.16017859652638436,
|
|
"num_tokens": 12420376.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"entropy": 5.6296477794647215,
|
|
"epoch": 0.5662675908422601,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004973795932638001,
|
|
"loss": 5.639,
|
|
"mean_token_accuracy": 0.15424187034368514,
|
|
"num_tokens": 12429518.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 5.768233728408814,
|
|
"epoch": 0.5666876706574249,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00049737503421359,
|
|
"loss": 5.6208,
|
|
"mean_token_accuracy": 0.15618278905749322,
|
|
"num_tokens": 12438952.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"entropy": 5.762353801727295,
|
|
"epoch": 0.5671077504725898,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004973704712241206,
|
|
"loss": 5.6399,
|
|
"mean_token_accuracy": 0.14973016381263732,
|
|
"num_tokens": 12448576.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 5.758606004714966,
|
|
"epoch": 0.5675278302877547,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004973659042954729,
|
|
"loss": 5.666,
|
|
"mean_token_accuracy": 0.15317632332444192,
|
|
"num_tokens": 12458166.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"entropy": 5.703948211669922,
|
|
"epoch": 0.5679479101029196,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004973613334277277,
|
|
"loss": 5.5962,
|
|
"mean_token_accuracy": 0.15764016062021255,
|
|
"num_tokens": 12467271.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 5.815484666824341,
|
|
"epoch": 0.5683679899180845,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004973567586209658,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.1427201583981514,
|
|
"num_tokens": 12476255.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"entropy": 5.838050889968872,
|
|
"epoch": 0.5687880697332494,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004973521798752686,
|
|
"loss": 5.7306,
|
|
"mean_token_accuracy": 0.1476944074034691,
|
|
"num_tokens": 12485096.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 5.906451845169068,
|
|
"epoch": 0.5692081495484141,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000497347597190717,
|
|
"loss": 5.7558,
|
|
"mean_token_accuracy": 0.1506843164563179,
|
|
"num_tokens": 12494405.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"entropy": 5.792209434509277,
|
|
"epoch": 0.569628229363579,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004973430105673921,
|
|
"loss": 5.6821,
|
|
"mean_token_accuracy": 0.14848777875304223,
|
|
"num_tokens": 12503349.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 5.828717470169067,
|
|
"epoch": 0.5700483091787439,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004973384200053754,
|
|
"loss": 5.7518,
|
|
"mean_token_accuracy": 0.15347943902015687,
|
|
"num_tokens": 12513122.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"entropy": 5.784585285186767,
|
|
"epoch": 0.5704683889939088,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000497333825504748,
|
|
"loss": 5.695,
|
|
"mean_token_accuracy": 0.14986882135272026,
|
|
"num_tokens": 12523614.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 5.838396644592285,
|
|
"epoch": 0.5708884688090737,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004973292270655914,
|
|
"loss": 5.7434,
|
|
"mean_token_accuracy": 0.143761482834816,
|
|
"num_tokens": 12532031.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"entropy": 5.926707601547241,
|
|
"epoch": 0.5713085486242386,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000497324624687987,
|
|
"loss": 5.8378,
|
|
"mean_token_accuracy": 0.1392517074942589,
|
|
"num_tokens": 12542239.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 5.917767190933228,
|
|
"epoch": 0.5717286284394035,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004973200183720164,
|
|
"loss": 5.7483,
|
|
"mean_token_accuracy": 0.14240999147295952,
|
|
"num_tokens": 12552608.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"entropy": 5.775180721282959,
|
|
"epoch": 0.5721487082545683,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004973154081177611,
|
|
"loss": 5.593,
|
|
"mean_token_accuracy": 0.15000374913215636,
|
|
"num_tokens": 12562020.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 5.760695695877075,
|
|
"epoch": 0.5725687880697332,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004973107939253027,
|
|
"loss": 5.6762,
|
|
"mean_token_accuracy": 0.1592295289039612,
|
|
"num_tokens": 12570519.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"entropy": 5.706324434280395,
|
|
"epoch": 0.5729888678848981,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004973061757947233,
|
|
"loss": 5.6616,
|
|
"mean_token_accuracy": 0.15384514778852462,
|
|
"num_tokens": 12579324.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 5.790519523620605,
|
|
"epoch": 0.573408947700063,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004973015537261043,
|
|
"loss": 5.7372,
|
|
"mean_token_accuracy": 0.1493046186864376,
|
|
"num_tokens": 12588014.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"entropy": 5.890619230270386,
|
|
"epoch": 0.5738290275152279,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004972969277195279,
|
|
"loss": 5.7305,
|
|
"mean_token_accuracy": 0.15202558934688568,
|
|
"num_tokens": 12596882.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 5.819242668151856,
|
|
"epoch": 0.5742491073303928,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004972922977750757,
|
|
"loss": 5.6515,
|
|
"mean_token_accuracy": 0.1478489086031914,
|
|
"num_tokens": 12606069.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"entropy": 5.829999208450317,
|
|
"epoch": 0.5746691871455577,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00049728766389283,
|
|
"loss": 5.6783,
|
|
"mean_token_accuracy": 0.1460999220609665,
|
|
"num_tokens": 12615167.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 5.775484275817871,
|
|
"epoch": 0.5750892669607225,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004972830260728729,
|
|
"loss": 5.7111,
|
|
"mean_token_accuracy": 0.15089115351438523,
|
|
"num_tokens": 12624230.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"entropy": 5.81471266746521,
|
|
"epoch": 0.5755093467758874,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004972783843152863,
|
|
"loss": 5.6964,
|
|
"mean_token_accuracy": 0.15319516360759736,
|
|
"num_tokens": 12633158.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 5.742516231536865,
|
|
"epoch": 0.5759294265910523,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004972737386201527,
|
|
"loss": 5.6358,
|
|
"mean_token_accuracy": 0.1493402510881424,
|
|
"num_tokens": 12641465.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"entropy": 5.772433757781982,
|
|
"epoch": 0.5763495064062172,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004972690889875541,
|
|
"loss": 5.6115,
|
|
"mean_token_accuracy": 0.15269945561885834,
|
|
"num_tokens": 12650437.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 5.9466852188110355,
|
|
"epoch": 0.5767695862213821,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004972644354175732,
|
|
"loss": 5.8321,
|
|
"mean_token_accuracy": 0.14773827642202378,
|
|
"num_tokens": 12660072.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"entropy": 5.8965418338775635,
|
|
"epoch": 0.577189666036547,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004972597779102922,
|
|
"loss": 5.844,
|
|
"mean_token_accuracy": 0.14816712588071823,
|
|
"num_tokens": 12670405.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 5.826220703125,
|
|
"epoch": 0.5776097458517119,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004972551164657937,
|
|
"loss": 5.7126,
|
|
"mean_token_accuracy": 0.15028751343488694,
|
|
"num_tokens": 12679992.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"entropy": 5.9022228717803955,
|
|
"epoch": 0.5780298256668767,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004972504510841602,
|
|
"loss": 5.7796,
|
|
"mean_token_accuracy": 0.14697190523147582,
|
|
"num_tokens": 12690289.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 5.883794593811035,
|
|
"epoch": 0.5784499054820416,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004972457817654745,
|
|
"loss": 5.7709,
|
|
"mean_token_accuracy": 0.14337689578533172,
|
|
"num_tokens": 12700518.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"entropy": 5.896582746505738,
|
|
"epoch": 0.5788699852972065,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004972411085098191,
|
|
"loss": 5.8202,
|
|
"mean_token_accuracy": 0.138790999352932,
|
|
"num_tokens": 12710603.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 5.896594381332397,
|
|
"epoch": 0.5792900651123714,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497236431317277,
|
|
"loss": 5.7086,
|
|
"mean_token_accuracy": 0.14955383241176606,
|
|
"num_tokens": 12719298.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"entropy": 5.828510808944702,
|
|
"epoch": 0.5797101449275363,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000497231750187931,
|
|
"loss": 5.7051,
|
|
"mean_token_accuracy": 0.1494380295276642,
|
|
"num_tokens": 12728368.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 5.847594785690307,
|
|
"epoch": 0.5801302247427012,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004972270651218638,
|
|
"loss": 5.769,
|
|
"mean_token_accuracy": 0.15052054449915886,
|
|
"num_tokens": 12737898.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"entropy": 5.896743059158325,
|
|
"epoch": 0.580550304557866,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004972223761191587,
|
|
"loss": 5.7024,
|
|
"mean_token_accuracy": 0.1484552301466465,
|
|
"num_tokens": 12746761.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 5.748441457748413,
|
|
"epoch": 0.5809703843730308,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004972176831798986,
|
|
"loss": 5.6317,
|
|
"mean_token_accuracy": 0.1558982439339161,
|
|
"num_tokens": 12755128.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"entropy": 5.8237542629241945,
|
|
"epoch": 0.5813904641881957,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004972129863041667,
|
|
"loss": 5.8145,
|
|
"mean_token_accuracy": 0.1419169031083584,
|
|
"num_tokens": 12764727.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 5.825289487838745,
|
|
"epoch": 0.5818105440033606,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004972082854920462,
|
|
"loss": 5.6682,
|
|
"mean_token_accuracy": 0.15212180316448212,
|
|
"num_tokens": 12773557.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"entropy": 5.780522108078003,
|
|
"epoch": 0.5822306238185255,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004972035807436203,
|
|
"loss": 5.6741,
|
|
"mean_token_accuracy": 0.15388695299625396,
|
|
"num_tokens": 12782525.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 5.874711608886718,
|
|
"epoch": 0.5826507036336904,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004971988720589723,
|
|
"loss": 5.7714,
|
|
"mean_token_accuracy": 0.14911144897341727,
|
|
"num_tokens": 12791534.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"entropy": 5.865447235107422,
|
|
"epoch": 0.5830707834488553,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004971941594381858,
|
|
"loss": 5.6622,
|
|
"mean_token_accuracy": 0.1520915597677231,
|
|
"num_tokens": 12800662.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 5.833262968063354,
|
|
"epoch": 0.5834908632640201,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004971894428813441,
|
|
"loss": 5.7134,
|
|
"mean_token_accuracy": 0.15022262334823608,
|
|
"num_tokens": 12809440.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"entropy": 5.89053783416748,
|
|
"epoch": 0.583910943079185,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000497184722388531,
|
|
"loss": 5.7974,
|
|
"mean_token_accuracy": 0.14950450211763383,
|
|
"num_tokens": 12818560.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 5.910626697540283,
|
|
"epoch": 0.5843310228943499,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004971799979598297,
|
|
"loss": 5.7158,
|
|
"mean_token_accuracy": 0.15047362595796585,
|
|
"num_tokens": 12827898.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"entropy": 5.736415719985962,
|
|
"epoch": 0.5847511027095148,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004971752695953243,
|
|
"loss": 5.6673,
|
|
"mean_token_accuracy": 0.15286629199981688,
|
|
"num_tokens": 12837199.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 5.841268587112427,
|
|
"epoch": 0.5851711825246797,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004971705372950984,
|
|
"loss": 5.6889,
|
|
"mean_token_accuracy": 0.14883269965648652,
|
|
"num_tokens": 12846493.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"entropy": 5.862727975845337,
|
|
"epoch": 0.5855912623398446,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004971658010592358,
|
|
"loss": 5.7059,
|
|
"mean_token_accuracy": 0.14308914840221404,
|
|
"num_tokens": 12855026.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 5.807987260818481,
|
|
"epoch": 0.5860113421550095,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004971610608878205,
|
|
"loss": 5.7711,
|
|
"mean_token_accuracy": 0.14490452259778977,
|
|
"num_tokens": 12864563.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"entropy": 5.884010982513428,
|
|
"epoch": 0.5864314219701743,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004971563167809363,
|
|
"loss": 5.7237,
|
|
"mean_token_accuracy": 0.15075904428958892,
|
|
"num_tokens": 12874358.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 5.7711278915405275,
|
|
"epoch": 0.5868515017853392,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004971515687386674,
|
|
"loss": 5.7117,
|
|
"mean_token_accuracy": 0.1473625972867012,
|
|
"num_tokens": 12883110.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"entropy": 5.803575611114502,
|
|
"epoch": 0.5872715816005041,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004971468167610978,
|
|
"loss": 5.7851,
|
|
"mean_token_accuracy": 0.15010628029704093,
|
|
"num_tokens": 12892977.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 5.790566396713257,
|
|
"epoch": 0.587691661415669,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004971420608483117,
|
|
"loss": 5.6004,
|
|
"mean_token_accuracy": 0.1545809641480446,
|
|
"num_tokens": 12902327.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"entropy": 5.741348314285278,
|
|
"epoch": 0.5881117412308339,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004971373010003936,
|
|
"loss": 5.6022,
|
|
"mean_token_accuracy": 0.16168920323252678,
|
|
"num_tokens": 12911957.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 5.8003096103668215,
|
|
"epoch": 0.5885318210459988,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004971325372174274,
|
|
"loss": 5.6907,
|
|
"mean_token_accuracy": 0.14657490849494934,
|
|
"num_tokens": 12920380.0,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"entropy": 5.811933612823486,
|
|
"epoch": 0.5889519008611637,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004971277694994976,
|
|
"loss": 5.7533,
|
|
"mean_token_accuracy": 0.15078987032175065,
|
|
"num_tokens": 12929670.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 5.819301414489746,
|
|
"epoch": 0.5893719806763285,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000497122997846689,
|
|
"loss": 5.6612,
|
|
"mean_token_accuracy": 0.1566910207271576,
|
|
"num_tokens": 12938185.0,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"entropy": 5.85056962966919,
|
|
"epoch": 0.5897920604914934,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004971182222590857,
|
|
"loss": 5.6984,
|
|
"mean_token_accuracy": 0.15590957552194595,
|
|
"num_tokens": 12947706.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 5.766946744918823,
|
|
"epoch": 0.5902121403066583,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004971134427367725,
|
|
"loss": 5.6836,
|
|
"mean_token_accuracy": 0.14876563102006912,
|
|
"num_tokens": 12957393.0,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"entropy": 5.863473749160766,
|
|
"epoch": 0.5906322201218231,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497108659279834,
|
|
"loss": 5.5813,
|
|
"mean_token_accuracy": 0.1580106034874916,
|
|
"num_tokens": 12967165.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 5.893796777725219,
|
|
"epoch": 0.591052299936988,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004971038718883551,
|
|
"loss": 5.7311,
|
|
"mean_token_accuracy": 0.14258148968219758,
|
|
"num_tokens": 12976490.0,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"entropy": 5.8169300079345705,
|
|
"epoch": 0.5914723797521529,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004970990805624203,
|
|
"loss": 5.7245,
|
|
"mean_token_accuracy": 0.1458576127886772,
|
|
"num_tokens": 12985423.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 5.806120443344116,
|
|
"epoch": 0.5918924595673178,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004970942853021147,
|
|
"loss": 5.6187,
|
|
"mean_token_accuracy": 0.15678810328245163,
|
|
"num_tokens": 12994510.0,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"entropy": 5.8349559783935545,
|
|
"epoch": 0.5923125393824826,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004970894861075232,
|
|
"loss": 5.734,
|
|
"mean_token_accuracy": 0.1486038699746132,
|
|
"num_tokens": 13003383.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 5.833832693099976,
|
|
"epoch": 0.5927326191976475,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004970846829787309,
|
|
"loss": 5.6695,
|
|
"mean_token_accuracy": 0.15129955112934113,
|
|
"num_tokens": 13012550.0,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"entropy": 5.845009517669678,
|
|
"epoch": 0.5931526990128124,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004970798759158227,
|
|
"loss": 5.7421,
|
|
"mean_token_accuracy": 0.14426639974117278,
|
|
"num_tokens": 13022066.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 5.804647397994995,
|
|
"epoch": 0.5935727788279773,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004970750649188839,
|
|
"loss": 5.711,
|
|
"mean_token_accuracy": 0.15260717198252677,
|
|
"num_tokens": 13031008.0,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"entropy": 5.774487495422363,
|
|
"epoch": 0.5939928586431422,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004970702499879998,
|
|
"loss": 5.6978,
|
|
"mean_token_accuracy": 0.14794613867998124,
|
|
"num_tokens": 13040366.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 5.774663066864013,
|
|
"epoch": 0.5944129384583071,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004970654311232554,
|
|
"loss": 5.7282,
|
|
"mean_token_accuracy": 0.14623787105083466,
|
|
"num_tokens": 13051140.0,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"entropy": 5.849271965026856,
|
|
"epoch": 0.594833018273472,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004970606083247362,
|
|
"loss": 5.6443,
|
|
"mean_token_accuracy": 0.15294349193572998,
|
|
"num_tokens": 13059835.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 5.7127063274383545,
|
|
"epoch": 0.5952530980886368,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004970557815925278,
|
|
"loss": 5.5898,
|
|
"mean_token_accuracy": 0.14923029839992524,
|
|
"num_tokens": 13068909.0,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"entropy": 5.729467248916626,
|
|
"epoch": 0.5956731779038017,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 0.0004970509509267155,
|
|
"loss": 5.6618,
|
|
"mean_token_accuracy": 0.14696715027093887,
|
|
"num_tokens": 13078380.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 5.90779447555542,
|
|
"epoch": 0.5960932577189666,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004970461163273849,
|
|
"loss": 5.7102,
|
|
"mean_token_accuracy": 0.15209844410419465,
|
|
"num_tokens": 13087774.0,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"entropy": 5.781322765350342,
|
|
"epoch": 0.5965133375341315,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004970412777946219,
|
|
"loss": 5.5491,
|
|
"mean_token_accuracy": 0.1548515573143959,
|
|
"num_tokens": 13095938.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 5.7372105598449705,
|
|
"epoch": 0.5969334173492964,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004970364353285117,
|
|
"loss": 5.6888,
|
|
"mean_token_accuracy": 0.15444473102688788,
|
|
"num_tokens": 13104661.0,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"entropy": 5.844806241989136,
|
|
"epoch": 0.5973534971644613,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004970315889291405,
|
|
"loss": 5.6731,
|
|
"mean_token_accuracy": 0.1474146157503128,
|
|
"num_tokens": 13114505.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 5.694882488250732,
|
|
"epoch": 0.5977735769796261,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004970267385965941,
|
|
"loss": 5.6245,
|
|
"mean_token_accuracy": 0.15627836883068086,
|
|
"num_tokens": 13124590.0,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"entropy": 5.715419483184815,
|
|
"epoch": 0.598193656794791,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004970218843309583,
|
|
"loss": 5.6087,
|
|
"mean_token_accuracy": 0.1559140369296074,
|
|
"num_tokens": 13134026.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 5.890923166275025,
|
|
"epoch": 0.5986137366099559,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004970170261323192,
|
|
"loss": 5.7662,
|
|
"mean_token_accuracy": 0.15187639147043228,
|
|
"num_tokens": 13142654.0,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"entropy": 5.7584481716156,
|
|
"epoch": 0.5990338164251208,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004970121640007627,
|
|
"loss": 5.6728,
|
|
"mean_token_accuracy": 0.1504793107509613,
|
|
"num_tokens": 13151177.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 5.807246541976928,
|
|
"epoch": 0.5994538962402857,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004970072979363751,
|
|
"loss": 5.6657,
|
|
"mean_token_accuracy": 0.1458762139081955,
|
|
"num_tokens": 13159689.0,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"entropy": 5.796993541717529,
|
|
"epoch": 0.5998739760554506,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004970024279392425,
|
|
"loss": 5.7087,
|
|
"mean_token_accuracy": 0.1491813488304615,
|
|
"num_tokens": 13168601.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 5.799499607086181,
|
|
"epoch": 0.6002940558706155,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004969975540094513,
|
|
"loss": 5.6911,
|
|
"mean_token_accuracy": 0.1491454616189003,
|
|
"num_tokens": 13177035.0,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"entropy": 5.840288925170898,
|
|
"epoch": 0.6007141356857802,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004969926761470876,
|
|
"loss": 5.6471,
|
|
"mean_token_accuracy": 0.15894681811332703,
|
|
"num_tokens": 13185444.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 5.787335777282715,
|
|
"epoch": 0.6011342155009451,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000496987794352238,
|
|
"loss": 5.6543,
|
|
"mean_token_accuracy": 0.15718057453632356,
|
|
"num_tokens": 13194987.0,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"entropy": 5.711384534835815,
|
|
"epoch": 0.60155429531611,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969829086249889,
|
|
"loss": 5.6887,
|
|
"mean_token_accuracy": 0.14929505437612534,
|
|
"num_tokens": 13203807.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 5.874243068695068,
|
|
"epoch": 0.6019743751312749,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000496978018965427,
|
|
"loss": 5.7803,
|
|
"mean_token_accuracy": 0.14797215312719345,
|
|
"num_tokens": 13214362.0,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"entropy": 5.938519763946533,
|
|
"epoch": 0.6023944549464398,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004969731253736387,
|
|
"loss": 5.7816,
|
|
"mean_token_accuracy": 0.14409856349229813,
|
|
"num_tokens": 13224192.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 5.824232769012451,
|
|
"epoch": 0.6028145347616047,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004969682278497109,
|
|
"loss": 5.7438,
|
|
"mean_token_accuracy": 0.149906075745821,
|
|
"num_tokens": 13234430.0,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"entropy": 5.766725778579712,
|
|
"epoch": 0.6032346145767696,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004969633263937301,
|
|
"loss": 5.6477,
|
|
"mean_token_accuracy": 0.15190263986587524,
|
|
"num_tokens": 13243681.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 5.959778547286987,
|
|
"epoch": 0.6036546943919344,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004969584210057832,
|
|
"loss": 5.9315,
|
|
"mean_token_accuracy": 0.13914565443992616,
|
|
"num_tokens": 13254334.0,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"entropy": 5.908876419067383,
|
|
"epoch": 0.6040747742070993,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004969535116859573,
|
|
"loss": 5.7233,
|
|
"mean_token_accuracy": 0.15498362332582474,
|
|
"num_tokens": 13263781.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 5.757447004318237,
|
|
"epoch": 0.6044948540222642,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004969485984343392,
|
|
"loss": 5.633,
|
|
"mean_token_accuracy": 0.15214563608169557,
|
|
"num_tokens": 13272831.0,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"entropy": 5.840635204315186,
|
|
"epoch": 0.6049149338374291,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000496943681251016,
|
|
"loss": 5.6943,
|
|
"mean_token_accuracy": 0.15125853568315506,
|
|
"num_tokens": 13281621.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 5.772113513946533,
|
|
"epoch": 0.605335013652594,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004969387601360747,
|
|
"loss": 5.6754,
|
|
"mean_token_accuracy": 0.1471445269882679,
|
|
"num_tokens": 13291021.0,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"entropy": 5.837057733535767,
|
|
"epoch": 0.6057550934677589,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969338350896026,
|
|
"loss": 5.6877,
|
|
"mean_token_accuracy": 0.15487841069698333,
|
|
"num_tokens": 13299752.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 5.855220079421997,
|
|
"epoch": 0.6061751732829238,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004969289061116869,
|
|
"loss": 5.7219,
|
|
"mean_token_accuracy": 0.14336248189210893,
|
|
"num_tokens": 13309112.0,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"entropy": 5.829800653457641,
|
|
"epoch": 0.6065952530980886,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004969239732024148,
|
|
"loss": 5.7305,
|
|
"mean_token_accuracy": 0.15485918670892715,
|
|
"num_tokens": 13318328.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 5.693413162231446,
|
|
"epoch": 0.6070153329132535,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969190363618739,
|
|
"loss": 5.6063,
|
|
"mean_token_accuracy": 0.149900983273983,
|
|
"num_tokens": 13328940.0,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"entropy": 5.717437219619751,
|
|
"epoch": 0.6074354127284184,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004969140955901516,
|
|
"loss": 5.6137,
|
|
"mean_token_accuracy": 0.15410374999046325,
|
|
"num_tokens": 13337829.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 5.903831624984742,
|
|
"epoch": 0.6078554925435833,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004969091508873352,
|
|
"loss": 5.804,
|
|
"mean_token_accuracy": 0.14683766812086105,
|
|
"num_tokens": 13348289.0,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"entropy": 5.835478973388672,
|
|
"epoch": 0.6082755723587482,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004969042022535126,
|
|
"loss": 5.708,
|
|
"mean_token_accuracy": 0.15235030949115752,
|
|
"num_tokens": 13357292.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 5.843629169464111,
|
|
"epoch": 0.6086956521739131,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004968992496887713,
|
|
"loss": 5.7554,
|
|
"mean_token_accuracy": 0.14912576526403426,
|
|
"num_tokens": 13366640.0,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"entropy": 5.844546985626221,
|
|
"epoch": 0.609115731989078,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004968942931931989,
|
|
"loss": 5.6594,
|
|
"mean_token_accuracy": 0.1629155233502388,
|
|
"num_tokens": 13377509.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 5.813440895080566,
|
|
"epoch": 0.6095358118042428,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004968893327668835,
|
|
"loss": 5.749,
|
|
"mean_token_accuracy": 0.14384781569242477,
|
|
"num_tokens": 13386573.0,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"entropy": 5.739164876937866,
|
|
"epoch": 0.6099558916194077,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004968843684099128,
|
|
"loss": 5.607,
|
|
"mean_token_accuracy": 0.1540288582444191,
|
|
"num_tokens": 13395790.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 5.730731964111328,
|
|
"epoch": 0.6103759714345726,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004968794001223747,
|
|
"loss": 5.658,
|
|
"mean_token_accuracy": 0.1504225805401802,
|
|
"num_tokens": 13405265.0,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"entropy": 5.77107720375061,
|
|
"epoch": 0.6107960512497375,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004968744279043574,
|
|
"loss": 5.6733,
|
|
"mean_token_accuracy": 0.15312831848859787,
|
|
"num_tokens": 13413796.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 5.86907844543457,
|
|
"epoch": 0.6112161310649024,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004968694517559488,
|
|
"loss": 5.7213,
|
|
"mean_token_accuracy": 0.15257197394967079,
|
|
"num_tokens": 13423299.0,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"entropy": 5.709070634841919,
|
|
"epoch": 0.6116362108800673,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004968644716772371,
|
|
"loss": 5.6292,
|
|
"mean_token_accuracy": 0.15693681687116623,
|
|
"num_tokens": 13432267.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 5.737072992324829,
|
|
"epoch": 0.612056290695232,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004968594876683105,
|
|
"loss": 5.733,
|
|
"mean_token_accuracy": 0.14609354361891747,
|
|
"num_tokens": 13442332.0,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"entropy": 5.8117883682250975,
|
|
"epoch": 0.6124763705103969,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004968544997292572,
|
|
"loss": 5.6747,
|
|
"mean_token_accuracy": 0.15259024500846863,
|
|
"num_tokens": 13451700.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 5.812619876861572,
|
|
"epoch": 0.6128964503255618,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004968495078601659,
|
|
"loss": 5.7774,
|
|
"mean_token_accuracy": 0.14332814291119575,
|
|
"num_tokens": 13461009.0,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"entropy": 5.858203887939453,
|
|
"epoch": 0.6133165301407267,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004968445120611247,
|
|
"loss": 5.7707,
|
|
"mean_token_accuracy": 0.15080213099718093,
|
|
"num_tokens": 13470341.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 5.905436229705811,
|
|
"epoch": 0.6137366099558916,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004968395123322223,
|
|
"loss": 5.7003,
|
|
"mean_token_accuracy": 0.1523931697010994,
|
|
"num_tokens": 13479898.0,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"entropy": 5.742975854873658,
|
|
"epoch": 0.6141566897710565,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000496834508673547,
|
|
"loss": 5.6046,
|
|
"mean_token_accuracy": 0.15081604719161987,
|
|
"num_tokens": 13488116.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 5.723895263671875,
|
|
"epoch": 0.6145767695862214,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004968295010851877,
|
|
"loss": 5.6474,
|
|
"mean_token_accuracy": 0.15416487902402878,
|
|
"num_tokens": 13497814.0,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"entropy": 5.786228084564209,
|
|
"epoch": 0.6149968494013862,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004968244895672331,
|
|
"loss": 5.6679,
|
|
"mean_token_accuracy": 0.14462938904762268,
|
|
"num_tokens": 13506617.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 5.833630132675171,
|
|
"epoch": 0.6154169292165511,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004968194741197718,
|
|
"loss": 5.8051,
|
|
"mean_token_accuracy": 0.1436678983271122,
|
|
"num_tokens": 13516632.0,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"entropy": 5.897484588623047,
|
|
"epoch": 0.615837009031716,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004968144547428927,
|
|
"loss": 5.7291,
|
|
"mean_token_accuracy": 0.15222294852137566,
|
|
"num_tokens": 13526452.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 5.792807674407959,
|
|
"epoch": 0.6162570888468809,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004968094314366848,
|
|
"loss": 5.6406,
|
|
"mean_token_accuracy": 0.150718155503273,
|
|
"num_tokens": 13535663.0,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"entropy": 5.687614870071411,
|
|
"epoch": 0.6166771686620458,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000496804404201237,
|
|
"loss": 5.558,
|
|
"mean_token_accuracy": 0.16134363710880278,
|
|
"num_tokens": 13544574.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 5.88130555152893,
|
|
"epoch": 0.6170972484772107,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004967993730366385,
|
|
"loss": 5.7309,
|
|
"mean_token_accuracy": 0.15020160600543023,
|
|
"num_tokens": 13553041.0,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"entropy": 5.799270153045654,
|
|
"epoch": 0.6175173282923756,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004967943379429781,
|
|
"loss": 5.7106,
|
|
"mean_token_accuracy": 0.14654484167695045,
|
|
"num_tokens": 13562108.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 5.930500316619873,
|
|
"epoch": 0.6179374081075404,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004967892989203454,
|
|
"loss": 5.8659,
|
|
"mean_token_accuracy": 0.14354829862713814,
|
|
"num_tokens": 13571500.0,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"entropy": 5.872519779205322,
|
|
"epoch": 0.6183574879227053,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004967842559688295,
|
|
"loss": 5.7577,
|
|
"mean_token_accuracy": 0.14510439038276673,
|
|
"num_tokens": 13581304.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 5.81227593421936,
|
|
"epoch": 0.6187775677378702,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004967792090885195,
|
|
"loss": 5.6444,
|
|
"mean_token_accuracy": 0.15179503858089446,
|
|
"num_tokens": 13590734.0,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"entropy": 5.746864557266235,
|
|
"epoch": 0.6191976475530351,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004967741582795052,
|
|
"loss": 5.6924,
|
|
"mean_token_accuracy": 0.14929923564195632,
|
|
"num_tokens": 13600486.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 5.881101942062378,
|
|
"epoch": 0.6196177273682,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004967691035418758,
|
|
"loss": 5.7268,
|
|
"mean_token_accuracy": 0.14389215558767318,
|
|
"num_tokens": 13610542.0,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"entropy": 5.792819786071777,
|
|
"epoch": 0.6200378071833649,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.000496764044875721,
|
|
"loss": 5.6759,
|
|
"mean_token_accuracy": 0.15460289865732194,
|
|
"num_tokens": 13619431.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 5.761080598831176,
|
|
"epoch": 0.6204578869985298,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004967589822811303,
|
|
"loss": 5.6957,
|
|
"mean_token_accuracy": 0.14801864922046662,
|
|
"num_tokens": 13629930.0,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"entropy": 5.956879663467407,
|
|
"epoch": 0.6208779668136946,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004967539157581934,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14267176687717437,
|
|
"num_tokens": 13639439.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 5.9114847660064695,
|
|
"epoch": 0.6212980466288595,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000496748845307,
|
|
"loss": 5.7476,
|
|
"mean_token_accuracy": 0.15258604139089585,
|
|
"num_tokens": 13648548.0,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"entropy": 5.858182144165039,
|
|
"epoch": 0.6217181264440244,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004967437709276401,
|
|
"loss": 5.7985,
|
|
"mean_token_accuracy": 0.15154744163155556,
|
|
"num_tokens": 13657658.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 5.721544599533081,
|
|
"epoch": 0.6221382062591893,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004967386926202034,
|
|
"loss": 5.5518,
|
|
"mean_token_accuracy": 0.15903828144073487,
|
|
"num_tokens": 13666763.0,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"entropy": 5.837467288970947,
|
|
"epoch": 0.6225582860743542,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00049673361038478,
|
|
"loss": 5.8103,
|
|
"mean_token_accuracy": 0.14174049571156502,
|
|
"num_tokens": 13676527.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 5.855217123031617,
|
|
"epoch": 0.622978365889519,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004967285242214599,
|
|
"loss": 5.7674,
|
|
"mean_token_accuracy": 0.149812014400959,
|
|
"num_tokens": 13685404.0,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"entropy": 5.782896041870117,
|
|
"epoch": 0.6233984457046838,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000496723434130333,
|
|
"loss": 5.5821,
|
|
"mean_token_accuracy": 0.15357585549354552,
|
|
"num_tokens": 13693118.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 5.7227521419525145,
|
|
"epoch": 0.6238185255198487,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004967183401114898,
|
|
"loss": 5.6601,
|
|
"mean_token_accuracy": 0.15249475762248038,
|
|
"num_tokens": 13702015.0,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"entropy": 5.806180191040039,
|
|
"epoch": 0.6242386053350136,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004967132421650203,
|
|
"loss": 5.6877,
|
|
"mean_token_accuracy": 0.14611244574189186,
|
|
"num_tokens": 13711658.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 5.766854763031006,
|
|
"epoch": 0.6246586851501785,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004967081402910149,
|
|
"loss": 5.6979,
|
|
"mean_token_accuracy": 0.14979787766933442,
|
|
"num_tokens": 13720718.0,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"entropy": 5.728975391387939,
|
|
"epoch": 0.6250787649653434,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496703034489564,
|
|
"loss": 5.5606,
|
|
"mean_token_accuracy": 0.1568959876894951,
|
|
"num_tokens": 13729364.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 5.909390020370483,
|
|
"epoch": 0.6254988447805083,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004966979247607579,
|
|
"loss": 5.8725,
|
|
"mean_token_accuracy": 0.14035747721791267,
|
|
"num_tokens": 13739436.0,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"entropy": 5.9296684741973875,
|
|
"epoch": 0.6259189245956732,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004966928111046873,
|
|
"loss": 5.7708,
|
|
"mean_token_accuracy": 0.15743647813796996,
|
|
"num_tokens": 13749196.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 5.783377313613892,
|
|
"epoch": 0.626339004410838,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004966876935214426,
|
|
"loss": 5.6254,
|
|
"mean_token_accuracy": 0.15206747651100158,
|
|
"num_tokens": 13758414.0,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"entropy": 5.766037368774414,
|
|
"epoch": 0.6267590842260029,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0004966825720111147,
|
|
"loss": 5.6562,
|
|
"mean_token_accuracy": 0.14928966909646987,
|
|
"num_tokens": 13767496.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 5.811860084533691,
|
|
"epoch": 0.6271791640411678,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004966774465737942,
|
|
"loss": 5.8047,
|
|
"mean_token_accuracy": 0.15070491954684256,
|
|
"num_tokens": 13777033.0,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"entropy": 5.844302463531494,
|
|
"epoch": 0.6275992438563327,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004966723172095717,
|
|
"loss": 5.7583,
|
|
"mean_token_accuracy": 0.14748911708593368,
|
|
"num_tokens": 13786313.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 5.826303386688233,
|
|
"epoch": 0.6280193236714976,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004966671839185384,
|
|
"loss": 5.691,
|
|
"mean_token_accuracy": 0.1544649474322796,
|
|
"num_tokens": 13795257.0,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"entropy": 5.733129787445068,
|
|
"epoch": 0.6284394034866625,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004966620467007851,
|
|
"loss": 5.6151,
|
|
"mean_token_accuracy": 0.15482667088508606,
|
|
"num_tokens": 13804582.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 5.708710527420044,
|
|
"epoch": 0.6288594833018274,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004966569055564027,
|
|
"loss": 5.5858,
|
|
"mean_token_accuracy": 0.1517590843141079,
|
|
"num_tokens": 13813248.0,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"entropy": 5.892451477050781,
|
|
"epoch": 0.6292795631169922,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004966517604854823,
|
|
"loss": 5.8557,
|
|
"mean_token_accuracy": 0.13463475033640862,
|
|
"num_tokens": 13823301.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 5.816387891769409,
|
|
"epoch": 0.6296996429321571,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004966466114881152,
|
|
"loss": 5.5904,
|
|
"mean_token_accuracy": 0.15593330711126327,
|
|
"num_tokens": 13832040.0,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"entropy": 5.830536413192749,
|
|
"epoch": 0.630119722747322,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004966414585643925,
|
|
"loss": 5.7743,
|
|
"mean_token_accuracy": 0.14742243885993958,
|
|
"num_tokens": 13841874.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 5.7584226608276365,
|
|
"epoch": 0.6305398025624869,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004966363017144055,
|
|
"loss": 5.6126,
|
|
"mean_token_accuracy": 0.15902097374200821,
|
|
"num_tokens": 13850755.0,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"entropy": 5.788242483139038,
|
|
"epoch": 0.6309598823776518,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004966311409382455,
|
|
"loss": 5.6797,
|
|
"mean_token_accuracy": 0.14931050986051558,
|
|
"num_tokens": 13860009.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 5.736308908462524,
|
|
"epoch": 0.6313799621928167,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004966259762360039,
|
|
"loss": 5.5946,
|
|
"mean_token_accuracy": 0.15429836511611938,
|
|
"num_tokens": 13868476.0,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"entropy": 5.711131143569946,
|
|
"epoch": 0.6318000420079816,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004966208076077723,
|
|
"loss": 5.6093,
|
|
"mean_token_accuracy": 0.15463593304157258,
|
|
"num_tokens": 13877367.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 5.750036096572876,
|
|
"epoch": 0.6322201218231464,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004966156350536422,
|
|
"loss": 5.6935,
|
|
"mean_token_accuracy": 0.14963461458683014,
|
|
"num_tokens": 13885985.0,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"entropy": 5.755751752853394,
|
|
"epoch": 0.6326402016383113,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004966104585737054,
|
|
"loss": 5.61,
|
|
"mean_token_accuracy": 0.15479331612586975,
|
|
"num_tokens": 13895059.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 5.780548143386841,
|
|
"epoch": 0.6330602814534761,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004966052781680534,
|
|
"loss": 5.6767,
|
|
"mean_token_accuracy": 0.14704100489616395,
|
|
"num_tokens": 13903789.0,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"entropy": 5.845569133758545,
|
|
"epoch": 0.633480361268641,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004966000938367778,
|
|
"loss": 5.6591,
|
|
"mean_token_accuracy": 0.15396612286567687,
|
|
"num_tokens": 13913377.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 5.6942973136901855,
|
|
"epoch": 0.6339004410838059,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004965949055799708,
|
|
"loss": 5.6186,
|
|
"mean_token_accuracy": 0.1588241770863533,
|
|
"num_tokens": 13922141.0,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"entropy": 5.787711143493652,
|
|
"epoch": 0.6343205208989708,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004965897133977241,
|
|
"loss": 5.6597,
|
|
"mean_token_accuracy": 0.1402692511677742,
|
|
"num_tokens": 13930717.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 5.825317001342773,
|
|
"epoch": 0.6347406007141357,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004965845172901298,
|
|
"loss": 5.7464,
|
|
"mean_token_accuracy": 0.14808339700102807,
|
|
"num_tokens": 13940344.0,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"entropy": 5.7218469142913815,
|
|
"epoch": 0.6351606805293005,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004965793172572798,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.15380775630474092,
|
|
"num_tokens": 13948400.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 5.710135746002197,
|
|
"epoch": 0.6355807603444654,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004965741132992663,
|
|
"loss": 5.6947,
|
|
"mean_token_accuracy": 0.14487617537379266,
|
|
"num_tokens": 13957939.0,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"entropy": 5.832439231872558,
|
|
"epoch": 0.6360008401596303,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004965689054161814,
|
|
"loss": 5.6573,
|
|
"mean_token_accuracy": 0.1547864407300949,
|
|
"num_tokens": 13966943.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 5.738895320892334,
|
|
"epoch": 0.6364209199747952,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004965636936081176,
|
|
"loss": 5.5722,
|
|
"mean_token_accuracy": 0.1546689599752426,
|
|
"num_tokens": 13975850.0,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"entropy": 5.806326103210449,
|
|
"epoch": 0.6368409997899601,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000496558477875167,
|
|
"loss": 5.6725,
|
|
"mean_token_accuracy": 0.15719727128744126,
|
|
"num_tokens": 13985059.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 5.77093358039856,
|
|
"epoch": 0.637261079605125,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496553258217422,
|
|
"loss": 5.7215,
|
|
"mean_token_accuracy": 0.1449730947613716,
|
|
"num_tokens": 13993571.0,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"entropy": 5.842133378982544,
|
|
"epoch": 0.6376811594202898,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004965480346349751,
|
|
"loss": 5.7185,
|
|
"mean_token_accuracy": 0.15069702565670012,
|
|
"num_tokens": 14002326.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 5.9778131484985355,
|
|
"epoch": 0.6381012392354547,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.000496542807127919,
|
|
"loss": 5.8686,
|
|
"mean_token_accuracy": 0.14351749792695045,
|
|
"num_tokens": 14012002.0,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"entropy": 5.788293838500977,
|
|
"epoch": 0.6385213190506196,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496537575696346,
|
|
"loss": 5.7363,
|
|
"mean_token_accuracy": 0.14434802830219268,
|
|
"num_tokens": 14022085.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 5.704484844207764,
|
|
"epoch": 0.6389413988657845,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004965323403403488,
|
|
"loss": 5.6045,
|
|
"mean_token_accuracy": 0.15442810356616973,
|
|
"num_tokens": 14030706.0,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"entropy": 5.77836651802063,
|
|
"epoch": 0.6393614786809494,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004965271010600205,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.15519261509180068,
|
|
"num_tokens": 14039520.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 5.822714900970459,
|
|
"epoch": 0.6397815584961143,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004965218578554535,
|
|
"loss": 5.7178,
|
|
"mean_token_accuracy": 0.15360228195786477,
|
|
"num_tokens": 14048407.0,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"entropy": 5.711956024169922,
|
|
"epoch": 0.6402016383112792,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000496516610726741,
|
|
"loss": 5.6573,
|
|
"mean_token_accuracy": 0.158063705265522,
|
|
"num_tokens": 14057534.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 5.765710496902466,
|
|
"epoch": 0.640621718126444,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004965113596739759,
|
|
"loss": 5.6129,
|
|
"mean_token_accuracy": 0.1602526545524597,
|
|
"num_tokens": 14065992.0,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"entropy": 5.712855339050293,
|
|
"epoch": 0.6410417979416089,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004965061046972508,
|
|
"loss": 5.6062,
|
|
"mean_token_accuracy": 0.15307263806462287,
|
|
"num_tokens": 14074806.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 5.752716493606568,
|
|
"epoch": 0.6414618777567738,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004965008457966594,
|
|
"loss": 5.6501,
|
|
"mean_token_accuracy": 0.15263762921094895,
|
|
"num_tokens": 14083813.0,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"entropy": 5.762417888641357,
|
|
"epoch": 0.6418819575719387,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004964955829722945,
|
|
"loss": 5.5858,
|
|
"mean_token_accuracy": 0.1599087104201317,
|
|
"num_tokens": 14092193.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 5.84725341796875,
|
|
"epoch": 0.6423020373871036,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004964903162242493,
|
|
"loss": 5.7916,
|
|
"mean_token_accuracy": 0.14413690567016602,
|
|
"num_tokens": 14102797.0,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"entropy": 5.76859679222107,
|
|
"epoch": 0.6427221172022685,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004964850455526173,
|
|
"loss": 5.6637,
|
|
"mean_token_accuracy": 0.15364854410290718,
|
|
"num_tokens": 14112226.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 5.661821556091309,
|
|
"epoch": 0.6431421970174334,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004964797709574917,
|
|
"loss": 5.5939,
|
|
"mean_token_accuracy": 0.15402402132749557,
|
|
"num_tokens": 14121775.0,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"entropy": 5.719243478775025,
|
|
"epoch": 0.6435622768325981,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000496474492438966,
|
|
"loss": 5.5856,
|
|
"mean_token_accuracy": 0.15579498410224915,
|
|
"num_tokens": 14130415.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 5.75182991027832,
|
|
"epoch": 0.643982356647763,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964692099971338,
|
|
"loss": 5.6058,
|
|
"mean_token_accuracy": 0.1568465366959572,
|
|
"num_tokens": 14140204.0,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"entropy": 5.736771440505981,
|
|
"epoch": 0.6444024364629279,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964639236320885,
|
|
"loss": 5.567,
|
|
"mean_token_accuracy": 0.15371138900518416,
|
|
"num_tokens": 14149595.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 5.714345407485962,
|
|
"epoch": 0.6448225162780928,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004964586333439239,
|
|
"loss": 5.6346,
|
|
"mean_token_accuracy": 0.15398874282836914,
|
|
"num_tokens": 14158865.0,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"entropy": 5.78523097038269,
|
|
"epoch": 0.6452425960932577,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004964533391327335,
|
|
"loss": 5.5938,
|
|
"mean_token_accuracy": 0.158450847864151,
|
|
"num_tokens": 14167962.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 5.816212701797485,
|
|
"epoch": 0.6456626759084226,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004964480409986113,
|
|
"loss": 5.6465,
|
|
"mean_token_accuracy": 0.1606015980243683,
|
|
"num_tokens": 14176479.0,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"entropy": 5.829603910446167,
|
|
"epoch": 0.6460827557235875,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004964427389416512,
|
|
"loss": 5.6739,
|
|
"mean_token_accuracy": 0.14969076216220856,
|
|
"num_tokens": 14185408.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 5.702767419815063,
|
|
"epoch": 0.6465028355387523,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000496437432961947,
|
|
"loss": 5.6745,
|
|
"mean_token_accuracy": 0.15580256432294845,
|
|
"num_tokens": 14194155.0,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"entropy": 5.729840040206909,
|
|
"epoch": 0.6469229153539172,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964321230595925,
|
|
"loss": 5.6916,
|
|
"mean_token_accuracy": 0.1505993440747261,
|
|
"num_tokens": 14202779.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 5.923639154434204,
|
|
"epoch": 0.6473429951690821,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004964268092346821,
|
|
"loss": 5.868,
|
|
"mean_token_accuracy": 0.14160000756382943,
|
|
"num_tokens": 14212552.0,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"entropy": 5.925770807266235,
|
|
"epoch": 0.647763074984247,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004964214914873098,
|
|
"loss": 5.6684,
|
|
"mean_token_accuracy": 0.14924321398139,
|
|
"num_tokens": 14222783.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 5.70919623374939,
|
|
"epoch": 0.6481831547994119,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004964161698175697,
|
|
"loss": 5.5477,
|
|
"mean_token_accuracy": 0.15285850167274476,
|
|
"num_tokens": 14232085.0,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"entropy": 5.768083095550537,
|
|
"epoch": 0.6486032346145768,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004964108442255562,
|
|
"loss": 5.7039,
|
|
"mean_token_accuracy": 0.14666701555252076,
|
|
"num_tokens": 14241969.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 5.75738754272461,
|
|
"epoch": 0.6490233144297417,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004964055147113637,
|
|
"loss": 5.616,
|
|
"mean_token_accuracy": 0.1562434285879135,
|
|
"num_tokens": 14251012.0,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"entropy": 5.841613340377807,
|
|
"epoch": 0.6494433942449065,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004964001812750864,
|
|
"loss": 5.7414,
|
|
"mean_token_accuracy": 0.15030983835458755,
|
|
"num_tokens": 14261110.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 5.793753337860108,
|
|
"epoch": 0.6498634740600714,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000496394843916819,
|
|
"loss": 5.7123,
|
|
"mean_token_accuracy": 0.15001400411128998,
|
|
"num_tokens": 14270869.0,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"entropy": 5.8021101474761965,
|
|
"epoch": 0.6502835538752363,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004963895026366558,
|
|
"loss": 5.6624,
|
|
"mean_token_accuracy": 0.14703597128391266,
|
|
"num_tokens": 14279607.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 5.798326921463013,
|
|
"epoch": 0.6507036336904012,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004963841574346917,
|
|
"loss": 5.6664,
|
|
"mean_token_accuracy": 0.15177475959062575,
|
|
"num_tokens": 14289282.0,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"entropy": 5.785371494293213,
|
|
"epoch": 0.6511237135055661,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004963788083110212,
|
|
"loss": 5.5947,
|
|
"mean_token_accuracy": 0.15618948638439178,
|
|
"num_tokens": 14298658.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 5.867933845520019,
|
|
"epoch": 0.651543793320731,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000496373455265739,
|
|
"loss": 5.6715,
|
|
"mean_token_accuracy": 0.15167464911937714,
|
|
"num_tokens": 14307832.0,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"entropy": 5.737640428543091,
|
|
"epoch": 0.6519638731358958,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004963680982989402,
|
|
"loss": 5.5745,
|
|
"mean_token_accuracy": 0.15618224889039994,
|
|
"num_tokens": 14317122.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 5.728768348693848,
|
|
"epoch": 0.6523839529510607,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004963627374107195,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.15685338973999025,
|
|
"num_tokens": 14326069.0,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"entropy": 5.735061359405518,
|
|
"epoch": 0.6528040327662256,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004963573726011717,
|
|
"loss": 5.6154,
|
|
"mean_token_accuracy": 0.152651646733284,
|
|
"num_tokens": 14335260.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 5.89712963104248,
|
|
"epoch": 0.6532241125813905,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004963520038703922,
|
|
"loss": 5.7147,
|
|
"mean_token_accuracy": 0.14169859886169434,
|
|
"num_tokens": 14345823.0,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"entropy": 5.8055966854095455,
|
|
"epoch": 0.6536441923965554,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000496346631218476,
|
|
"loss": 5.5901,
|
|
"mean_token_accuracy": 0.151746928691864,
|
|
"num_tokens": 14354316.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 5.731487655639649,
|
|
"epoch": 0.6540642722117203,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.000496341254645518,
|
|
"loss": 5.637,
|
|
"mean_token_accuracy": 0.15558102428913118,
|
|
"num_tokens": 14364539.0,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"entropy": 5.791000318527222,
|
|
"epoch": 0.6544843520268852,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004963358741516138,
|
|
"loss": 5.7568,
|
|
"mean_token_accuracy": 0.14070456251502036,
|
|
"num_tokens": 14374081.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 5.791856861114502,
|
|
"epoch": 0.6549044318420499,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004963304897368585,
|
|
"loss": 5.6421,
|
|
"mean_token_accuracy": 0.14869485646486283,
|
|
"num_tokens": 14383255.0,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"entropy": 5.887608623504638,
|
|
"epoch": 0.6553245116572148,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004963251014013475,
|
|
"loss": 5.7709,
|
|
"mean_token_accuracy": 0.14988299310207367,
|
|
"num_tokens": 14392417.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 5.925739812850952,
|
|
"epoch": 0.6557445914723797,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004963197091451763,
|
|
"loss": 5.8171,
|
|
"mean_token_accuracy": 0.14091493040323258,
|
|
"num_tokens": 14401899.0,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"entropy": 5.8610601902008055,
|
|
"epoch": 0.6561646712875446,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004963143129684405,
|
|
"loss": 5.7865,
|
|
"mean_token_accuracy": 0.14567770585417747,
|
|
"num_tokens": 14411245.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 5.733341979980469,
|
|
"epoch": 0.6565847511027095,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004963089128712355,
|
|
"loss": 5.6357,
|
|
"mean_token_accuracy": 0.15616341382265092,
|
|
"num_tokens": 14419710.0,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"entropy": 5.761330413818359,
|
|
"epoch": 0.6570048309178744,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004963035088536571,
|
|
"loss": 5.6196,
|
|
"mean_token_accuracy": 0.16149473637342454,
|
|
"num_tokens": 14430266.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 5.832095336914063,
|
|
"epoch": 0.6574249107330393,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004962981009158012,
|
|
"loss": 5.5946,
|
|
"mean_token_accuracy": 0.14890647828578948,
|
|
"num_tokens": 14439515.0,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"entropy": 5.783193588256836,
|
|
"epoch": 0.6578449905482041,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004962926890577632,
|
|
"loss": 5.6537,
|
|
"mean_token_accuracy": 0.1543855309486389,
|
|
"num_tokens": 14448091.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 5.762275314331054,
|
|
"epoch": 0.658265070363369,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000496287273279639,
|
|
"loss": 5.6831,
|
|
"mean_token_accuracy": 0.14809218272566796,
|
|
"num_tokens": 14457744.0,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"entropy": 5.830176925659179,
|
|
"epoch": 0.6586851501785339,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000496281853581525,
|
|
"loss": 5.6747,
|
|
"mean_token_accuracy": 0.15542599856853484,
|
|
"num_tokens": 14467597.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 5.816223096847534,
|
|
"epoch": 0.6591052299936988,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004962764299635168,
|
|
"loss": 5.6557,
|
|
"mean_token_accuracy": 0.15143783688545226,
|
|
"num_tokens": 14476662.0,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"entropy": 5.868206977844238,
|
|
"epoch": 0.6595253098088637,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004962710024257105,
|
|
"loss": 5.7365,
|
|
"mean_token_accuracy": 0.15013337954878808,
|
|
"num_tokens": 14486583.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 5.866771793365478,
|
|
"epoch": 0.6599453896240286,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004962655709682025,
|
|
"loss": 5.7422,
|
|
"mean_token_accuracy": 0.14670923799276353,
|
|
"num_tokens": 14496528.0,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"entropy": 5.847543859481812,
|
|
"epoch": 0.6603654694391935,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004962601355910887,
|
|
"loss": 5.7216,
|
|
"mean_token_accuracy": 0.14750941842794418,
|
|
"num_tokens": 14507026.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 5.714229869842529,
|
|
"epoch": 0.6607855492543583,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004962546962944656,
|
|
"loss": 5.5896,
|
|
"mean_token_accuracy": 0.1554133415222168,
|
|
"num_tokens": 14516480.0,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"entropy": 5.7652284622192385,
|
|
"epoch": 0.6612056290695232,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004962492530784295,
|
|
"loss": 5.5384,
|
|
"mean_token_accuracy": 0.16685622930526733,
|
|
"num_tokens": 14525068.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 5.764181613922119,
|
|
"epoch": 0.6616257088846881,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004962438059430768,
|
|
"loss": 5.6811,
|
|
"mean_token_accuracy": 0.15448692589998245,
|
|
"num_tokens": 14534441.0,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"entropy": 5.791794538497925,
|
|
"epoch": 0.662045788699853,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004962383548885039,
|
|
"loss": 5.7416,
|
|
"mean_token_accuracy": 0.15312327668070794,
|
|
"num_tokens": 14543026.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 5.810564088821411,
|
|
"epoch": 0.6624658685150179,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004962328999148075,
|
|
"loss": 5.6235,
|
|
"mean_token_accuracy": 0.15815748721361161,
|
|
"num_tokens": 14552068.0,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"entropy": 5.795226907730102,
|
|
"epoch": 0.6628859483301828,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004962274410220842,
|
|
"loss": 5.748,
|
|
"mean_token_accuracy": 0.14739178717136384,
|
|
"num_tokens": 14561587.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 5.840717220306397,
|
|
"epoch": 0.6633060281453477,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004962219782104308,
|
|
"loss": 5.7455,
|
|
"mean_token_accuracy": 0.15566187649965285,
|
|
"num_tokens": 14571020.0,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"entropy": 5.857281494140625,
|
|
"epoch": 0.6637261079605125,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004962165114799439,
|
|
"loss": 5.7013,
|
|
"mean_token_accuracy": 0.14193924963474275,
|
|
"num_tokens": 14580638.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 5.753746509552002,
|
|
"epoch": 0.6641461877756774,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004962110408307204,
|
|
"loss": 5.6411,
|
|
"mean_token_accuracy": 0.1508389577269554,
|
|
"num_tokens": 14590173.0,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"entropy": 5.771540355682373,
|
|
"epoch": 0.6645662675908423,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004962055662628571,
|
|
"loss": 5.6088,
|
|
"mean_token_accuracy": 0.1546558991074562,
|
|
"num_tokens": 14598635.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 5.824790573120117,
|
|
"epoch": 0.6649863474060071,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004962000877764513,
|
|
"loss": 5.6465,
|
|
"mean_token_accuracy": 0.15380171239376067,
|
|
"num_tokens": 14607233.0,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"entropy": 5.900277614593506,
|
|
"epoch": 0.665406427221172,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004961946053715998,
|
|
"loss": 5.811,
|
|
"mean_token_accuracy": 0.14116770774126053,
|
|
"num_tokens": 14617483.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 5.774311876296997,
|
|
"epoch": 0.665826507036337,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004961891190483997,
|
|
"loss": 5.6337,
|
|
"mean_token_accuracy": 0.15262163281440735,
|
|
"num_tokens": 14625805.0,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"entropy": 5.750567626953125,
|
|
"epoch": 0.6662465868515017,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004961836288069483,
|
|
"loss": 5.56,
|
|
"mean_token_accuracy": 0.15181114226579667,
|
|
"num_tokens": 14634605.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 5.866780996322632,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004961781346473428,
|
|
"loss": 5.754,
|
|
"mean_token_accuracy": 0.1443464897572994,
|
|
"num_tokens": 14644970.0,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"entropy": 5.8288147926330565,
|
|
"epoch": 0.6670867464818315,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004961726365696805,
|
|
"loss": 5.6444,
|
|
"mean_token_accuracy": 0.1512111656367779,
|
|
"num_tokens": 14655043.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 5.81706018447876,
|
|
"epoch": 0.6675068262969964,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004961671345740589,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.1498358130455017,
|
|
"num_tokens": 14663994.0,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"entropy": 5.73077392578125,
|
|
"epoch": 0.6679269061121613,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004961616286605753,
|
|
"loss": 5.6285,
|
|
"mean_token_accuracy": 0.14595297276973723,
|
|
"num_tokens": 14674101.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 5.793763732910156,
|
|
"epoch": 0.6683469859273262,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004961561188293273,
|
|
"loss": 5.7245,
|
|
"mean_token_accuracy": 0.14435067921876907,
|
|
"num_tokens": 14684156.0,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"entropy": 5.726213026046753,
|
|
"epoch": 0.6687670657424911,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004961506050804126,
|
|
"loss": 5.6178,
|
|
"mean_token_accuracy": 0.15918601751327516,
|
|
"num_tokens": 14693223.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 5.852010822296142,
|
|
"epoch": 0.6691871455576559,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000496145087413929,
|
|
"loss": 5.6258,
|
|
"mean_token_accuracy": 0.14910822063684465,
|
|
"num_tokens": 14702959.0,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"entropy": 5.876345634460449,
|
|
"epoch": 0.6696072253728208,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004961395658299737,
|
|
"loss": 5.737,
|
|
"mean_token_accuracy": 0.1483006753027439,
|
|
"num_tokens": 14712146.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 5.710770320892334,
|
|
"epoch": 0.6700273051879857,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004961340403286451,
|
|
"loss": 5.6515,
|
|
"mean_token_accuracy": 0.14912314414978028,
|
|
"num_tokens": 14721932.0,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"entropy": 5.775924396514893,
|
|
"epoch": 0.6704473850031506,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961285109100408,
|
|
"loss": 5.5857,
|
|
"mean_token_accuracy": 0.15742873400449753,
|
|
"num_tokens": 14731080.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 5.719264698028565,
|
|
"epoch": 0.6708674648183155,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004961229775742587,
|
|
"loss": 5.5991,
|
|
"mean_token_accuracy": 0.16006802767515182,
|
|
"num_tokens": 14740057.0,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"entropy": 5.813319492340088,
|
|
"epoch": 0.6712875446334804,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000496117440321397,
|
|
"loss": 5.6828,
|
|
"mean_token_accuracy": 0.15654956847429274,
|
|
"num_tokens": 14748399.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 5.8324696063995365,
|
|
"epoch": 0.6717076244486453,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961118991515537,
|
|
"loss": 5.6881,
|
|
"mean_token_accuracy": 0.14406146556138993,
|
|
"num_tokens": 14757215.0,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"entropy": 5.786386203765869,
|
|
"epoch": 0.6721277042638101,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000496106354064827,
|
|
"loss": 5.6868,
|
|
"mean_token_accuracy": 0.15685203224420546,
|
|
"num_tokens": 14766191.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 5.8651642322540285,
|
|
"epoch": 0.672547784078975,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004961008050613149,
|
|
"loss": 5.7521,
|
|
"mean_token_accuracy": 0.14210513085126877,
|
|
"num_tokens": 14775220.0,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"entropy": 5.838468170166015,
|
|
"epoch": 0.6729678638941399,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004960952521411161,
|
|
"loss": 5.7078,
|
|
"mean_token_accuracy": 0.14716721177101136,
|
|
"num_tokens": 14784287.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 5.932072496414184,
|
|
"epoch": 0.6733879437093048,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004960896953043287,
|
|
"loss": 5.7759,
|
|
"mean_token_accuracy": 0.14442920163273812,
|
|
"num_tokens": 14794219.0,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"entropy": 5.824687051773071,
|
|
"epoch": 0.6738080235244697,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004960841345510511,
|
|
"loss": 5.6703,
|
|
"mean_token_accuracy": 0.1518692597746849,
|
|
"num_tokens": 14803324.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 5.7951741218566895,
|
|
"epoch": 0.6742281033396346,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000496078569881382,
|
|
"loss": 5.6876,
|
|
"mean_token_accuracy": 0.1539413034915924,
|
|
"num_tokens": 14811963.0,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"entropy": 5.747313785552978,
|
|
"epoch": 0.6746481831547995,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004960730012954198,
|
|
"loss": 5.6526,
|
|
"mean_token_accuracy": 0.14589986428618432,
|
|
"num_tokens": 14821903.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 5.716427040100098,
|
|
"epoch": 0.6750682629699643,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004960674287932634,
|
|
"loss": 5.6271,
|
|
"mean_token_accuracy": 0.14554727971553802,
|
|
"num_tokens": 14831215.0,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"entropy": 5.827300643920898,
|
|
"epoch": 0.6754883427851291,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004960618523750111,
|
|
"loss": 5.5552,
|
|
"mean_token_accuracy": 0.1551190733909607,
|
|
"num_tokens": 14840354.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 5.817133188247681,
|
|
"epoch": 0.675908422600294,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000496056272040762,
|
|
"loss": 5.7402,
|
|
"mean_token_accuracy": 0.14943507611751555,
|
|
"num_tokens": 14849660.0,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"entropy": 5.807599830627441,
|
|
"epoch": 0.6763285024154589,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004960506877906149,
|
|
"loss": 5.6648,
|
|
"mean_token_accuracy": 0.14764449894428253,
|
|
"num_tokens": 14859819.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 5.801334857940674,
|
|
"epoch": 0.6767485822306238,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004960450996246686,
|
|
"loss": 5.6585,
|
|
"mean_token_accuracy": 0.15806604847311972,
|
|
"num_tokens": 14869260.0,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"entropy": 5.7306236743927,
|
|
"epoch": 0.6771686620457887,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004960395075430222,
|
|
"loss": 5.6336,
|
|
"mean_token_accuracy": 0.15279667675495148,
|
|
"num_tokens": 14878685.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 5.749643182754516,
|
|
"epoch": 0.6775887418609536,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004960339115457748,
|
|
"loss": 5.6372,
|
|
"mean_token_accuracy": 0.1503060542047024,
|
|
"num_tokens": 14888456.0,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"entropy": 5.7973710060119625,
|
|
"epoch": 0.6780088216761184,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004960283116330255,
|
|
"loss": 5.731,
|
|
"mean_token_accuracy": 0.14978916943073273,
|
|
"num_tokens": 14897401.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 5.807585668563843,
|
|
"epoch": 0.6784289014912833,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004960227078048735,
|
|
"loss": 5.6567,
|
|
"mean_token_accuracy": 0.15412394553422928,
|
|
"num_tokens": 14906741.0,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"entropy": 5.760078573226929,
|
|
"epoch": 0.6788489813064482,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004960171000614179,
|
|
"loss": 5.5427,
|
|
"mean_token_accuracy": 0.16074198186397554,
|
|
"num_tokens": 14916002.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 5.638378715515136,
|
|
"epoch": 0.6792690611216131,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004960114884027583,
|
|
"loss": 5.4776,
|
|
"mean_token_accuracy": 0.16621290147304535,
|
|
"num_tokens": 14925247.0,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"entropy": 5.708978319168091,
|
|
"epoch": 0.679689140936778,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004960058728289939,
|
|
"loss": 5.608,
|
|
"mean_token_accuracy": 0.15026133954524995,
|
|
"num_tokens": 14933925.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 5.904026126861572,
|
|
"epoch": 0.6801092207519429,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004960002533402243,
|
|
"loss": 5.6881,
|
|
"mean_token_accuracy": 0.15241528823971748,
|
|
"num_tokens": 14943368.0,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"entropy": 5.790306043624878,
|
|
"epoch": 0.6805293005671077,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004959946299365491,
|
|
"loss": 5.6953,
|
|
"mean_token_accuracy": 0.14710961580276488,
|
|
"num_tokens": 14953710.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 5.816765403747558,
|
|
"epoch": 0.6809493803822726,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004959890026180677,
|
|
"loss": 5.7182,
|
|
"mean_token_accuracy": 0.14748610258102418,
|
|
"num_tokens": 14962814.0,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"entropy": 5.688648128509522,
|
|
"epoch": 0.6813694601974375,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00049598337138488,
|
|
"loss": 5.5964,
|
|
"mean_token_accuracy": 0.16184311360120773,
|
|
"num_tokens": 14971631.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 5.8211281299591064,
|
|
"epoch": 0.6817895400126024,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004959777362370855,
|
|
"loss": 5.5884,
|
|
"mean_token_accuracy": 0.15286847501993178,
|
|
"num_tokens": 14980528.0,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"entropy": 5.87521915435791,
|
|
"epoch": 0.6822096198277673,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004959720971747843,
|
|
"loss": 5.6149,
|
|
"mean_token_accuracy": 0.15216847509145737,
|
|
"num_tokens": 14989331.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 5.713017272949219,
|
|
"epoch": 0.6826296996429322,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004959664541980762,
|
|
"loss": 5.598,
|
|
"mean_token_accuracy": 0.15774561017751693,
|
|
"num_tokens": 14999403.0,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"entropy": 5.737113285064697,
|
|
"epoch": 0.6830497794580971,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004959608073070612,
|
|
"loss": 5.6958,
|
|
"mean_token_accuracy": 0.14559513479471206,
|
|
"num_tokens": 15009388.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 5.837254619598388,
|
|
"epoch": 0.6834698592732619,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004959551565018392,
|
|
"loss": 5.6286,
|
|
"mean_token_accuracy": 0.15535787492990494,
|
|
"num_tokens": 15018586.0,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"entropy": 5.778875064849854,
|
|
"epoch": 0.6838899390884268,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004959495017825104,
|
|
"loss": 5.6407,
|
|
"mean_token_accuracy": 0.15465399324893953,
|
|
"num_tokens": 15027982.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 5.739845132827758,
|
|
"epoch": 0.6843100189035917,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004959438431491749,
|
|
"loss": 5.6278,
|
|
"mean_token_accuracy": 0.15651622265577317,
|
|
"num_tokens": 15037103.0,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"entropy": 5.728132820129394,
|
|
"epoch": 0.6847300987187566,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000495938180601933,
|
|
"loss": 5.7184,
|
|
"mean_token_accuracy": 0.14796946495771407,
|
|
"num_tokens": 15046739.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 5.822361660003662,
|
|
"epoch": 0.6851501785339215,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004959325141408851,
|
|
"loss": 5.666,
|
|
"mean_token_accuracy": 0.15593857914209366,
|
|
"num_tokens": 15056586.0,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"entropy": 5.768631410598755,
|
|
"epoch": 0.6855702583490864,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004959268437661313,
|
|
"loss": 5.641,
|
|
"mean_token_accuracy": 0.15448189303278922,
|
|
"num_tokens": 15066622.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 5.767803955078125,
|
|
"epoch": 0.6859903381642513,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004959211694777724,
|
|
"loss": 5.6293,
|
|
"mean_token_accuracy": 0.15781602412462234,
|
|
"num_tokens": 15075415.0,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"entropy": 5.731510210037231,
|
|
"epoch": 0.686410417979416,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004959154912759086,
|
|
"loss": 5.6134,
|
|
"mean_token_accuracy": 0.15285183787345885,
|
|
"num_tokens": 15085087.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 5.772061681747436,
|
|
"epoch": 0.6868304977945809,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004959098091606406,
|
|
"loss": 5.6231,
|
|
"mean_token_accuracy": 0.1562209889292717,
|
|
"num_tokens": 15093580.0,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"entropy": 5.681428337097168,
|
|
"epoch": 0.6872505776097458,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004959041231320692,
|
|
"loss": 5.5996,
|
|
"mean_token_accuracy": 0.15760979950428008,
|
|
"num_tokens": 15104033.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 5.769718980789184,
|
|
"epoch": 0.6876706574249107,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004958984331902951,
|
|
"loss": 5.6773,
|
|
"mean_token_accuracy": 0.14753246530890465,
|
|
"num_tokens": 15113164.0,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"entropy": 5.745969009399414,
|
|
"epoch": 0.6880907372400756,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004958927393354188,
|
|
"loss": 5.6297,
|
|
"mean_token_accuracy": 0.15737390518188477,
|
|
"num_tokens": 15122215.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 5.765387773513794,
|
|
"epoch": 0.6885108170552405,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004958870415675415,
|
|
"loss": 5.6091,
|
|
"mean_token_accuracy": 0.15159644484519957,
|
|
"num_tokens": 15130877.0,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"entropy": 5.7833487033844,
|
|
"epoch": 0.6889308968704054,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004958813398867639,
|
|
"loss": 5.5909,
|
|
"mean_token_accuracy": 0.1610761597752571,
|
|
"num_tokens": 15140227.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 5.874035358428955,
|
|
"epoch": 0.6893509766855702,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004958756342931872,
|
|
"loss": 5.7618,
|
|
"mean_token_accuracy": 0.14578953385353088,
|
|
"num_tokens": 15150006.0,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"entropy": 5.7979443073272705,
|
|
"epoch": 0.6897710565007351,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004958699247869122,
|
|
"loss": 5.6734,
|
|
"mean_token_accuracy": 0.15173593461513518,
|
|
"num_tokens": 15160032.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 5.775300407409668,
|
|
"epoch": 0.6901911363159,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004958642113680404,
|
|
"loss": 5.607,
|
|
"mean_token_accuracy": 0.15672277957201003,
|
|
"num_tokens": 15168966.0,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"entropy": 5.886404323577881,
|
|
"epoch": 0.6906112161310649,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004958584940366727,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.1462364301085472,
|
|
"num_tokens": 15179337.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 5.845329141616821,
|
|
"epoch": 0.6910312959462298,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004958527727929106,
|
|
"loss": 5.6901,
|
|
"mean_token_accuracy": 0.15126113295555116,
|
|
"num_tokens": 15188395.0,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"entropy": 5.777632856369019,
|
|
"epoch": 0.6914513757613947,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004958470476368552,
|
|
"loss": 5.6175,
|
|
"mean_token_accuracy": 0.1590783603489399,
|
|
"num_tokens": 15198669.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 5.717659664154053,
|
|
"epoch": 0.6918714555765595,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004958413185686082,
|
|
"loss": 5.637,
|
|
"mean_token_accuracy": 0.15654054433107376,
|
|
"num_tokens": 15207371.0,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"entropy": 5.771133661270142,
|
|
"epoch": 0.6922915353917244,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004958355855882709,
|
|
"loss": 5.6623,
|
|
"mean_token_accuracy": 0.15609176307916642,
|
|
"num_tokens": 15215694.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 5.838139247894287,
|
|
"epoch": 0.6927116152068893,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000495829848695945,
|
|
"loss": 5.6462,
|
|
"mean_token_accuracy": 0.15314621180295945,
|
|
"num_tokens": 15224963.0,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"entropy": 5.6792638301849365,
|
|
"epoch": 0.6931316950220542,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000495824107891732,
|
|
"loss": 5.4601,
|
|
"mean_token_accuracy": 0.16161370724439622,
|
|
"num_tokens": 15233569.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 5.702935647964478,
|
|
"epoch": 0.6935517748372191,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004958183631757336,
|
|
"loss": 5.6456,
|
|
"mean_token_accuracy": 0.15384626239538193,
|
|
"num_tokens": 15242671.0,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"entropy": 5.757969760894776,
|
|
"epoch": 0.693971854652384,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004958126145480517,
|
|
"loss": 5.6062,
|
|
"mean_token_accuracy": 0.15589472502470017,
|
|
"num_tokens": 15251698.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 5.881031131744384,
|
|
"epoch": 0.6943919344675489,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004958068620087879,
|
|
"loss": 5.7131,
|
|
"mean_token_accuracy": 0.15278587341308594,
|
|
"num_tokens": 15260608.0,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"entropy": 5.7654228687286375,
|
|
"epoch": 0.6948120142827137,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004958011055580443,
|
|
"loss": 5.5824,
|
|
"mean_token_accuracy": 0.1566091775894165,
|
|
"num_tokens": 15268866.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 5.691988468170166,
|
|
"epoch": 0.6952320940978786,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004957953451959229,
|
|
"loss": 5.5428,
|
|
"mean_token_accuracy": 0.1687786027789116,
|
|
"num_tokens": 15277600.0,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"entropy": 5.712690019607544,
|
|
"epoch": 0.6956521739130435,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004957895809225254,
|
|
"loss": 5.577,
|
|
"mean_token_accuracy": 0.15904618948698043,
|
|
"num_tokens": 15286016.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 5.791261529922485,
|
|
"epoch": 0.6960722537282084,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004957838127379544,
|
|
"loss": 5.6203,
|
|
"mean_token_accuracy": 0.15775981694459915,
|
|
"num_tokens": 15294676.0,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"entropy": 5.787760162353516,
|
|
"epoch": 0.6964923335433733,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004957780406423118,
|
|
"loss": 5.6093,
|
|
"mean_token_accuracy": 0.1520596593618393,
|
|
"num_tokens": 15304084.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 5.732133674621582,
|
|
"epoch": 0.6969124133585382,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004957722646356999,
|
|
"loss": 5.6145,
|
|
"mean_token_accuracy": 0.15437885522842407,
|
|
"num_tokens": 15314182.0,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"entropy": 5.82383394241333,
|
|
"epoch": 0.697332493173703,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004957664847182209,
|
|
"loss": 5.7321,
|
|
"mean_token_accuracy": 0.14916351363062857,
|
|
"num_tokens": 15324213.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 5.901606464385987,
|
|
"epoch": 0.6977525729888678,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004957607008899774,
|
|
"loss": 5.6654,
|
|
"mean_token_accuracy": 0.14808408319950103,
|
|
"num_tokens": 15333122.0,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"entropy": 5.821764516830444,
|
|
"epoch": 0.6981726528040327,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004957549131510717,
|
|
"loss": 5.7587,
|
|
"mean_token_accuracy": 0.14488900303840638,
|
|
"num_tokens": 15342199.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 5.85214409828186,
|
|
"epoch": 0.6985927326191976,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004957491215016065,
|
|
"loss": 5.7068,
|
|
"mean_token_accuracy": 0.14899201691150665,
|
|
"num_tokens": 15352463.0,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"entropy": 5.7340789318084715,
|
|
"epoch": 0.6990128124343625,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004957433259416841,
|
|
"loss": 5.5519,
|
|
"mean_token_accuracy": 0.15695535391569138,
|
|
"num_tokens": 15361815.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 5.829116296768189,
|
|
"epoch": 0.6994328922495274,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004957375264714075,
|
|
"loss": 5.6665,
|
|
"mean_token_accuracy": 0.14441719949245452,
|
|
"num_tokens": 15371773.0,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"entropy": 5.731393432617187,
|
|
"epoch": 0.6998529720646923,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004957317230908792,
|
|
"loss": 5.6078,
|
|
"mean_token_accuracy": 0.153985595703125,
|
|
"num_tokens": 15380881.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 5.69814658164978,
|
|
"epoch": 0.7002730518798572,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004957259158002022,
|
|
"loss": 5.4853,
|
|
"mean_token_accuracy": 0.16338536590337754,
|
|
"num_tokens": 15389310.0,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"entropy": 5.65314564704895,
|
|
"epoch": 0.700693131695022,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004957201045994791,
|
|
"loss": 5.585,
|
|
"mean_token_accuracy": 0.15192776024341584,
|
|
"num_tokens": 15398584.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 5.752124881744384,
|
|
"epoch": 0.7011132115101869,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004957142894888131,
|
|
"loss": 5.6244,
|
|
"mean_token_accuracy": 0.1605387285351753,
|
|
"num_tokens": 15407208.0,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"entropy": 5.781596279144287,
|
|
"epoch": 0.7015332913253518,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004957084704683071,
|
|
"loss": 5.6552,
|
|
"mean_token_accuracy": 0.15119443833827972,
|
|
"num_tokens": 15416474.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 5.796496915817261,
|
|
"epoch": 0.7019533711405167,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004957026475380642,
|
|
"loss": 5.6589,
|
|
"mean_token_accuracy": 0.1581042394042015,
|
|
"num_tokens": 15426101.0,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"entropy": 5.8482013702392575,
|
|
"epoch": 0.7023734509556816,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004956968206981875,
|
|
"loss": 5.6866,
|
|
"mean_token_accuracy": 0.1528375506401062,
|
|
"num_tokens": 15435910.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 5.838450860977173,
|
|
"epoch": 0.7027935307708465,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004956909899487803,
|
|
"loss": 5.7297,
|
|
"mean_token_accuracy": 0.14721868485212325,
|
|
"num_tokens": 15445494.0,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"entropy": 5.773874664306641,
|
|
"epoch": 0.7032136105860114,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004956851552899459,
|
|
"loss": 5.6133,
|
|
"mean_token_accuracy": 0.15867630988359452,
|
|
"num_tokens": 15455332.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 5.7730677127838135,
|
|
"epoch": 0.7036336904011762,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004956793167217874,
|
|
"loss": 5.6813,
|
|
"mean_token_accuracy": 0.1490170478820801,
|
|
"num_tokens": 15464241.0,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"entropy": 5.8777241706848145,
|
|
"epoch": 0.7040537702163411,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004956734742444087,
|
|
"loss": 5.6821,
|
|
"mean_token_accuracy": 0.15121965557336808,
|
|
"num_tokens": 15473473.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 5.744890403747559,
|
|
"epoch": 0.704473850031506,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004956676278579129,
|
|
"loss": 5.563,
|
|
"mean_token_accuracy": 0.15540574193000795,
|
|
"num_tokens": 15482494.0,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"entropy": 5.676463556289673,
|
|
"epoch": 0.7048939298466709,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004956617775624037,
|
|
"loss": 5.5724,
|
|
"mean_token_accuracy": 0.15146812200546264,
|
|
"num_tokens": 15491180.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 5.786671447753906,
|
|
"epoch": 0.7053140096618358,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004956559233579848,
|
|
"loss": 5.6148,
|
|
"mean_token_accuracy": 0.15258617997169494,
|
|
"num_tokens": 15501035.0,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"entropy": 5.7913405418396,
|
|
"epoch": 0.7057340894770007,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004956500652447598,
|
|
"loss": 5.5994,
|
|
"mean_token_accuracy": 0.15323785319924355,
|
|
"num_tokens": 15510191.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 5.706702041625976,
|
|
"epoch": 0.7061541692921655,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004956442032228324,
|
|
"loss": 5.6875,
|
|
"mean_token_accuracy": 0.15146460086107255,
|
|
"num_tokens": 15519253.0,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"entropy": 5.7468561172485355,
|
|
"epoch": 0.7065742491073304,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004956383372923067,
|
|
"loss": 5.6573,
|
|
"mean_token_accuracy": 0.15219423472881316,
|
|
"num_tokens": 15528348.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 5.909702920913697,
|
|
"epoch": 0.7069943289224953,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004956324674532864,
|
|
"loss": 5.7312,
|
|
"mean_token_accuracy": 0.14496915340423583,
|
|
"num_tokens": 15537557.0,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"entropy": 5.853457021713257,
|
|
"epoch": 0.7074144087376601,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004956265937058757,
|
|
"loss": 5.6662,
|
|
"mean_token_accuracy": 0.14985378384590148,
|
|
"num_tokens": 15546745.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 5.753704071044922,
|
|
"epoch": 0.707834488552825,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004956207160501784,
|
|
"loss": 5.5646,
|
|
"mean_token_accuracy": 0.15850543081760407,
|
|
"num_tokens": 15555532.0,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"entropy": 5.728769159317016,
|
|
"epoch": 0.70825456836799,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004956148344862987,
|
|
"loss": 5.6209,
|
|
"mean_token_accuracy": 0.1560587242245674,
|
|
"num_tokens": 15564189.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 5.664771509170532,
|
|
"epoch": 0.7086746481831548,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004956089490143408,
|
|
"loss": 5.6492,
|
|
"mean_token_accuracy": 0.15197667628526687,
|
|
"num_tokens": 15574116.0,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"entropy": 5.824323844909668,
|
|
"epoch": 0.7090947279983196,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004956030596344089,
|
|
"loss": 5.6473,
|
|
"mean_token_accuracy": 0.149012803286314,
|
|
"num_tokens": 15583031.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 5.836510467529297,
|
|
"epoch": 0.7095148078134845,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004955971663466075,
|
|
"loss": 5.7671,
|
|
"mean_token_accuracy": 0.15028237402439118,
|
|
"num_tokens": 15592576.0,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"entropy": 5.823656129837036,
|
|
"epoch": 0.7099348876286494,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004955912691510407,
|
|
"loss": 5.697,
|
|
"mean_token_accuracy": 0.15281013548374175,
|
|
"num_tokens": 15601065.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 5.751941967010498,
|
|
"epoch": 0.7103549674438143,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004955853680478134,
|
|
"loss": 5.633,
|
|
"mean_token_accuracy": 0.14754925668239594,
|
|
"num_tokens": 15610112.0,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"entropy": 5.778195095062256,
|
|
"epoch": 0.7107750472589792,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004955794630370297,
|
|
"loss": 5.6139,
|
|
"mean_token_accuracy": 0.15469905436038972,
|
|
"num_tokens": 15618890.0,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"entropy": 5.750346851348877,
|
|
"epoch": 0.7111951270741441,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004955735541187945,
|
|
"loss": 5.6397,
|
|
"mean_token_accuracy": 0.15139740109443664,
|
|
"num_tokens": 15627678.0,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"entropy": 5.838537120819092,
|
|
"epoch": 0.711615206889309,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004955676412932124,
|
|
"loss": 5.6254,
|
|
"mean_token_accuracy": 0.15495479255914688,
|
|
"num_tokens": 15636833.0,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"entropy": 5.758643341064453,
|
|
"epoch": 0.7120352867044738,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004955617245603881,
|
|
"loss": 5.6441,
|
|
"mean_token_accuracy": 0.1475740984082222,
|
|
"num_tokens": 15646571.0,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"entropy": 5.771809864044189,
|
|
"epoch": 0.7124553665196387,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004955558039204263,
|
|
"loss": 5.6883,
|
|
"mean_token_accuracy": 0.1559377834200859,
|
|
"num_tokens": 15654907.0,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"entropy": 5.87169828414917,
|
|
"epoch": 0.7128754463348036,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004955498793734321,
|
|
"loss": 5.6259,
|
|
"mean_token_accuracy": 0.15253366231918336,
|
|
"num_tokens": 15664336.0,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"entropy": 5.775359678268432,
|
|
"epoch": 0.7132955261499685,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004955439509195103,
|
|
"loss": 5.6818,
|
|
"mean_token_accuracy": 0.15552834868431092,
|
|
"num_tokens": 15674000.0,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"entropy": 5.817126750946045,
|
|
"epoch": 0.7137156059651334,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004955380185587661,
|
|
"loss": 5.6655,
|
|
"mean_token_accuracy": 0.15541905909776688,
|
|
"num_tokens": 15684214.0,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"entropy": 5.823128080368042,
|
|
"epoch": 0.7141356857802983,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0004955320822913043,
|
|
"loss": 5.695,
|
|
"mean_token_accuracy": 0.14909214079380034,
|
|
"num_tokens": 15693546.0,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"entropy": 5.796035861968994,
|
|
"epoch": 0.7145557655954632,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004955261421172302,
|
|
"loss": 5.6006,
|
|
"mean_token_accuracy": 0.15094921365380287,
|
|
"num_tokens": 15702310.0,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"entropy": 5.765657234191894,
|
|
"epoch": 0.714975845410628,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004955201980366493,
|
|
"loss": 5.6549,
|
|
"mean_token_accuracy": 0.1583261877298355,
|
|
"num_tokens": 15711544.0,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"entropy": 5.701775074005127,
|
|
"epoch": 0.7153959252257929,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004955142500496665,
|
|
"loss": 5.5378,
|
|
"mean_token_accuracy": 0.15932040065526962,
|
|
"num_tokens": 15720914.0,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"entropy": 5.806231927871704,
|
|
"epoch": 0.7158160050409578,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004955082981563872,
|
|
"loss": 5.636,
|
|
"mean_token_accuracy": 0.1497705653309822,
|
|
"num_tokens": 15729825.0,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"entropy": 5.731112813949585,
|
|
"epoch": 0.7162360848561227,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.000495502342356917,
|
|
"loss": 5.6407,
|
|
"mean_token_accuracy": 0.15358344316482545,
|
|
"num_tokens": 15739649.0,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"entropy": 5.775957298278809,
|
|
"epoch": 0.7166561646712876,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004954963826513614,
|
|
"loss": 5.5312,
|
|
"mean_token_accuracy": 0.15533651560544967,
|
|
"num_tokens": 15747805.0,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"entropy": 5.848172760009765,
|
|
"epoch": 0.7170762444864525,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000495490419039826,
|
|
"loss": 5.6763,
|
|
"mean_token_accuracy": 0.15182012543082238,
|
|
"num_tokens": 15757267.0,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"entropy": 5.734999704360962,
|
|
"epoch": 0.7174963243016174,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004954844515224162,
|
|
"loss": 5.6442,
|
|
"mean_token_accuracy": 0.15498089045286179,
|
|
"num_tokens": 15767412.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 5.702851438522339,
|
|
"epoch": 0.7179164041167821,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004954784800992379,
|
|
"loss": 5.6434,
|
|
"mean_token_accuracy": 0.1511929914355278,
|
|
"num_tokens": 15776813.0,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"entropy": 5.8534894466400145,
|
|
"epoch": 0.718336483931947,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004954725047703969,
|
|
"loss": 5.6771,
|
|
"mean_token_accuracy": 0.152647565305233,
|
|
"num_tokens": 15786258.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 5.836289310455323,
|
|
"epoch": 0.7187565637471119,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.000495466525535999,
|
|
"loss": 5.6667,
|
|
"mean_token_accuracy": 0.15143323093652725,
|
|
"num_tokens": 15795673.0,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"entropy": 5.811659526824951,
|
|
"epoch": 0.7191766435622768,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004954605423961501,
|
|
"loss": 5.6561,
|
|
"mean_token_accuracy": 0.15157762318849563,
|
|
"num_tokens": 15805050.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 5.681427240371704,
|
|
"epoch": 0.7195967233774417,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004954545553509562,
|
|
"loss": 5.606,
|
|
"mean_token_accuracy": 0.16409880369901658,
|
|
"num_tokens": 15813347.0,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"entropy": 5.839797496795654,
|
|
"epoch": 0.7200168031926066,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004954485644005235,
|
|
"loss": 5.7266,
|
|
"mean_token_accuracy": 0.1489485539495945,
|
|
"num_tokens": 15823528.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 5.8334362506866455,
|
|
"epoch": 0.7204368830077714,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004954425695449578,
|
|
"loss": 5.6173,
|
|
"mean_token_accuracy": 0.15086468532681466,
|
|
"num_tokens": 15832727.0,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"entropy": 5.822533702850341,
|
|
"epoch": 0.7208569628229363,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004954365707843657,
|
|
"loss": 5.6976,
|
|
"mean_token_accuracy": 0.14436446502804756,
|
|
"num_tokens": 15842402.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 5.748192930221558,
|
|
"epoch": 0.7212770426381012,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004954305681188531,
|
|
"loss": 5.5623,
|
|
"mean_token_accuracy": 0.1519525095820427,
|
|
"num_tokens": 15850886.0,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"entropy": 5.9683891296386715,
|
|
"epoch": 0.7216971224532661,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004954245615485265,
|
|
"loss": 5.8576,
|
|
"mean_token_accuracy": 0.14881062209606172,
|
|
"num_tokens": 15860093.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 5.825228261947632,
|
|
"epoch": 0.722117202268431,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004954185510734924,
|
|
"loss": 5.5603,
|
|
"mean_token_accuracy": 0.15691882967948914,
|
|
"num_tokens": 15868681.0,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"entropy": 5.775141906738281,
|
|
"epoch": 0.7225372820835959,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004954125366938571,
|
|
"loss": 5.6425,
|
|
"mean_token_accuracy": 0.15889365077018738,
|
|
"num_tokens": 15878041.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 5.759042358398437,
|
|
"epoch": 0.7229573618987608,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004954065184097271,
|
|
"loss": 5.6357,
|
|
"mean_token_accuracy": 0.15483569353818893,
|
|
"num_tokens": 15887562.0,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"entropy": 5.751525020599365,
|
|
"epoch": 0.7233774417139256,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004954004962212092,
|
|
"loss": 5.5541,
|
|
"mean_token_accuracy": 0.1643654190003872,
|
|
"num_tokens": 15896480.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 5.911052465438843,
|
|
"epoch": 0.7237975215290905,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004953944701284101,
|
|
"loss": 5.7752,
|
|
"mean_token_accuracy": 0.1463731437921524,
|
|
"num_tokens": 15906743.0,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"entropy": 5.830478382110596,
|
|
"epoch": 0.7242176013442554,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004953884401314363,
|
|
"loss": 5.7213,
|
|
"mean_token_accuracy": 0.13995275720953942,
|
|
"num_tokens": 15915981.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 5.8113525867462155,
|
|
"epoch": 0.7246376811594203,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004953824062303949,
|
|
"loss": 5.5765,
|
|
"mean_token_accuracy": 0.1530995100736618,
|
|
"num_tokens": 15924117.0,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"entropy": 5.7734462261199955,
|
|
"epoch": 0.7250577609745852,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004953763684253926,
|
|
"loss": 5.6054,
|
|
"mean_token_accuracy": 0.16219132840633393,
|
|
"num_tokens": 15933124.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 5.7224249839782715,
|
|
"epoch": 0.7254778407897501,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004953703267165364,
|
|
"loss": 5.5024,
|
|
"mean_token_accuracy": 0.1558832585811615,
|
|
"num_tokens": 15942422.0,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"entropy": 5.749732875823975,
|
|
"epoch": 0.725897920604915,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004953642811039332,
|
|
"loss": 5.7128,
|
|
"mean_token_accuracy": 0.14854123890399934,
|
|
"num_tokens": 15950989.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 5.855362319946289,
|
|
"epoch": 0.7263180004200798,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004953582315876904,
|
|
"loss": 5.7185,
|
|
"mean_token_accuracy": 0.15013131573796273,
|
|
"num_tokens": 15959659.0,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"entropy": 5.837911462783813,
|
|
"epoch": 0.7267380802352447,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.000495352178167915,
|
|
"loss": 5.5977,
|
|
"mean_token_accuracy": 0.16410948783159257,
|
|
"num_tokens": 15968102.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 5.854554653167725,
|
|
"epoch": 0.7271581600504096,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004953461208447143,
|
|
"loss": 5.7132,
|
|
"mean_token_accuracy": 0.14808624759316444,
|
|
"num_tokens": 15977705.0,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"entropy": 5.801808023452759,
|
|
"epoch": 0.7275782398655745,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004953400596181953,
|
|
"loss": 5.7244,
|
|
"mean_token_accuracy": 0.1447308510541916,
|
|
"num_tokens": 15986703.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 5.839752292633056,
|
|
"epoch": 0.7279983196807394,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004953339944884657,
|
|
"loss": 5.6309,
|
|
"mean_token_accuracy": 0.15707603991031646,
|
|
"num_tokens": 15995672.0,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"entropy": 5.702234554290771,
|
|
"epoch": 0.7284183994959043,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004953279254556329,
|
|
"loss": 5.5683,
|
|
"mean_token_accuracy": 0.16529579162597657,
|
|
"num_tokens": 16004437.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 5.786400604248047,
|
|
"epoch": 0.7288384793110692,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004953218525198043,
|
|
"loss": 5.6136,
|
|
"mean_token_accuracy": 0.1482889771461487,
|
|
"num_tokens": 16012847.0,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"entropy": 5.820078039169312,
|
|
"epoch": 0.7292585591262339,
|
|
"grad_norm": 9.3125,
|
|
"learning_rate": 0.0004953157756810876,
|
|
"loss": 5.6444,
|
|
"mean_token_accuracy": 0.15196260213851928,
|
|
"num_tokens": 16022213.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 5.784472417831421,
|
|
"epoch": 0.7296786389413988,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004953096949395902,
|
|
"loss": 5.6938,
|
|
"mean_token_accuracy": 0.15605147629976274,
|
|
"num_tokens": 16031411.0,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"entropy": 5.822618913650513,
|
|
"epoch": 0.7300987187565637,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004953036102954202,
|
|
"loss": 5.7282,
|
|
"mean_token_accuracy": 0.14967211931943894,
|
|
"num_tokens": 16041227.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 5.778734588623047,
|
|
"epoch": 0.7305187985717286,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004952975217486852,
|
|
"loss": 5.5479,
|
|
"mean_token_accuracy": 0.1602558448910713,
|
|
"num_tokens": 16049777.0,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"entropy": 5.83000955581665,
|
|
"epoch": 0.7309388783868935,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.0004952914292994928,
|
|
"loss": 5.659,
|
|
"mean_token_accuracy": 0.15439933240413667,
|
|
"num_tokens": 16059093.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 5.840744495391846,
|
|
"epoch": 0.7313589582020584,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004952853329479514,
|
|
"loss": 5.6861,
|
|
"mean_token_accuracy": 0.15537820011377335,
|
|
"num_tokens": 16068550.0,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"entropy": 5.810123777389526,
|
|
"epoch": 0.7317790380172233,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004952792326941686,
|
|
"loss": 5.7191,
|
|
"mean_token_accuracy": 0.14849043488502503,
|
|
"num_tokens": 16078286.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 5.814086198806763,
|
|
"epoch": 0.7321991178323881,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004952731285382527,
|
|
"loss": 5.6667,
|
|
"mean_token_accuracy": 0.15178068578243256,
|
|
"num_tokens": 16087560.0,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"entropy": 5.787434864044189,
|
|
"epoch": 0.732619197647553,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004952670204803118,
|
|
"loss": 5.6204,
|
|
"mean_token_accuracy": 0.1559364140033722,
|
|
"num_tokens": 16097478.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 5.850944232940674,
|
|
"epoch": 0.7330392774627179,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004952609085204539,
|
|
"loss": 5.7189,
|
|
"mean_token_accuracy": 0.15533626079559326,
|
|
"num_tokens": 16106884.0,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"entropy": 5.731724834442138,
|
|
"epoch": 0.7334593572778828,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004952547926587876,
|
|
"loss": 5.6334,
|
|
"mean_token_accuracy": 0.15004593282938003,
|
|
"num_tokens": 16115689.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 5.7415611743927,
|
|
"epoch": 0.7338794370930477,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0004952486728954209,
|
|
"loss": 5.5761,
|
|
"mean_token_accuracy": 0.1599406212568283,
|
|
"num_tokens": 16125237.0,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"entropy": 5.7435039520263675,
|
|
"epoch": 0.7342995169082126,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004952425492304624,
|
|
"loss": 5.5816,
|
|
"mean_token_accuracy": 0.15830608755350112,
|
|
"num_tokens": 16133940.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 5.803058242797851,
|
|
"epoch": 0.7347195967233774,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004952364216640207,
|
|
"loss": 5.6865,
|
|
"mean_token_accuracy": 0.15288463681936265,
|
|
"num_tokens": 16143256.0,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"entropy": 5.834009265899658,
|
|
"epoch": 0.7351396765385423,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000495230290196204,
|
|
"loss": 5.5648,
|
|
"mean_token_accuracy": 0.15222593396902084,
|
|
"num_tokens": 16153259.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 5.86444673538208,
|
|
"epoch": 0.7355597563537072,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004952241548271212,
|
|
"loss": 5.8055,
|
|
"mean_token_accuracy": 0.14142679050564766,
|
|
"num_tokens": 16162125.0,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"entropy": 5.84849967956543,
|
|
"epoch": 0.7359798361688721,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004952180155568809,
|
|
"loss": 5.7224,
|
|
"mean_token_accuracy": 0.14703101068735122,
|
|
"num_tokens": 16171680.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 5.853292989730835,
|
|
"epoch": 0.736399915984037,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004952118723855919,
|
|
"loss": 5.7153,
|
|
"mean_token_accuracy": 0.15350899547338487,
|
|
"num_tokens": 16181559.0,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"entropy": 5.755408191680909,
|
|
"epoch": 0.7368199957992019,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004952057253133628,
|
|
"loss": 5.668,
|
|
"mean_token_accuracy": 0.15180395692586898,
|
|
"num_tokens": 16190611.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 5.833858060836792,
|
|
"epoch": 0.7372400756143668,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004951995743403028,
|
|
"loss": 5.6769,
|
|
"mean_token_accuracy": 0.15253981202840805,
|
|
"num_tokens": 16200156.0,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"entropy": 5.824840307235718,
|
|
"epoch": 0.7376601554295316,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004951934194665208,
|
|
"loss": 5.6458,
|
|
"mean_token_accuracy": 0.14709821194410325,
|
|
"num_tokens": 16209808.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 5.756002902984619,
|
|
"epoch": 0.7380802352446965,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004951872606921257,
|
|
"loss": 5.6136,
|
|
"mean_token_accuracy": 0.15270906686782837,
|
|
"num_tokens": 16219243.0,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"entropy": 5.72284197807312,
|
|
"epoch": 0.7385003150598614,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004951810980172265,
|
|
"loss": 5.627,
|
|
"mean_token_accuracy": 0.1641955330967903,
|
|
"num_tokens": 16228180.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 5.785319805145264,
|
|
"epoch": 0.7389203948750263,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004951749314419327,
|
|
"loss": 5.6417,
|
|
"mean_token_accuracy": 0.15115589275956154,
|
|
"num_tokens": 16237045.0,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"entropy": 5.791619110107422,
|
|
"epoch": 0.7393404746901912,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004951687609663533,
|
|
"loss": 5.5589,
|
|
"mean_token_accuracy": 0.15952047407627107,
|
|
"num_tokens": 16245307.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 5.765593528747559,
|
|
"epoch": 0.739760554505356,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004951625865905977,
|
|
"loss": 5.5974,
|
|
"mean_token_accuracy": 0.14921371787786483,
|
|
"num_tokens": 16255047.0,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"entropy": 5.749333095550537,
|
|
"epoch": 0.740180634320521,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004951564083147753,
|
|
"loss": 5.6447,
|
|
"mean_token_accuracy": 0.1600167080760002,
|
|
"num_tokens": 16264969.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 5.81842737197876,
|
|
"epoch": 0.7406007141356857,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004951502261389953,
|
|
"loss": 5.7327,
|
|
"mean_token_accuracy": 0.14656912833452224,
|
|
"num_tokens": 16274757.0,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"entropy": 5.780880069732666,
|
|
"epoch": 0.7410207939508506,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004951440400633677,
|
|
"loss": 5.6351,
|
|
"mean_token_accuracy": 0.16265199482440948,
|
|
"num_tokens": 16283409.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 5.687593412399292,
|
|
"epoch": 0.7414408737660155,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004951378500880015,
|
|
"loss": 5.5962,
|
|
"mean_token_accuracy": 0.1549723207950592,
|
|
"num_tokens": 16293206.0,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"entropy": 5.82498950958252,
|
|
"epoch": 0.7418609535811804,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004951316562130067,
|
|
"loss": 5.6332,
|
|
"mean_token_accuracy": 0.15318880528211593,
|
|
"num_tokens": 16303121.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 5.778778553009033,
|
|
"epoch": 0.7422810333963453,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000495125458438493,
|
|
"loss": 5.5975,
|
|
"mean_token_accuracy": 0.16230110377073287,
|
|
"num_tokens": 16312710.0,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"entropy": 5.8864704132080075,
|
|
"epoch": 0.7427011132115102,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004951192567645702,
|
|
"loss": 5.7853,
|
|
"mean_token_accuracy": 0.14685365781188012,
|
|
"num_tokens": 16322280.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 5.721866273880005,
|
|
"epoch": 0.7431211930266751,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004951130511913481,
|
|
"loss": 5.635,
|
|
"mean_token_accuracy": 0.15453375428915023,
|
|
"num_tokens": 16331656.0,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"entropy": 5.7635541439056395,
|
|
"epoch": 0.7435412728418399,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004951068417189366,
|
|
"loss": 5.6607,
|
|
"mean_token_accuracy": 0.15400536656379699,
|
|
"num_tokens": 16341074.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 5.806599426269531,
|
|
"epoch": 0.7439613526570048,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004951006283474457,
|
|
"loss": 5.6525,
|
|
"mean_token_accuracy": 0.15177395343780517,
|
|
"num_tokens": 16350097.0,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"entropy": 5.6168114185333256,
|
|
"epoch": 0.7443814324721697,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004950944110769856,
|
|
"loss": 5.5518,
|
|
"mean_token_accuracy": 0.16385273784399032,
|
|
"num_tokens": 16359274.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 5.655103158950806,
|
|
"epoch": 0.7448015122873346,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004950881899076663,
|
|
"loss": 5.5365,
|
|
"mean_token_accuracy": 0.1682687819004059,
|
|
"num_tokens": 16368445.0,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"entropy": 5.878038167953491,
|
|
"epoch": 0.7452215921024995,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004950819648395979,
|
|
"loss": 5.6423,
|
|
"mean_token_accuracy": 0.1565190926194191,
|
|
"num_tokens": 16377689.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 5.772777366638183,
|
|
"epoch": 0.7456416719176644,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000495075735872891,
|
|
"loss": 5.5949,
|
|
"mean_token_accuracy": 0.1571029394865036,
|
|
"num_tokens": 16386713.0,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"entropy": 5.772426748275757,
|
|
"epoch": 0.7460617517328293,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004950695030076557,
|
|
"loss": 5.6116,
|
|
"mean_token_accuracy": 0.152817103266716,
|
|
"num_tokens": 16395390.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 5.862038803100586,
|
|
"epoch": 0.7464818315479941,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004950632662440027,
|
|
"loss": 5.6909,
|
|
"mean_token_accuracy": 0.15143778100609778,
|
|
"num_tokens": 16404531.0,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"entropy": 5.734190988540649,
|
|
"epoch": 0.746901911363159,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004950570255820419,
|
|
"loss": 5.5892,
|
|
"mean_token_accuracy": 0.15557831078767775,
|
|
"num_tokens": 16413649.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 5.679434442520142,
|
|
"epoch": 0.7473219911783239,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004950507810218843,
|
|
"loss": 5.7074,
|
|
"mean_token_accuracy": 0.14878712072968484,
|
|
"num_tokens": 16423247.0,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"entropy": 5.8338196754455565,
|
|
"epoch": 0.7477420709934888,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004950445325636405,
|
|
"loss": 5.649,
|
|
"mean_token_accuracy": 0.14864842891693114,
|
|
"num_tokens": 16432190.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 5.864486503601074,
|
|
"epoch": 0.7481621508086537,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004950382802074211,
|
|
"loss": 5.6038,
|
|
"mean_token_accuracy": 0.15934911370277405,
|
|
"num_tokens": 16443091.0,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"entropy": 5.711412811279297,
|
|
"epoch": 0.7485822306238186,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004950320239533369,
|
|
"loss": 5.6338,
|
|
"mean_token_accuracy": 0.15670278668403625,
|
|
"num_tokens": 16452077.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 5.8399248123168945,
|
|
"epoch": 0.7490023104389834,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004950257638014986,
|
|
"loss": 5.7602,
|
|
"mean_token_accuracy": 0.14474717825651168,
|
|
"num_tokens": 16461893.0,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"entropy": 5.905817985534668,
|
|
"epoch": 0.7494223902541483,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004950194997520172,
|
|
"loss": 5.5814,
|
|
"mean_token_accuracy": 0.1564013957977295,
|
|
"num_tokens": 16470904.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 5.779659080505371,
|
|
"epoch": 0.7498424700693131,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004950132318050037,
|
|
"loss": 5.6502,
|
|
"mean_token_accuracy": 0.14872682839632034,
|
|
"num_tokens": 16480130.0,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"entropy": 5.735926008224487,
|
|
"epoch": 0.750262549884478,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004950069599605691,
|
|
"loss": 5.7004,
|
|
"mean_token_accuracy": 0.1561155989766121,
|
|
"num_tokens": 16489485.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 5.7690812110900875,
|
|
"epoch": 0.750682629699643,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004950006842188245,
|
|
"loss": 5.6526,
|
|
"mean_token_accuracy": 0.15704655051231384,
|
|
"num_tokens": 16498529.0,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"entropy": 5.776333618164062,
|
|
"epoch": 0.7511027095148078,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.000494994404579881,
|
|
"loss": 5.5733,
|
|
"mean_token_accuracy": 0.1540952205657959,
|
|
"num_tokens": 16508094.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 5.810970735549927,
|
|
"epoch": 0.7515227893299727,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00049498812104385,
|
|
"loss": 5.6854,
|
|
"mean_token_accuracy": 0.14840709492564202,
|
|
"num_tokens": 16517620.0,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"entropy": 5.717817068099976,
|
|
"epoch": 0.7519428691451375,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004949818336108425,
|
|
"loss": 5.6743,
|
|
"mean_token_accuracy": 0.1453969433903694,
|
|
"num_tokens": 16526720.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 5.782077789306641,
|
|
"epoch": 0.7523629489603024,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004949755422809703,
|
|
"loss": 5.6349,
|
|
"mean_token_accuracy": 0.15297809839248658,
|
|
"num_tokens": 16535979.0,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"entropy": 5.789309072494507,
|
|
"epoch": 0.7527830287754673,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004949692470543446,
|
|
"loss": 5.518,
|
|
"mean_token_accuracy": 0.16405045241117477,
|
|
"num_tokens": 16544538.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 5.700740957260132,
|
|
"epoch": 0.7532031085906322,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004949629479310769,
|
|
"loss": 5.6021,
|
|
"mean_token_accuracy": 0.15271754264831544,
|
|
"num_tokens": 16553962.0,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"entropy": 5.7723414421081545,
|
|
"epoch": 0.7536231884057971,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004949566449112788,
|
|
"loss": 5.5341,
|
|
"mean_token_accuracy": 0.1600716605782509,
|
|
"num_tokens": 16562652.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 5.816875839233399,
|
|
"epoch": 0.754043268220962,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004949503379950621,
|
|
"loss": 5.6381,
|
|
"mean_token_accuracy": 0.15340977758169175,
|
|
"num_tokens": 16570887.0,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"entropy": 5.825795125961304,
|
|
"epoch": 0.7544633480361269,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004949440271825385,
|
|
"loss": 5.7669,
|
|
"mean_token_accuracy": 0.15065207779407502,
|
|
"num_tokens": 16581469.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 5.783386135101319,
|
|
"epoch": 0.7548834278512917,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004949377124738196,
|
|
"loss": 5.6376,
|
|
"mean_token_accuracy": 0.15028667375445365,
|
|
"num_tokens": 16590213.0,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"entropy": 5.759113931655884,
|
|
"epoch": 0.7553035076664566,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004949313938690174,
|
|
"loss": 5.6301,
|
|
"mean_token_accuracy": 0.1542770192027092,
|
|
"num_tokens": 16598384.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 5.692385244369507,
|
|
"epoch": 0.7557235874816215,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004949250713682438,
|
|
"loss": 5.6114,
|
|
"mean_token_accuracy": 0.15893905013799667,
|
|
"num_tokens": 16607670.0,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"entropy": 5.830786418914795,
|
|
"epoch": 0.7561436672967864,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004949187449716107,
|
|
"loss": 5.6932,
|
|
"mean_token_accuracy": 0.15244348496198654,
|
|
"num_tokens": 16617560.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.7561436672967864,
|
|
"eval_entropy": 5.638838640603793,
|
|
"eval_loss": 5.66161584854126,
|
|
"eval_mean_token_accuracy": 0.1600216546673523,
|
|
"eval_num_tokens": 16617560.0,
|
|
"eval_runtime": 27.3107,
|
|
"eval_samples_per_second": 1368.184,
|
|
"eval_steps_per_second": 171.032,
|
|
"step": 9000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 119020,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.4315411898368e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|