48123 lines
1.3 MiB
48123 lines
1.3 MiB
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.0162990968283974,
|
|
"eval_steps": 3000,
|
|
"global_step": 24000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 10.69201192855835,
|
|
"epoch": 0.0004200798151648813,
|
|
"grad_norm": 13.375,
|
|
"learning_rate": 2e-06,
|
|
"loss": 10.8001,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 8348.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 10.691978454589844,
|
|
"epoch": 0.0008401596303297626,
|
|
"grad_norm": 12.5,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 10.7548,
|
|
"mean_token_accuracy": 0.00010881392518058419,
|
|
"num_tokens": 17465.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 10.691164684295654,
|
|
"epoch": 0.001260239445494644,
|
|
"grad_norm": 9.9375,
|
|
"learning_rate": 7e-06,
|
|
"loss": 10.5365,
|
|
"mean_token_accuracy": 0.021085147676058114,
|
|
"num_tokens": 26627.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 10.678658771514893,
|
|
"epoch": 0.0016803192606595252,
|
|
"grad_norm": 6.46875,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 10.2026,
|
|
"mean_token_accuracy": 0.046403773874044416,
|
|
"num_tokens": 36069.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 10.598964595794678,
|
|
"epoch": 0.002100399075824407,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 9.8984,
|
|
"mean_token_accuracy": 0.04546841159462929,
|
|
"num_tokens": 44967.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 10.592682838439941,
|
|
"epoch": 0.002520478890989288,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 9.8253,
|
|
"mean_token_accuracy": 0.04163686409592628,
|
|
"num_tokens": 55132.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.616032028198243,
|
|
"epoch": 0.0029405587061541692,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 9.6909,
|
|
"mean_token_accuracy": 0.04541983306407928,
|
|
"num_tokens": 65141.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.587666893005371,
|
|
"epoch": 0.0033606385213190504,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 9.6967,
|
|
"mean_token_accuracy": 0.040509892627596855,
|
|
"num_tokens": 74007.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.587863063812256,
|
|
"epoch": 0.003780718336483932,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 9.6278,
|
|
"mean_token_accuracy": 0.04380051270127296,
|
|
"num_tokens": 83736.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.581284713745116,
|
|
"epoch": 0.004200798151648814,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.5554,
|
|
"mean_token_accuracy": 0.04462047629058361,
|
|
"num_tokens": 92525.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.579821586608887,
|
|
"epoch": 0.004620877966813695,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.5042,
|
|
"mean_token_accuracy": 0.0499776991084218,
|
|
"num_tokens": 102015.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.527470588684082,
|
|
"epoch": 0.005040957781978576,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.4648,
|
|
"mean_token_accuracy": 0.05102687180042267,
|
|
"num_tokens": 110887.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.398450374603271,
|
|
"epoch": 0.005461037597143457,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.3768,
|
|
"mean_token_accuracy": 0.05401572398841381,
|
|
"num_tokens": 120442.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.466637897491456,
|
|
"epoch": 0.0058811174123083385,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.2516,
|
|
"mean_token_accuracy": 0.05276094898581505,
|
|
"num_tokens": 129297.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.477723217010498,
|
|
"epoch": 0.00630119722747322,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.1585,
|
|
"mean_token_accuracy": 0.05686353407800197,
|
|
"num_tokens": 138305.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.401033782958985,
|
|
"epoch": 0.006721277042638101,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.0976,
|
|
"mean_token_accuracy": 0.055690228939056396,
|
|
"num_tokens": 147640.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.44783878326416,
|
|
"epoch": 0.007141356857802983,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 8.9803,
|
|
"mean_token_accuracy": 0.05669833719730377,
|
|
"num_tokens": 157633.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.396310806274414,
|
|
"epoch": 0.007561436672967864,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 8.9499,
|
|
"mean_token_accuracy": 0.05056734494864941,
|
|
"num_tokens": 167984.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.333494663238525,
|
|
"epoch": 0.007981516488132745,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.8301,
|
|
"mean_token_accuracy": 0.06639725379645825,
|
|
"num_tokens": 176984.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.28737268447876,
|
|
"epoch": 0.008401596303297627,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.654,
|
|
"mean_token_accuracy": 0.06538619883358479,
|
|
"num_tokens": 185931.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 10.208460235595703,
|
|
"epoch": 0.008821676118462508,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.6478,
|
|
"mean_token_accuracy": 0.050938266515731814,
|
|
"num_tokens": 195065.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 10.092334175109864,
|
|
"epoch": 0.00924175593362739,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.5099,
|
|
"mean_token_accuracy": 0.06477361544966698,
|
|
"num_tokens": 203687.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 10.105284690856934,
|
|
"epoch": 0.00966183574879227,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.4081,
|
|
"mean_token_accuracy": 0.0666894868016243,
|
|
"num_tokens": 212847.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 9.957781219482422,
|
|
"epoch": 0.010081915563957152,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.3004,
|
|
"mean_token_accuracy": 0.0674133587628603,
|
|
"num_tokens": 222593.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.889359092712402,
|
|
"epoch": 0.010501995379122032,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.129,
|
|
"mean_token_accuracy": 0.07197456955909728,
|
|
"num_tokens": 231174.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.669556808471679,
|
|
"epoch": 0.010922075194286915,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 7.9843,
|
|
"mean_token_accuracy": 0.07425511926412583,
|
|
"num_tokens": 239833.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.519672775268555,
|
|
"epoch": 0.011342155009451797,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.0143,
|
|
"mean_token_accuracy": 0.07254141308367253,
|
|
"num_tokens": 248794.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.303325176239014,
|
|
"epoch": 0.011762234824616677,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.9537,
|
|
"mean_token_accuracy": 0.07010119631886483,
|
|
"num_tokens": 257123.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 9.143257808685302,
|
|
"epoch": 0.012182314639781559,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.6458,
|
|
"mean_token_accuracy": 0.07959595024585724,
|
|
"num_tokens": 266088.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.888239574432372,
|
|
"epoch": 0.01260239445494644,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.8236,
|
|
"mean_token_accuracy": 0.07102414257824421,
|
|
"num_tokens": 276074.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.727731895446777,
|
|
"epoch": 0.013022474270111321,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.7082,
|
|
"mean_token_accuracy": 0.07570267021656037,
|
|
"num_tokens": 285280.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.563877964019776,
|
|
"epoch": 0.013442554085276202,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.6962,
|
|
"mean_token_accuracy": 0.06895132511854171,
|
|
"num_tokens": 296115.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.412875747680664,
|
|
"epoch": 0.013862633900441084,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.5497,
|
|
"mean_token_accuracy": 0.07601302340626717,
|
|
"num_tokens": 305483.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.340911769866944,
|
|
"epoch": 0.014282713715605966,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.5593,
|
|
"mean_token_accuracy": 0.07040085420012474,
|
|
"num_tokens": 314000.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.245043659210205,
|
|
"epoch": 0.014702793530770846,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.5541,
|
|
"mean_token_accuracy": 0.07777635231614113,
|
|
"num_tokens": 323667.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.15629415512085,
|
|
"epoch": 0.015122873345935728,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.5554,
|
|
"mean_token_accuracy": 0.07515333034098148,
|
|
"num_tokens": 332695.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.065321111679078,
|
|
"epoch": 0.015542953161100609,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.3947,
|
|
"mean_token_accuracy": 0.07709791958332061,
|
|
"num_tokens": 342428.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.054158020019532,
|
|
"epoch": 0.01596303297626549,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.5079,
|
|
"mean_token_accuracy": 0.0735605925321579,
|
|
"num_tokens": 353587.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.988022661209106,
|
|
"epoch": 0.01638311279143037,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.443,
|
|
"mean_token_accuracy": 0.07551693692803382,
|
|
"num_tokens": 362997.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.02585473060608,
|
|
"epoch": 0.016803192606595255,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.4821,
|
|
"mean_token_accuracy": 0.07873391062021255,
|
|
"num_tokens": 372346.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.984146022796631,
|
|
"epoch": 0.017223272421760135,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.3473,
|
|
"mean_token_accuracy": 0.07624267861247062,
|
|
"num_tokens": 381575.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.912975454330445,
|
|
"epoch": 0.017643352236925015,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.4236,
|
|
"mean_token_accuracy": 0.0766436841338873,
|
|
"num_tokens": 390706.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.888600492477417,
|
|
"epoch": 0.018063432052089896,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.4209,
|
|
"mean_token_accuracy": 0.0734835498034954,
|
|
"num_tokens": 400000.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.803367996215821,
|
|
"epoch": 0.01848351186725478,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.3774,
|
|
"mean_token_accuracy": 0.08182684779167175,
|
|
"num_tokens": 409447.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.875886058807373,
|
|
"epoch": 0.01890359168241966,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.3393,
|
|
"mean_token_accuracy": 0.08449244052171707,
|
|
"num_tokens": 418417.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.78724856376648,
|
|
"epoch": 0.01932367149758454,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.3048,
|
|
"mean_token_accuracy": 0.08006256446242332,
|
|
"num_tokens": 427619.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.736767053604126,
|
|
"epoch": 0.019743751312749424,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.372,
|
|
"mean_token_accuracy": 0.07579129710793495,
|
|
"num_tokens": 437931.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.841858673095703,
|
|
"epoch": 0.020163831127914304,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.4001,
|
|
"mean_token_accuracy": 0.08351109325885772,
|
|
"num_tokens": 447595.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.7983135223388675,
|
|
"epoch": 0.020583910943079185,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.2633,
|
|
"mean_token_accuracy": 0.07488272562623025,
|
|
"num_tokens": 457062.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.813820743560791,
|
|
"epoch": 0.021003990758244065,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.3567,
|
|
"mean_token_accuracy": 0.07759504988789559,
|
|
"num_tokens": 466191.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.757200431823731,
|
|
"epoch": 0.02142407057340895,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.3146,
|
|
"mean_token_accuracy": 0.08031945005059242,
|
|
"num_tokens": 475693.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.7279805660247805,
|
|
"epoch": 0.02184415038857383,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.3269,
|
|
"mean_token_accuracy": 0.08141026981174945,
|
|
"num_tokens": 485173.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.724671411514282,
|
|
"epoch": 0.02226423020373871,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.2369,
|
|
"mean_token_accuracy": 0.083962532132864,
|
|
"num_tokens": 493985.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.6601485252380375,
|
|
"epoch": 0.022684310018903593,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.2687,
|
|
"mean_token_accuracy": 0.08190520852804184,
|
|
"num_tokens": 502837.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.751116943359375,
|
|
"epoch": 0.023104389834068473,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.2065,
|
|
"mean_token_accuracy": 0.0843705341219902,
|
|
"num_tokens": 511503.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.717013120651245,
|
|
"epoch": 0.023524469649233354,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.4058,
|
|
"mean_token_accuracy": 0.08034609854221345,
|
|
"num_tokens": 521499.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.592406368255615,
|
|
"epoch": 0.023944549464398234,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.166,
|
|
"mean_token_accuracy": 0.08277052193880081,
|
|
"num_tokens": 530067.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.6297852993011475,
|
|
"epoch": 0.024364629279563118,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.1721,
|
|
"mean_token_accuracy": 0.08475914299488067,
|
|
"num_tokens": 538559.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.705462646484375,
|
|
"epoch": 0.024784709094728,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.3653,
|
|
"mean_token_accuracy": 0.07328721843659877,
|
|
"num_tokens": 547288.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.596541261672973,
|
|
"epoch": 0.02520478890989288,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.2357,
|
|
"mean_token_accuracy": 0.07816045507788658,
|
|
"num_tokens": 557269.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.701767444610596,
|
|
"epoch": 0.025624868725057762,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.2628,
|
|
"mean_token_accuracy": 0.07311495915055274,
|
|
"num_tokens": 567280.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.602482271194458,
|
|
"epoch": 0.026044948540222643,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.0908,
|
|
"mean_token_accuracy": 0.08299101889133453,
|
|
"num_tokens": 576609.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.399111747741699,
|
|
"epoch": 0.026465028355387523,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.0032,
|
|
"mean_token_accuracy": 0.09095181971788406,
|
|
"num_tokens": 586053.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.507453203201294,
|
|
"epoch": 0.026885108170552403,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.203,
|
|
"mean_token_accuracy": 0.08823259696364402,
|
|
"num_tokens": 594649.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.599713850021362,
|
|
"epoch": 0.027305187985717287,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.1383,
|
|
"mean_token_accuracy": 0.08195743858814239,
|
|
"num_tokens": 603445.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.587759685516358,
|
|
"epoch": 0.027725267800882167,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.2543,
|
|
"mean_token_accuracy": 0.07800514288246632,
|
|
"num_tokens": 613611.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.745543384552002,
|
|
"epoch": 0.028145347616047048,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.429,
|
|
"mean_token_accuracy": 0.07839688062667846,
|
|
"num_tokens": 623024.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.4431709289550785,
|
|
"epoch": 0.02856542743121193,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.1028,
|
|
"mean_token_accuracy": 0.08672705665230751,
|
|
"num_tokens": 631624.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.574361371994018,
|
|
"epoch": 0.028985507246376812,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.0557,
|
|
"mean_token_accuracy": 0.08923942148685456,
|
|
"num_tokens": 640473.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.541849613189697,
|
|
"epoch": 0.029405587061541692,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.2383,
|
|
"mean_token_accuracy": 0.08173563033342361,
|
|
"num_tokens": 649692.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.571516275405884,
|
|
"epoch": 0.029825666876706573,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.1875,
|
|
"mean_token_accuracy": 0.08110572174191474,
|
|
"num_tokens": 658236.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.34685640335083,
|
|
"epoch": 0.030245746691871456,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.9645,
|
|
"mean_token_accuracy": 0.08569629490375519,
|
|
"num_tokens": 667175.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.556408214569092,
|
|
"epoch": 0.030665826507036337,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.2834,
|
|
"mean_token_accuracy": 0.08148858584463596,
|
|
"num_tokens": 676456.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.606632947921753,
|
|
"epoch": 0.031085906322201217,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.2448,
|
|
"mean_token_accuracy": 0.08052070513367653,
|
|
"num_tokens": 686881.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.371811389923096,
|
|
"epoch": 0.0315059861373661,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.0307,
|
|
"mean_token_accuracy": 0.08108055517077446,
|
|
"num_tokens": 696045.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.382633686065674,
|
|
"epoch": 0.03192606595253098,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.003,
|
|
"mean_token_accuracy": 0.09089459106326103,
|
|
"num_tokens": 704729.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.353933048248291,
|
|
"epoch": 0.032346145767695865,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.0639,
|
|
"mean_token_accuracy": 0.08123919740319252,
|
|
"num_tokens": 714331.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.430750465393066,
|
|
"epoch": 0.03276622558286074,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.0163,
|
|
"mean_token_accuracy": 0.08898987770080566,
|
|
"num_tokens": 722788.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.388132476806641,
|
|
"epoch": 0.033186305398025626,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.0996,
|
|
"mean_token_accuracy": 0.0889863982796669,
|
|
"num_tokens": 731417.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.394377708435059,
|
|
"epoch": 0.03360638521319051,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.0686,
|
|
"mean_token_accuracy": 0.0865507885813713,
|
|
"num_tokens": 741034.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.370957660675049,
|
|
"epoch": 0.034026465028355386,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.063,
|
|
"mean_token_accuracy": 0.08408316597342491,
|
|
"num_tokens": 749596.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.360737991333008,
|
|
"epoch": 0.03444654484352027,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.0166,
|
|
"mean_token_accuracy": 0.08443826884031295,
|
|
"num_tokens": 758931.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.253893661499023,
|
|
"epoch": 0.03486662465868515,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.9221,
|
|
"mean_token_accuracy": 0.08874604031443596,
|
|
"num_tokens": 767534.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.336139726638794,
|
|
"epoch": 0.03528670447385003,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.9742,
|
|
"mean_token_accuracy": 0.08901742175221443,
|
|
"num_tokens": 776456.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.32063570022583,
|
|
"epoch": 0.035706784289014915,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.0512,
|
|
"mean_token_accuracy": 0.0825334556400776,
|
|
"num_tokens": 786172.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.2836973667144775,
|
|
"epoch": 0.03612686410417979,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.9281,
|
|
"mean_token_accuracy": 0.09393875077366828,
|
|
"num_tokens": 795081.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.279390621185303,
|
|
"epoch": 0.036546943919344675,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.9729,
|
|
"mean_token_accuracy": 0.08336275964975357,
|
|
"num_tokens": 804259.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.3233130931854244,
|
|
"epoch": 0.03696702373450956,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.9836,
|
|
"mean_token_accuracy": 0.08346287980675697,
|
|
"num_tokens": 813463.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.265643119812012,
|
|
"epoch": 0.037387103549674436,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.915,
|
|
"mean_token_accuracy": 0.09436434507369995,
|
|
"num_tokens": 823029.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.2830162525177,
|
|
"epoch": 0.03780718336483932,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.9822,
|
|
"mean_token_accuracy": 0.08020757511258125,
|
|
"num_tokens": 832902.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.172808027267456,
|
|
"epoch": 0.0382272631800042,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.9269,
|
|
"mean_token_accuracy": 0.08937018439173698,
|
|
"num_tokens": 842162.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.261403322219849,
|
|
"epoch": 0.03864734299516908,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.9709,
|
|
"mean_token_accuracy": 0.09120814129710197,
|
|
"num_tokens": 852328.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.207744789123535,
|
|
"epoch": 0.039067422810333964,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.9283,
|
|
"mean_token_accuracy": 0.08966456726193428,
|
|
"num_tokens": 860929.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.253277540206909,
|
|
"epoch": 0.03948750262549885,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.0043,
|
|
"mean_token_accuracy": 0.0854820430278778,
|
|
"num_tokens": 869144.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.303921031951904,
|
|
"epoch": 0.039907582440663725,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.9451,
|
|
"mean_token_accuracy": 0.09673570543527603,
|
|
"num_tokens": 877447.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.20126519203186,
|
|
"epoch": 0.04032766225582861,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.9017,
|
|
"mean_token_accuracy": 0.08463463708758354,
|
|
"num_tokens": 887020.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.1618622779846195,
|
|
"epoch": 0.040747742070993485,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.9503,
|
|
"mean_token_accuracy": 0.08903224021196365,
|
|
"num_tokens": 895937.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.172050189971924,
|
|
"epoch": 0.04116782188615837,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.9573,
|
|
"mean_token_accuracy": 0.08436014279723167,
|
|
"num_tokens": 905446.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.1261190414428714,
|
|
"epoch": 0.04158790170132325,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.8507,
|
|
"mean_token_accuracy": 0.09782563373446465,
|
|
"num_tokens": 914547.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.219514274597168,
|
|
"epoch": 0.04200798151648813,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.8597,
|
|
"mean_token_accuracy": 0.09429225027561187,
|
|
"num_tokens": 922900.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.174054384231567,
|
|
"epoch": 0.042428061331653014,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.9026,
|
|
"mean_token_accuracy": 0.09461246877908706,
|
|
"num_tokens": 930876.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.149679851531983,
|
|
"epoch": 0.0428481411468179,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.9327,
|
|
"mean_token_accuracy": 0.09384474828839302,
|
|
"num_tokens": 939871.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.1536510467529295,
|
|
"epoch": 0.043268220961982774,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.9204,
|
|
"mean_token_accuracy": 0.08957441225647926,
|
|
"num_tokens": 948673.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.07887830734253,
|
|
"epoch": 0.04368830077714766,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 6.8686,
|
|
"mean_token_accuracy": 0.08727961704134941,
|
|
"num_tokens": 957603.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.11884388923645,
|
|
"epoch": 0.04410838059231254,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.9378,
|
|
"mean_token_accuracy": 0.08589621968567371,
|
|
"num_tokens": 967731.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.1688611030578615,
|
|
"epoch": 0.04452846040747742,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.9387,
|
|
"mean_token_accuracy": 0.09485394582152366,
|
|
"num_tokens": 977427.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.146421909332275,
|
|
"epoch": 0.0449485402226423,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.9243,
|
|
"mean_token_accuracy": 0.08625848963856697,
|
|
"num_tokens": 986758.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.25874433517456,
|
|
"epoch": 0.045368620037807186,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.92,
|
|
"mean_token_accuracy": 0.09832347258925438,
|
|
"num_tokens": 996377.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.057836389541626,
|
|
"epoch": 0.04578869985297206,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.9742,
|
|
"mean_token_accuracy": 0.08528567403554917,
|
|
"num_tokens": 1006483.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 6.995539855957031,
|
|
"epoch": 0.04620877966813695,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.8574,
|
|
"mean_token_accuracy": 0.08858747258782387,
|
|
"num_tokens": 1016132.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.106180238723755,
|
|
"epoch": 0.04662885948330183,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.7984,
|
|
"mean_token_accuracy": 0.09407598823308945,
|
|
"num_tokens": 1024970.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.142482328414917,
|
|
"epoch": 0.04704893929846671,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.8936,
|
|
"mean_token_accuracy": 0.08978619575500488,
|
|
"num_tokens": 1034335.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.139913558959961,
|
|
"epoch": 0.04746901911363159,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.9495,
|
|
"mean_token_accuracy": 0.0973325490951538,
|
|
"num_tokens": 1043954.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.08342981338501,
|
|
"epoch": 0.04788909892879647,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.8806,
|
|
"mean_token_accuracy": 0.09276892617344856,
|
|
"num_tokens": 1053554.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.0591119766235355,
|
|
"epoch": 0.04830917874396135,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.8354,
|
|
"mean_token_accuracy": 0.09314879402518272,
|
|
"num_tokens": 1062008.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.029165410995484,
|
|
"epoch": 0.048729258559126236,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.9074,
|
|
"mean_token_accuracy": 0.09056607261300087,
|
|
"num_tokens": 1070740.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.027670526504517,
|
|
"epoch": 0.04914933837429111,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.8895,
|
|
"mean_token_accuracy": 0.09351922869682312,
|
|
"num_tokens": 1079681.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.076567363739014,
|
|
"epoch": 0.049569418189456,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.7669,
|
|
"mean_token_accuracy": 0.0963557355105877,
|
|
"num_tokens": 1088979.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 6.955168056488037,
|
|
"epoch": 0.04998949800462088,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.7794,
|
|
"mean_token_accuracy": 0.09716788977384568,
|
|
"num_tokens": 1097870.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.0498795986175535,
|
|
"epoch": 0.05040957781978576,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.8985,
|
|
"mean_token_accuracy": 0.08934849128127098,
|
|
"num_tokens": 1107948.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.038954401016236,
|
|
"epoch": 0.05082965763495064,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.8034,
|
|
"mean_token_accuracy": 0.09711324200034141,
|
|
"num_tokens": 1117032.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.016556072235107,
|
|
"epoch": 0.051249737450115525,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.7736,
|
|
"mean_token_accuracy": 0.10140406414866447,
|
|
"num_tokens": 1127834.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.053543567657471,
|
|
"epoch": 0.0516698172652804,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.8664,
|
|
"mean_token_accuracy": 0.10583841800689697,
|
|
"num_tokens": 1137382.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 6.960672283172608,
|
|
"epoch": 0.052089897080445285,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.7295,
|
|
"mean_token_accuracy": 0.09906250685453415,
|
|
"num_tokens": 1146095.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 6.916978216171264,
|
|
"epoch": 0.05250997689561017,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.7648,
|
|
"mean_token_accuracy": 0.1004838652908802,
|
|
"num_tokens": 1154981.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 6.948708629608154,
|
|
"epoch": 0.052930056710775046,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.7765,
|
|
"mean_token_accuracy": 0.10312124192714692,
|
|
"num_tokens": 1164939.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.024917793273926,
|
|
"epoch": 0.05335013652593993,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.8939,
|
|
"mean_token_accuracy": 0.09090543612837791,
|
|
"num_tokens": 1174991.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.0208131790161135,
|
|
"epoch": 0.05377021634110481,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.9459,
|
|
"mean_token_accuracy": 0.08811391443014145,
|
|
"num_tokens": 1184885.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.984617424011231,
|
|
"epoch": 0.05419029615626969,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.8348,
|
|
"mean_token_accuracy": 0.09274234399199485,
|
|
"num_tokens": 1193637.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 6.901879405975341,
|
|
"epoch": 0.054610375971434574,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.6237,
|
|
"mean_token_accuracy": 0.10028594210743905,
|
|
"num_tokens": 1202188.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 6.964693355560303,
|
|
"epoch": 0.05503045578659945,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.7513,
|
|
"mean_token_accuracy": 0.09297072812914849,
|
|
"num_tokens": 1210768.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 6.921257066726684,
|
|
"epoch": 0.055450535601764335,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.7581,
|
|
"mean_token_accuracy": 0.09513410851359368,
|
|
"num_tokens": 1219819.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 6.969961500167846,
|
|
"epoch": 0.05587061541692922,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.8151,
|
|
"mean_token_accuracy": 0.08720013573765754,
|
|
"num_tokens": 1229703.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.008356428146362,
|
|
"epoch": 0.056290695232094096,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.8385,
|
|
"mean_token_accuracy": 0.09394309446215629,
|
|
"num_tokens": 1238942.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.041683959960937,
|
|
"epoch": 0.05671077504725898,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.8901,
|
|
"mean_token_accuracy": 0.0907767005264759,
|
|
"num_tokens": 1248943.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 6.869440269470215,
|
|
"epoch": 0.05713085486242386,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.7728,
|
|
"mean_token_accuracy": 0.09719423428177834,
|
|
"num_tokens": 1257761.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 6.80675859451294,
|
|
"epoch": 0.05755093467758874,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.722,
|
|
"mean_token_accuracy": 0.09433782026171685,
|
|
"num_tokens": 1267216.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 6.962690448760986,
|
|
"epoch": 0.057971014492753624,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.8182,
|
|
"mean_token_accuracy": 0.09524153247475624,
|
|
"num_tokens": 1277210.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 6.910012054443359,
|
|
"epoch": 0.05839109430791851,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.7268,
|
|
"mean_token_accuracy": 0.09480128362774849,
|
|
"num_tokens": 1285310.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.9359142780303955,
|
|
"epoch": 0.058811174123083385,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.7418,
|
|
"mean_token_accuracy": 0.09830545634031296,
|
|
"num_tokens": 1294421.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 6.773298215866089,
|
|
"epoch": 0.05923125393824827,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.5648,
|
|
"mean_token_accuracy": 0.10509093776345253,
|
|
"num_tokens": 1303281.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.848818397521972,
|
|
"epoch": 0.059651333753413145,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.7413,
|
|
"mean_token_accuracy": 0.10247144997119903,
|
|
"num_tokens": 1312280.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.792526483535767,
|
|
"epoch": 0.06007141356857803,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.703,
|
|
"mean_token_accuracy": 0.09476525709033012,
|
|
"num_tokens": 1321243.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.8667539119720455,
|
|
"epoch": 0.06049149338374291,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.8092,
|
|
"mean_token_accuracy": 0.10024766996502876,
|
|
"num_tokens": 1330324.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.874475002288818,
|
|
"epoch": 0.06091157319890779,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.6476,
|
|
"mean_token_accuracy": 0.10230677276849746,
|
|
"num_tokens": 1339485.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.930787801742554,
|
|
"epoch": 0.06133165301407267,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.8065,
|
|
"mean_token_accuracy": 0.09302590638399125,
|
|
"num_tokens": 1348640.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 6.799437236785889,
|
|
"epoch": 0.06175173282923756,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.6978,
|
|
"mean_token_accuracy": 0.09949951842427254,
|
|
"num_tokens": 1357581.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.888378238677978,
|
|
"epoch": 0.062171812644402434,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.7652,
|
|
"mean_token_accuracy": 0.09876005351543427,
|
|
"num_tokens": 1367883.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 6.812366771697998,
|
|
"epoch": 0.06259189245956731,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.7175,
|
|
"mean_token_accuracy": 0.09678780436515808,
|
|
"num_tokens": 1376936.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.708990812301636,
|
|
"epoch": 0.0630119722747322,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.6402,
|
|
"mean_token_accuracy": 0.09989499375224113,
|
|
"num_tokens": 1386359.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.86722469329834,
|
|
"epoch": 0.06343205208989708,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.6965,
|
|
"mean_token_accuracy": 0.10066593587398528,
|
|
"num_tokens": 1395223.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.944450616836548,
|
|
"epoch": 0.06385213190506196,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.847,
|
|
"mean_token_accuracy": 0.09334802627563477,
|
|
"num_tokens": 1404917.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.823553276062012,
|
|
"epoch": 0.06427221172022685,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.7474,
|
|
"mean_token_accuracy": 0.10658529698848725,
|
|
"num_tokens": 1413348.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.7500804424285885,
|
|
"epoch": 0.06469229153539173,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.7193,
|
|
"mean_token_accuracy": 0.09804128184914589,
|
|
"num_tokens": 1421726.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.822430419921875,
|
|
"epoch": 0.0651123713505566,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.7314,
|
|
"mean_token_accuracy": 0.09830505326390267,
|
|
"num_tokens": 1430686.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.889693403244019,
|
|
"epoch": 0.06553245116572148,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.7193,
|
|
"mean_token_accuracy": 0.1001870684325695,
|
|
"num_tokens": 1439499.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.836849641799927,
|
|
"epoch": 0.06595253098088637,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.7144,
|
|
"mean_token_accuracy": 0.10016432479023933,
|
|
"num_tokens": 1448220.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.703166866302491,
|
|
"epoch": 0.06637261079605125,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.7252,
|
|
"mean_token_accuracy": 0.09049011170864105,
|
|
"num_tokens": 1458217.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.805354738235474,
|
|
"epoch": 0.06679269061121614,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.6229,
|
|
"mean_token_accuracy": 0.0928824745118618,
|
|
"num_tokens": 1467422.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.788901376724243,
|
|
"epoch": 0.06721277042638102,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.6204,
|
|
"mean_token_accuracy": 0.10320913046598434,
|
|
"num_tokens": 1476152.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.731419372558594,
|
|
"epoch": 0.06763285024154589,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.7128,
|
|
"mean_token_accuracy": 0.09539571255445481,
|
|
"num_tokens": 1485248.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.7255181789398195,
|
|
"epoch": 0.06805293005671077,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.6711,
|
|
"mean_token_accuracy": 0.09965705946087837,
|
|
"num_tokens": 1494248.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.825131368637085,
|
|
"epoch": 0.06847300987187566,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.785,
|
|
"mean_token_accuracy": 0.09547284319996834,
|
|
"num_tokens": 1503565.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.932170867919922,
|
|
"epoch": 0.06889308968704054,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.8605,
|
|
"mean_token_accuracy": 0.09502148702740669,
|
|
"num_tokens": 1513227.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.8283134460449215,
|
|
"epoch": 0.06931316950220542,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.6616,
|
|
"mean_token_accuracy": 0.1039304107427597,
|
|
"num_tokens": 1522312.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.6956737518310545,
|
|
"epoch": 0.0697332493173703,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.5989,
|
|
"mean_token_accuracy": 0.10552669763565063,
|
|
"num_tokens": 1531720.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.70291919708252,
|
|
"epoch": 0.07015332913253518,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.7026,
|
|
"mean_token_accuracy": 0.09495449438691139,
|
|
"num_tokens": 1541238.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.867031812667847,
|
|
"epoch": 0.07057340894770006,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.7955,
|
|
"mean_token_accuracy": 0.09560235142707825,
|
|
"num_tokens": 1550875.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.679243516921997,
|
|
"epoch": 0.07099348876286495,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.7373,
|
|
"mean_token_accuracy": 0.10205229669809342,
|
|
"num_tokens": 1560287.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.812178373336792,
|
|
"epoch": 0.07141356857802983,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.6139,
|
|
"mean_token_accuracy": 0.10624400898814201,
|
|
"num_tokens": 1569043.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.66694450378418,
|
|
"epoch": 0.07183364839319471,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.6372,
|
|
"mean_token_accuracy": 0.10226837545633316,
|
|
"num_tokens": 1578112.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.592900228500366,
|
|
"epoch": 0.07225372820835958,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.5542,
|
|
"mean_token_accuracy": 0.10482543483376502,
|
|
"num_tokens": 1586587.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.831333017349243,
|
|
"epoch": 0.07267380802352447,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.7191,
|
|
"mean_token_accuracy": 0.0988001950085163,
|
|
"num_tokens": 1595585.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.7406104564666744,
|
|
"epoch": 0.07309388783868935,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.6715,
|
|
"mean_token_accuracy": 0.1029144361615181,
|
|
"num_tokens": 1605355.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.673774909973145,
|
|
"epoch": 0.07351396765385423,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.7087,
|
|
"mean_token_accuracy": 0.0972638413310051,
|
|
"num_tokens": 1613637.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.780192899703979,
|
|
"epoch": 0.07393404746901912,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.6547,
|
|
"mean_token_accuracy": 0.10374342575669289,
|
|
"num_tokens": 1622731.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.733386611938476,
|
|
"epoch": 0.074354127284184,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.6411,
|
|
"mean_token_accuracy": 0.09785914570093154,
|
|
"num_tokens": 1632098.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.656809377670288,
|
|
"epoch": 0.07477420709934887,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.6333,
|
|
"mean_token_accuracy": 0.09908856153488159,
|
|
"num_tokens": 1641259.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.787235689163208,
|
|
"epoch": 0.07519428691451376,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.7023,
|
|
"mean_token_accuracy": 0.09753435328602791,
|
|
"num_tokens": 1651362.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.644986867904663,
|
|
"epoch": 0.07561436672967864,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.6169,
|
|
"mean_token_accuracy": 0.09910911172628403,
|
|
"num_tokens": 1660190.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.722699403762817,
|
|
"epoch": 0.07603444654484352,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.659,
|
|
"mean_token_accuracy": 0.09519267976284027,
|
|
"num_tokens": 1669020.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.747388315200806,
|
|
"epoch": 0.0764545263600084,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.6775,
|
|
"mean_token_accuracy": 0.10076266825199127,
|
|
"num_tokens": 1678158.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.702866649627685,
|
|
"epoch": 0.07687460617517328,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.6868,
|
|
"mean_token_accuracy": 0.09906790256500245,
|
|
"num_tokens": 1687481.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.647071504592896,
|
|
"epoch": 0.07729468599033816,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.6511,
|
|
"mean_token_accuracy": 0.10402323752641678,
|
|
"num_tokens": 1696782.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.6832818508148195,
|
|
"epoch": 0.07771476580550304,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.6575,
|
|
"mean_token_accuracy": 0.10666462555527687,
|
|
"num_tokens": 1706153.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.698217678070068,
|
|
"epoch": 0.07813484562066793,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.6895,
|
|
"mean_token_accuracy": 0.10017500966787338,
|
|
"num_tokens": 1715585.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.823991441726685,
|
|
"epoch": 0.07855492543583281,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.8005,
|
|
"mean_token_accuracy": 0.09734346494078636,
|
|
"num_tokens": 1724857.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.700028705596924,
|
|
"epoch": 0.0789750052509977,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.6103,
|
|
"mean_token_accuracy": 0.10624456107616424,
|
|
"num_tokens": 1733528.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.742655563354492,
|
|
"epoch": 0.07939508506616257,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.7304,
|
|
"mean_token_accuracy": 0.10352228581905365,
|
|
"num_tokens": 1742953.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.669600582122802,
|
|
"epoch": 0.07981516488132745,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.6746,
|
|
"mean_token_accuracy": 0.10271603912115097,
|
|
"num_tokens": 1752155.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.660818243026734,
|
|
"epoch": 0.08023524469649233,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.5695,
|
|
"mean_token_accuracy": 0.10144439786672592,
|
|
"num_tokens": 1760562.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.623502588272094,
|
|
"epoch": 0.08065532451165722,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.5902,
|
|
"mean_token_accuracy": 0.1015326887369156,
|
|
"num_tokens": 1769631.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.647875261306763,
|
|
"epoch": 0.0810754043268221,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.624,
|
|
"mean_token_accuracy": 0.10202456414699554,
|
|
"num_tokens": 1779080.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.654635858535767,
|
|
"epoch": 0.08149548414198697,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.6146,
|
|
"mean_token_accuracy": 0.10121759623289109,
|
|
"num_tokens": 1787830.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.546731615066529,
|
|
"epoch": 0.08191556395715185,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.5331,
|
|
"mean_token_accuracy": 0.10186785906553268,
|
|
"num_tokens": 1796998.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.6796527862548825,
|
|
"epoch": 0.08233564377231674,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.619,
|
|
"mean_token_accuracy": 0.10591355115175247,
|
|
"num_tokens": 1806194.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.40926570892334,
|
|
"epoch": 0.08275572358748162,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.514,
|
|
"mean_token_accuracy": 0.10517977550625801,
|
|
"num_tokens": 1815751.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.57440676689148,
|
|
"epoch": 0.0831758034026465,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.5942,
|
|
"mean_token_accuracy": 0.10343918055295945,
|
|
"num_tokens": 1825379.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.637695789337158,
|
|
"epoch": 0.08359588321781139,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.5522,
|
|
"mean_token_accuracy": 0.10346684157848358,
|
|
"num_tokens": 1834158.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.537919807434082,
|
|
"epoch": 0.08401596303297626,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.5098,
|
|
"mean_token_accuracy": 0.10425886288285255,
|
|
"num_tokens": 1842724.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.62498288154602,
|
|
"epoch": 0.08443604284814114,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499999998724557,
|
|
"loss": 6.5288,
|
|
"mean_token_accuracy": 0.10198150128126145,
|
|
"num_tokens": 1852485.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.57701358795166,
|
|
"epoch": 0.08485612266330603,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999999935430703,
|
|
"loss": 6.5545,
|
|
"mean_token_accuracy": 0.11041983366012573,
|
|
"num_tokens": 1861303.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.423639154434204,
|
|
"epoch": 0.08527620247847091,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999999843758243,
|
|
"loss": 6.5428,
|
|
"mean_token_accuracy": 0.11022127270698548,
|
|
"num_tokens": 1870859.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 6.760848808288574,
|
|
"epoch": 0.0856962822936358,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999999712228196,
|
|
"loss": 6.7105,
|
|
"mean_token_accuracy": 0.09618140533566474,
|
|
"num_tokens": 1880295.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.645368003845215,
|
|
"epoch": 0.08611636210880068,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999999540840562,
|
|
"loss": 6.6079,
|
|
"mean_token_accuracy": 0.1056639552116394,
|
|
"num_tokens": 1889193.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.568785905838013,
|
|
"epoch": 0.08653644192396555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999999329595345,
|
|
"loss": 6.7096,
|
|
"mean_token_accuracy": 0.09398577436804771,
|
|
"num_tokens": 1899437.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.708119821548462,
|
|
"epoch": 0.08695652173913043,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999999078492548,
|
|
"loss": 6.5939,
|
|
"mean_token_accuracy": 0.1046712227165699,
|
|
"num_tokens": 1907882.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.493611288070679,
|
|
"epoch": 0.08737660155429532,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999998787532176,
|
|
"loss": 6.5021,
|
|
"mean_token_accuracy": 0.10290396809577942,
|
|
"num_tokens": 1916872.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.608988046646118,
|
|
"epoch": 0.0877966813694602,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999998456714234,
|
|
"loss": 6.675,
|
|
"mean_token_accuracy": 0.10352342054247857,
|
|
"num_tokens": 1926636.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.586896228790283,
|
|
"epoch": 0.08821676118462508,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999998086038729,
|
|
"loss": 6.5742,
|
|
"mean_token_accuracy": 0.10714709535241126,
|
|
"num_tokens": 1935962.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.579021549224853,
|
|
"epoch": 0.08863684099978995,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999997675505665,
|
|
"loss": 6.5514,
|
|
"mean_token_accuracy": 0.10487730801105499,
|
|
"num_tokens": 1944600.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.625632095336914,
|
|
"epoch": 0.08905692081495484,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999997225115052,
|
|
"loss": 6.7269,
|
|
"mean_token_accuracy": 0.10071012005209923,
|
|
"num_tokens": 1954234.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.7796577453613285,
|
|
"epoch": 0.08947700063011972,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999996734866896,
|
|
"loss": 6.683,
|
|
"mean_token_accuracy": 0.09888390973210334,
|
|
"num_tokens": 1964499.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.377533006668091,
|
|
"epoch": 0.0898970804452846,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999996204761206,
|
|
"loss": 6.3832,
|
|
"mean_token_accuracy": 0.11216704472899437,
|
|
"num_tokens": 1973635.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 6.54502387046814,
|
|
"epoch": 0.09031716026044949,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999995634797993,
|
|
"loss": 6.5308,
|
|
"mean_token_accuracy": 0.11021102443337441,
|
|
"num_tokens": 1983509.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.567485332489014,
|
|
"epoch": 0.09073724007561437,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999995024977265,
|
|
"loss": 6.5197,
|
|
"mean_token_accuracy": 0.11247633025050163,
|
|
"num_tokens": 1992336.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.545616102218628,
|
|
"epoch": 0.09115731989077924,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999994375299034,
|
|
"loss": 6.5532,
|
|
"mean_token_accuracy": 0.10819393768906593,
|
|
"num_tokens": 2001931.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.484406518936157,
|
|
"epoch": 0.09157739970594413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499999368576331,
|
|
"loss": 6.4218,
|
|
"mean_token_accuracy": 0.11132358983159066,
|
|
"num_tokens": 2010935.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.49219536781311,
|
|
"epoch": 0.09199747952110901,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999992956370109,
|
|
"loss": 6.4842,
|
|
"mean_token_accuracy": 0.10731736794114113,
|
|
"num_tokens": 2020587.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.410812473297119,
|
|
"epoch": 0.0924175593362739,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499999218711944,
|
|
"loss": 6.5089,
|
|
"mean_token_accuracy": 0.11067400127649307,
|
|
"num_tokens": 2029743.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.581059837341309,
|
|
"epoch": 0.09283763915143878,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999991378011317,
|
|
"loss": 6.5257,
|
|
"mean_token_accuracy": 0.10916591510176658,
|
|
"num_tokens": 2038468.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.456353855133057,
|
|
"epoch": 0.09325771896660366,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999990529045757,
|
|
"loss": 6.4482,
|
|
"mean_token_accuracy": 0.10893432199954986,
|
|
"num_tokens": 2047456.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.627411127090454,
|
|
"epoch": 0.09367779878176853,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999989640222771,
|
|
"loss": 6.7525,
|
|
"mean_token_accuracy": 0.09431043416261672,
|
|
"num_tokens": 2056691.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.684362411499023,
|
|
"epoch": 0.09409787859693342,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499998871154238,
|
|
"loss": 6.5462,
|
|
"mean_token_accuracy": 0.10591837242245675,
|
|
"num_tokens": 2066068.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.578407287597656,
|
|
"epoch": 0.0945179584120983,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999987743004597,
|
|
"loss": 6.4733,
|
|
"mean_token_accuracy": 0.1102992869913578,
|
|
"num_tokens": 2075113.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.506056404113769,
|
|
"epoch": 0.09493803822726318,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999986734609438,
|
|
"loss": 6.6105,
|
|
"mean_token_accuracy": 0.10494827926158905,
|
|
"num_tokens": 2084557.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.6157310009002686,
|
|
"epoch": 0.09535811804242807,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999985686356923,
|
|
"loss": 6.5139,
|
|
"mean_token_accuracy": 0.1062320664525032,
|
|
"num_tokens": 2093424.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.539625740051269,
|
|
"epoch": 0.09577819785759294,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499998459824707,
|
|
"loss": 6.6346,
|
|
"mean_token_accuracy": 0.10304314494132996,
|
|
"num_tokens": 2103066.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.53157410621643,
|
|
"epoch": 0.09619827767275782,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00049999834702799,
|
|
"loss": 6.5013,
|
|
"mean_token_accuracy": 0.10883507803082466,
|
|
"num_tokens": 2112447.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.507535743713379,
|
|
"epoch": 0.0966183574879227,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999982302455431,
|
|
"loss": 6.5269,
|
|
"mean_token_accuracy": 0.11191204637289047,
|
|
"num_tokens": 2121949.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.507864904403687,
|
|
"epoch": 0.09703843730308759,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999981094773683,
|
|
"loss": 6.4328,
|
|
"mean_token_accuracy": 0.11216317638754844,
|
|
"num_tokens": 2130464.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.520567464828491,
|
|
"epoch": 0.09745851711825247,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499997984723468,
|
|
"loss": 6.5942,
|
|
"mean_token_accuracy": 0.10294081419706344,
|
|
"num_tokens": 2139577.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.288797092437744,
|
|
"epoch": 0.09787859693341736,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999978559838441,
|
|
"loss": 6.3204,
|
|
"mean_token_accuracy": 0.11208199337124825,
|
|
"num_tokens": 2147919.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.472030353546143,
|
|
"epoch": 0.09829867674858223,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999977232584991,
|
|
"loss": 6.4949,
|
|
"mean_token_accuracy": 0.10832359045743942,
|
|
"num_tokens": 2156936.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.558899450302124,
|
|
"epoch": 0.09871875656374711,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999975865474354,
|
|
"loss": 6.5512,
|
|
"mean_token_accuracy": 0.10766256302595138,
|
|
"num_tokens": 2165362.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.469175338745117,
|
|
"epoch": 0.099138836378912,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999974458506551,
|
|
"loss": 6.4643,
|
|
"mean_token_accuracy": 0.10836688205599784,
|
|
"num_tokens": 2173665.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.551422071456909,
|
|
"epoch": 0.09955891619407688,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499997301168161,
|
|
"loss": 6.4532,
|
|
"mean_token_accuracy": 0.11138271391391755,
|
|
"num_tokens": 2182222.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.531885147094727,
|
|
"epoch": 0.09997899600924176,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999971524999556,
|
|
"loss": 6.5228,
|
|
"mean_token_accuracy": 0.11111016869544983,
|
|
"num_tokens": 2192358.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.534890985488891,
|
|
"epoch": 0.10039907582440663,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999969998460414,
|
|
"loss": 6.5355,
|
|
"mean_token_accuracy": 0.10454710125923157,
|
|
"num_tokens": 2201889.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.433488464355468,
|
|
"epoch": 0.10081915563957151,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004999968432064213,
|
|
"loss": 6.5322,
|
|
"mean_token_accuracy": 0.1198379322886467,
|
|
"num_tokens": 2211810.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.474250078201294,
|
|
"epoch": 0.1012392354547364,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999966825810979,
|
|
"loss": 6.4684,
|
|
"mean_token_accuracy": 0.10700508952140808,
|
|
"num_tokens": 2221123.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.384520959854126,
|
|
"epoch": 0.10165931526990128,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999965179700742,
|
|
"loss": 6.3986,
|
|
"mean_token_accuracy": 0.11781087368726731,
|
|
"num_tokens": 2230129.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.4176534652709964,
|
|
"epoch": 0.10207939508506617,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499996349373353,
|
|
"loss": 6.4609,
|
|
"mean_token_accuracy": 0.10817519575357437,
|
|
"num_tokens": 2239929.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.5110820770263675,
|
|
"epoch": 0.10249947490023105,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999961767909374,
|
|
"loss": 6.4372,
|
|
"mean_token_accuracy": 0.1148509480059147,
|
|
"num_tokens": 2248078.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.4125104427337645,
|
|
"epoch": 0.10291955471539592,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999960002228303,
|
|
"loss": 6.5274,
|
|
"mean_token_accuracy": 0.10999985039234161,
|
|
"num_tokens": 2256975.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.474673461914063,
|
|
"epoch": 0.1033396345305608,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999958196690349,
|
|
"loss": 6.3849,
|
|
"mean_token_accuracy": 0.11320202201604843,
|
|
"num_tokens": 2265797.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.479385900497436,
|
|
"epoch": 0.10375971434572569,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999956351295545,
|
|
"loss": 6.4946,
|
|
"mean_token_accuracy": 0.11450825035572051,
|
|
"num_tokens": 2274099.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.3540520668029785,
|
|
"epoch": 0.10417979416089057,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999954466043922,
|
|
"loss": 6.3917,
|
|
"mean_token_accuracy": 0.11258968263864517,
|
|
"num_tokens": 2282360.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.481705999374389,
|
|
"epoch": 0.10459987397605545,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999952540935514,
|
|
"loss": 6.5009,
|
|
"mean_token_accuracy": 0.10285271480679511,
|
|
"num_tokens": 2292714.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.455303287506103,
|
|
"epoch": 0.10501995379122034,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999950575970356,
|
|
"loss": 6.426,
|
|
"mean_token_accuracy": 0.11442826837301254,
|
|
"num_tokens": 2301633.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.465747499465943,
|
|
"epoch": 0.10544003360638521,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999948571148482,
|
|
"loss": 6.4138,
|
|
"mean_token_accuracy": 0.11426257789134979,
|
|
"num_tokens": 2310067.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.466140460968018,
|
|
"epoch": 0.10586011342155009,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999946526469927,
|
|
"loss": 6.4932,
|
|
"mean_token_accuracy": 0.11244904398918151,
|
|
"num_tokens": 2320090.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.438083505630493,
|
|
"epoch": 0.10628019323671498,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999944441934728,
|
|
"loss": 6.4509,
|
|
"mean_token_accuracy": 0.11593573912978172,
|
|
"num_tokens": 2329255.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.467304992675781,
|
|
"epoch": 0.10670027305187986,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999942317542922,
|
|
"loss": 6.5481,
|
|
"mean_token_accuracy": 0.10965899974107743,
|
|
"num_tokens": 2339535.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.434674501419067,
|
|
"epoch": 0.10712035286704474,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999940153294546,
|
|
"loss": 6.4448,
|
|
"mean_token_accuracy": 0.11061845496296882,
|
|
"num_tokens": 2348948.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.447847843170166,
|
|
"epoch": 0.10754043268220961,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499993794918964,
|
|
"loss": 6.4628,
|
|
"mean_token_accuracy": 0.10641181394457817,
|
|
"num_tokens": 2359141.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.401166343688965,
|
|
"epoch": 0.1079605124973745,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999935705228241,
|
|
"loss": 6.5084,
|
|
"mean_token_accuracy": 0.1094856470823288,
|
|
"num_tokens": 2368906.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.554097080230713,
|
|
"epoch": 0.10838059231253938,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999933421410389,
|
|
"loss": 6.4839,
|
|
"mean_token_accuracy": 0.11065066531300545,
|
|
"num_tokens": 2377029.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.5027672290802006,
|
|
"epoch": 0.10880067212770426,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999931097736125,
|
|
"loss": 6.5541,
|
|
"mean_token_accuracy": 0.10604767650365829,
|
|
"num_tokens": 2387088.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.470385646820068,
|
|
"epoch": 0.10922075194286915,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999928734205492,
|
|
"loss": 6.4468,
|
|
"mean_token_accuracy": 0.11056585833430291,
|
|
"num_tokens": 2395596.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.403819370269775,
|
|
"epoch": 0.10964083175803403,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999926330818528,
|
|
"loss": 6.4393,
|
|
"mean_token_accuracy": 0.11377019882202148,
|
|
"num_tokens": 2404506.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.469174242019653,
|
|
"epoch": 0.1100609115731989,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999923887575278,
|
|
"loss": 6.4777,
|
|
"mean_token_accuracy": 0.11094499379396439,
|
|
"num_tokens": 2414342.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.476234006881714,
|
|
"epoch": 0.11048099138836379,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999921404475785,
|
|
"loss": 6.4422,
|
|
"mean_token_accuracy": 0.11336205825209618,
|
|
"num_tokens": 2423076.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.415568065643311,
|
|
"epoch": 0.11090107120352867,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999918881520093,
|
|
"loss": 6.391,
|
|
"mean_token_accuracy": 0.11621783077716827,
|
|
"num_tokens": 2432492.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.362053871154785,
|
|
"epoch": 0.11132115101869355,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999916318708246,
|
|
"loss": 6.354,
|
|
"mean_token_accuracy": 0.11400164812803268,
|
|
"num_tokens": 2441916.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.406490755081177,
|
|
"epoch": 0.11174123083385844,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999913716040291,
|
|
"loss": 6.4072,
|
|
"mean_token_accuracy": 0.11762610748410225,
|
|
"num_tokens": 2450932.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.336502504348755,
|
|
"epoch": 0.11216131064902331,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999911073516272,
|
|
"loss": 6.4319,
|
|
"mean_token_accuracy": 0.11254018545150757,
|
|
"num_tokens": 2460058.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.392711496353149,
|
|
"epoch": 0.11258139046418819,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999908391136237,
|
|
"loss": 6.3569,
|
|
"mean_token_accuracy": 0.11563631743192673,
|
|
"num_tokens": 2469607.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.441662883758545,
|
|
"epoch": 0.11300147027935308,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999905668900234,
|
|
"loss": 6.4002,
|
|
"mean_token_accuracy": 0.11395884156227112,
|
|
"num_tokens": 2478345.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.438292360305786,
|
|
"epoch": 0.11342155009451796,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499990290680831,
|
|
"loss": 6.3261,
|
|
"mean_token_accuracy": 0.11877992302179337,
|
|
"num_tokens": 2486662.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.379430055618286,
|
|
"epoch": 0.11384162990968284,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999900104860516,
|
|
"loss": 6.472,
|
|
"mean_token_accuracy": 0.11443257331848145,
|
|
"num_tokens": 2495392.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.437303638458252,
|
|
"epoch": 0.11426170972484773,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999897263056898,
|
|
"loss": 6.4969,
|
|
"mean_token_accuracy": 0.10801200717687606,
|
|
"num_tokens": 2505254.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.457095766067505,
|
|
"epoch": 0.1146817895400126,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499989438139751,
|
|
"loss": 6.3155,
|
|
"mean_token_accuracy": 0.11900854557752609,
|
|
"num_tokens": 2514096.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.339952230453491,
|
|
"epoch": 0.11510186935517748,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999891459882401,
|
|
"loss": 6.3262,
|
|
"mean_token_accuracy": 0.1178194098174572,
|
|
"num_tokens": 2523635.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.318808507919312,
|
|
"epoch": 0.11552194917034236,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999888498511624,
|
|
"loss": 6.3954,
|
|
"mean_token_accuracy": 0.11501155719161034,
|
|
"num_tokens": 2532528.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.366592121124268,
|
|
"epoch": 0.11594202898550725,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999885497285229,
|
|
"loss": 6.307,
|
|
"mean_token_accuracy": 0.11583952903747559,
|
|
"num_tokens": 2541893.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.354608488082886,
|
|
"epoch": 0.11636210880067213,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999882456203273,
|
|
"loss": 6.3581,
|
|
"mean_token_accuracy": 0.11632645949721336,
|
|
"num_tokens": 2551551.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.349077987670898,
|
|
"epoch": 0.11678218861583702,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999879375265806,
|
|
"loss": 6.3146,
|
|
"mean_token_accuracy": 0.1158558964729309,
|
|
"num_tokens": 2560183.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.344199848175049,
|
|
"epoch": 0.11720226843100189,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999876254472886,
|
|
"loss": 6.1959,
|
|
"mean_token_accuracy": 0.12459081262350083,
|
|
"num_tokens": 2568697.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.348653078079224,
|
|
"epoch": 0.11762234824616677,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999873093824565,
|
|
"loss": 6.4194,
|
|
"mean_token_accuracy": 0.11410524025559425,
|
|
"num_tokens": 2578151.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.50674262046814,
|
|
"epoch": 0.11804242806133165,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999869893320902,
|
|
"loss": 6.5289,
|
|
"mean_token_accuracy": 0.1147321492433548,
|
|
"num_tokens": 2585901.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.338491153717041,
|
|
"epoch": 0.11846250787649654,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999866652961952,
|
|
"loss": 6.3629,
|
|
"mean_token_accuracy": 0.11298267319798469,
|
|
"num_tokens": 2595655.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.389230489730835,
|
|
"epoch": 0.11888258769166142,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999863372747773,
|
|
"loss": 6.3335,
|
|
"mean_token_accuracy": 0.11225836053490638,
|
|
"num_tokens": 2604949.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.439256811141968,
|
|
"epoch": 0.11930266750682629,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999860052678423,
|
|
"loss": 6.3989,
|
|
"mean_token_accuracy": 0.11546840667724609,
|
|
"num_tokens": 2614260.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.299542999267578,
|
|
"epoch": 0.11972274732199117,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999856692753959,
|
|
"loss": 6.3905,
|
|
"mean_token_accuracy": 0.11243033632636071,
|
|
"num_tokens": 2623740.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.37091474533081,
|
|
"epoch": 0.12014282713715606,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999853292974444,
|
|
"loss": 6.2964,
|
|
"mean_token_accuracy": 0.1178373210132122,
|
|
"num_tokens": 2631998.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.372178649902343,
|
|
"epoch": 0.12056290695232094,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999849853339936,
|
|
"loss": 6.4358,
|
|
"mean_token_accuracy": 0.11526904925704003,
|
|
"num_tokens": 2641169.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.44800329208374,
|
|
"epoch": 0.12098298676748583,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004999846373850497,
|
|
"loss": 6.2945,
|
|
"mean_token_accuracy": 0.11855239495635032,
|
|
"num_tokens": 2650576.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.257949161529541,
|
|
"epoch": 0.12140306658265071,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999842854506186,
|
|
"loss": 6.3807,
|
|
"mean_token_accuracy": 0.11334980726242065,
|
|
"num_tokens": 2660817.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.38723406791687,
|
|
"epoch": 0.12182314639781558,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999839295307069,
|
|
"loss": 6.3212,
|
|
"mean_token_accuracy": 0.11455826535820961,
|
|
"num_tokens": 2669338.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.404263877868653,
|
|
"epoch": 0.12224322621298046,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999835696253206,
|
|
"loss": 6.3789,
|
|
"mean_token_accuracy": 0.11618088632822036,
|
|
"num_tokens": 2679108.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.435732698440551,
|
|
"epoch": 0.12266330602814535,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999832057344664,
|
|
"loss": 6.3325,
|
|
"mean_token_accuracy": 0.1142914392054081,
|
|
"num_tokens": 2688126.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.152384519577026,
|
|
"epoch": 0.12308338584331023,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999828378581504,
|
|
"loss": 6.3063,
|
|
"mean_token_accuracy": 0.12400648295879364,
|
|
"num_tokens": 2697245.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.425075197219849,
|
|
"epoch": 0.12350346565847511,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999824659963793,
|
|
"loss": 6.3465,
|
|
"mean_token_accuracy": 0.1198640413582325,
|
|
"num_tokens": 2705934.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.265953540802002,
|
|
"epoch": 0.12392354547364,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999820901491598,
|
|
"loss": 6.2796,
|
|
"mean_token_accuracy": 0.12351771965622901,
|
|
"num_tokens": 2714367.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.334036827087402,
|
|
"epoch": 0.12434362528880487,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999817103164983,
|
|
"loss": 6.3413,
|
|
"mean_token_accuracy": 0.11931266412138938,
|
|
"num_tokens": 2724366.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.360864496231079,
|
|
"epoch": 0.12476370510396975,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999813264984017,
|
|
"loss": 6.3448,
|
|
"mean_token_accuracy": 0.11467731669545174,
|
|
"num_tokens": 2733980.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.366592979431152,
|
|
"epoch": 0.12518378491913462,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999809386948767,
|
|
"loss": 6.3342,
|
|
"mean_token_accuracy": 0.12208072617650031,
|
|
"num_tokens": 2744013.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.299022817611695,
|
|
"epoch": 0.12560386473429952,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999805469059302,
|
|
"loss": 6.4186,
|
|
"mean_token_accuracy": 0.11027913689613342,
|
|
"num_tokens": 2753385.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.366168975830078,
|
|
"epoch": 0.1260239445494644,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999801511315693,
|
|
"loss": 6.256,
|
|
"mean_token_accuracy": 0.11804210916161537,
|
|
"num_tokens": 2762875.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.342552661895752,
|
|
"epoch": 0.1264440243646293,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999797513718007,
|
|
"loss": 6.3108,
|
|
"mean_token_accuracy": 0.12443676739931106,
|
|
"num_tokens": 2772182.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.206664896011352,
|
|
"epoch": 0.12686410417979416,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999793476266317,
|
|
"loss": 6.2711,
|
|
"mean_token_accuracy": 0.12031201645731926,
|
|
"num_tokens": 2780814.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.639998197555542,
|
|
"epoch": 0.12728418399495905,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999789398960695,
|
|
"loss": 6.5474,
|
|
"mean_token_accuracy": 0.1183062419295311,
|
|
"num_tokens": 2791104.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.19776029586792,
|
|
"epoch": 0.12770426381012392,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999785281801212,
|
|
"loss": 6.256,
|
|
"mean_token_accuracy": 0.11993122175335884,
|
|
"num_tokens": 2800081.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.334916496276856,
|
|
"epoch": 0.1281243436252888,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499978112478794,
|
|
"loss": 6.3835,
|
|
"mean_token_accuracy": 0.11843734234571457,
|
|
"num_tokens": 2809096.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.403998374938965,
|
|
"epoch": 0.1285444234404537,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999776927920955,
|
|
"loss": 6.3545,
|
|
"mean_token_accuracy": 0.12085104510188102,
|
|
"num_tokens": 2818857.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.3299469470977785,
|
|
"epoch": 0.12896450325561856,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499977269120033,
|
|
"loss": 6.4167,
|
|
"mean_token_accuracy": 0.11449578031897545,
|
|
"num_tokens": 2829332.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.3263038158416744,
|
|
"epoch": 0.12938458307078346,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499976841462614,
|
|
"loss": 6.3436,
|
|
"mean_token_accuracy": 0.11686776131391526,
|
|
"num_tokens": 2839193.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.397625589370728,
|
|
"epoch": 0.12980466288594833,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000499976409819846,
|
|
"loss": 6.3117,
|
|
"mean_token_accuracy": 0.11800177842378616,
|
|
"num_tokens": 2848535.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.116656970977783,
|
|
"epoch": 0.1302247427011132,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999759741917369,
|
|
"loss": 6.2278,
|
|
"mean_token_accuracy": 0.12729543596506118,
|
|
"num_tokens": 2858090.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.364631414413452,
|
|
"epoch": 0.1306448225162781,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999755345782941,
|
|
"loss": 6.378,
|
|
"mean_token_accuracy": 0.11326263695955277,
|
|
"num_tokens": 2866984.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.246821451187134,
|
|
"epoch": 0.13106490233144297,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999750909795256,
|
|
"loss": 6.1885,
|
|
"mean_token_accuracy": 0.1256905347108841,
|
|
"num_tokens": 2876550.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.341800737380981,
|
|
"epoch": 0.13148498214660786,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999746433954394,
|
|
"loss": 6.286,
|
|
"mean_token_accuracy": 0.12146776840090752,
|
|
"num_tokens": 2885782.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.275845241546631,
|
|
"epoch": 0.13190506196177273,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499974191826043,
|
|
"loss": 6.2653,
|
|
"mean_token_accuracy": 0.13301032781600952,
|
|
"num_tokens": 2894807.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 6.351547765731811,
|
|
"epoch": 0.1323251417769376,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999737362713448,
|
|
"loss": 6.304,
|
|
"mean_token_accuracy": 0.12145641520619392,
|
|
"num_tokens": 2904076.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.267245769500732,
|
|
"epoch": 0.1327452215921025,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999732767313527,
|
|
"loss": 6.2029,
|
|
"mean_token_accuracy": 0.12209122702479362,
|
|
"num_tokens": 2913761.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.383308267593383,
|
|
"epoch": 0.13316530140726737,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999728132060746,
|
|
"loss": 6.439,
|
|
"mean_token_accuracy": 0.12098384723067283,
|
|
"num_tokens": 2922848.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.364631271362304,
|
|
"epoch": 0.13358538122243227,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999723456955192,
|
|
"loss": 6.3245,
|
|
"mean_token_accuracy": 0.11949731931090354,
|
|
"num_tokens": 2932718.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.2494594097137455,
|
|
"epoch": 0.13400546103759714,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999718741996945,
|
|
"loss": 6.2837,
|
|
"mean_token_accuracy": 0.12003797963261605,
|
|
"num_tokens": 2942686.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.2547472476959225,
|
|
"epoch": 0.13442554085276204,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499971398718609,
|
|
"loss": 6.2407,
|
|
"mean_token_accuracy": 0.1179835021495819,
|
|
"num_tokens": 2952096.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.3157384395599365,
|
|
"epoch": 0.1348456206679269,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999709192522708,
|
|
"loss": 6.3129,
|
|
"mean_token_accuracy": 0.12474863901734352,
|
|
"num_tokens": 2960660.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.379588079452515,
|
|
"epoch": 0.13526570048309178,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999704358006887,
|
|
"loss": 6.3158,
|
|
"mean_token_accuracy": 0.11744728311896324,
|
|
"num_tokens": 2969834.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.285486459732056,
|
|
"epoch": 0.13568578029825668,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999699483638712,
|
|
"loss": 6.311,
|
|
"mean_token_accuracy": 0.12142582982778549,
|
|
"num_tokens": 2979023.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.294291210174561,
|
|
"epoch": 0.13610586011342155,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999694569418269,
|
|
"loss": 6.3063,
|
|
"mean_token_accuracy": 0.12201808094978332,
|
|
"num_tokens": 2988083.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.2657451152801515,
|
|
"epoch": 0.13652593992858644,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999689615345645,
|
|
"loss": 6.2388,
|
|
"mean_token_accuracy": 0.1231310561299324,
|
|
"num_tokens": 2997240.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.308252573013306,
|
|
"epoch": 0.1369460197437513,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999684621420928,
|
|
"loss": 6.3111,
|
|
"mean_token_accuracy": 0.1184695117175579,
|
|
"num_tokens": 3007077.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.319302654266357,
|
|
"epoch": 0.13736609955891618,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999679587644205,
|
|
"loss": 6.3497,
|
|
"mean_token_accuracy": 0.11671060770750045,
|
|
"num_tokens": 3015821.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.236631298065186,
|
|
"epoch": 0.13778617937408108,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999674514015568,
|
|
"loss": 6.2724,
|
|
"mean_token_accuracy": 0.11908711194992065,
|
|
"num_tokens": 3025858.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.3658030986785885,
|
|
"epoch": 0.13820625918924595,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999669400535105,
|
|
"loss": 6.2416,
|
|
"mean_token_accuracy": 0.11343135982751847,
|
|
"num_tokens": 3035537.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.147812271118164,
|
|
"epoch": 0.13862633900441085,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999664247202907,
|
|
"loss": 6.1617,
|
|
"mean_token_accuracy": 0.11974595785140991,
|
|
"num_tokens": 3044204.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.327428913116455,
|
|
"epoch": 0.13904641881957572,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999659054019066,
|
|
"loss": 6.3345,
|
|
"mean_token_accuracy": 0.11974811106920243,
|
|
"num_tokens": 3053111.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.258665418624878,
|
|
"epoch": 0.1394664986347406,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999653820983673,
|
|
"loss": 6.2415,
|
|
"mean_token_accuracy": 0.12036412507295609,
|
|
"num_tokens": 3062456.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.2644579887390135,
|
|
"epoch": 0.13988657844990549,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499964854809682,
|
|
"loss": 6.2627,
|
|
"mean_token_accuracy": 0.12668107002973555,
|
|
"num_tokens": 3071132.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.261227464675903,
|
|
"epoch": 0.14030665826507036,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999643235358602,
|
|
"loss": 6.222,
|
|
"mean_token_accuracy": 0.125965429097414,
|
|
"num_tokens": 3080892.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.215318775177002,
|
|
"epoch": 0.14072673808023525,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999637882769112,
|
|
"loss": 6.1526,
|
|
"mean_token_accuracy": 0.12532262802124022,
|
|
"num_tokens": 3089874.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.308867406845093,
|
|
"epoch": 0.14114681789540012,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999632490328447,
|
|
"loss": 6.3008,
|
|
"mean_token_accuracy": 0.12098695039749145,
|
|
"num_tokens": 3099535.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.281496620178222,
|
|
"epoch": 0.14156689771056502,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999627058036699,
|
|
"loss": 6.2552,
|
|
"mean_token_accuracy": 0.12044425159692765,
|
|
"num_tokens": 3108772.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.311051607131958,
|
|
"epoch": 0.1419869775257299,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999621585893966,
|
|
"loss": 6.2799,
|
|
"mean_token_accuracy": 0.11901640743017197,
|
|
"num_tokens": 3118333.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.305313062667847,
|
|
"epoch": 0.14240705734089476,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999616073900346,
|
|
"loss": 6.3091,
|
|
"mean_token_accuracy": 0.12129790410399437,
|
|
"num_tokens": 3127356.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.2683678150177,
|
|
"epoch": 0.14282713715605966,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999610522055935,
|
|
"loss": 6.2794,
|
|
"mean_token_accuracy": 0.11691329404711723,
|
|
"num_tokens": 3136859.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.303126668930053,
|
|
"epoch": 0.14324721697122453,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999604930360832,
|
|
"loss": 6.304,
|
|
"mean_token_accuracy": 0.11767303720116615,
|
|
"num_tokens": 3146607.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.214645338058472,
|
|
"epoch": 0.14366729678638943,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999599298815136,
|
|
"loss": 6.2515,
|
|
"mean_token_accuracy": 0.12662419229745864,
|
|
"num_tokens": 3156327.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.21446213722229,
|
|
"epoch": 0.1440873766015543,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004999593627418947,
|
|
"loss": 6.2009,
|
|
"mean_token_accuracy": 0.1281860999763012,
|
|
"num_tokens": 3165559.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.299745416641235,
|
|
"epoch": 0.14450745641671917,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999587916172365,
|
|
"loss": 6.2848,
|
|
"mean_token_accuracy": 0.11663243547081947,
|
|
"num_tokens": 3173850.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.324022483825684,
|
|
"epoch": 0.14492753623188406,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999582165075492,
|
|
"loss": 6.2353,
|
|
"mean_token_accuracy": 0.11788406521081925,
|
|
"num_tokens": 3182838.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.144151782989502,
|
|
"epoch": 0.14534761604704893,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999576374128429,
|
|
"loss": 6.2299,
|
|
"mean_token_accuracy": 0.1223968394100666,
|
|
"num_tokens": 3191692.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.343899536132812,
|
|
"epoch": 0.14576769586221383,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999570543331279,
|
|
"loss": 6.2507,
|
|
"mean_token_accuracy": 0.12281694263219833,
|
|
"num_tokens": 3200069.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.2878196239471436,
|
|
"epoch": 0.1461877756773787,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999564672684145,
|
|
"loss": 6.3406,
|
|
"mean_token_accuracy": 0.11862553879618645,
|
|
"num_tokens": 3209653.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.361492061614991,
|
|
"epoch": 0.14660785549254357,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999558762187131,
|
|
"loss": 6.2041,
|
|
"mean_token_accuracy": 0.12774061411619186,
|
|
"num_tokens": 3218313.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.146276044845581,
|
|
"epoch": 0.14702793530770847,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999552811840342,
|
|
"loss": 6.1521,
|
|
"mean_token_accuracy": 0.1273271396756172,
|
|
"num_tokens": 3227525.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.241751718521118,
|
|
"epoch": 0.14744801512287334,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999546821643884,
|
|
"loss": 6.2657,
|
|
"mean_token_accuracy": 0.121260417252779,
|
|
"num_tokens": 3237022.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.169715499877929,
|
|
"epoch": 0.14786809493803824,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999540791597861,
|
|
"loss": 6.156,
|
|
"mean_token_accuracy": 0.12248859778046609,
|
|
"num_tokens": 3246605.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.1003180027008055,
|
|
"epoch": 0.1482881747532031,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999534721702383,
|
|
"loss": 6.1054,
|
|
"mean_token_accuracy": 0.12855856791138648,
|
|
"num_tokens": 3255587.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.226248407363892,
|
|
"epoch": 0.148708254568368,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999528611957553,
|
|
"loss": 6.2171,
|
|
"mean_token_accuracy": 0.12187446802854537,
|
|
"num_tokens": 3265669.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 6.278449535369873,
|
|
"epoch": 0.14912833438353287,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999522462363485,
|
|
"loss": 6.1919,
|
|
"mean_token_accuracy": 0.1278035633265972,
|
|
"num_tokens": 3275013.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.265809679031372,
|
|
"epoch": 0.14954841419869774,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999516272920283,
|
|
"loss": 6.311,
|
|
"mean_token_accuracy": 0.1240921102464199,
|
|
"num_tokens": 3284723.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.131893539428711,
|
|
"epoch": 0.14996849401386264,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499951004362806,
|
|
"loss": 6.1325,
|
|
"mean_token_accuracy": 0.12936908155679702,
|
|
"num_tokens": 3293860.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 6.151740789413452,
|
|
"epoch": 0.1503885738290275,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999503774486924,
|
|
"loss": 6.1833,
|
|
"mean_token_accuracy": 0.12577988132834433,
|
|
"num_tokens": 3303158.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.184361696243286,
|
|
"epoch": 0.1508086536441924,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999497465496987,
|
|
"loss": 6.1137,
|
|
"mean_token_accuracy": 0.11985947787761689,
|
|
"num_tokens": 3313068.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.191692352294922,
|
|
"epoch": 0.15122873345935728,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499949111665836,
|
|
"loss": 6.2033,
|
|
"mean_token_accuracy": 0.12312208265066146,
|
|
"num_tokens": 3321885.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.25971827507019,
|
|
"epoch": 0.15164881327452215,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999484727971158,
|
|
"loss": 6.1858,
|
|
"mean_token_accuracy": 0.12474783286452293,
|
|
"num_tokens": 3330924.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.176667261123657,
|
|
"epoch": 0.15206889308968705,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499947829943549,
|
|
"loss": 6.2248,
|
|
"mean_token_accuracy": 0.12161886692047119,
|
|
"num_tokens": 3340070.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.295008039474487,
|
|
"epoch": 0.15248897290485192,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999471831051474,
|
|
"loss": 6.213,
|
|
"mean_token_accuracy": 0.13358828723430632,
|
|
"num_tokens": 3349870.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.278341436386109,
|
|
"epoch": 0.1529090527200168,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999465322819222,
|
|
"loss": 6.2576,
|
|
"mean_token_accuracy": 0.11560158357024193,
|
|
"num_tokens": 3359573.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.279096603393555,
|
|
"epoch": 0.15332913253518168,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999458774738851,
|
|
"loss": 6.1999,
|
|
"mean_token_accuracy": 0.13126230910420417,
|
|
"num_tokens": 3368577.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.1456389904022215,
|
|
"epoch": 0.15374921235034655,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999452186810476,
|
|
"loss": 6.1662,
|
|
"mean_token_accuracy": 0.12922282814979552,
|
|
"num_tokens": 3377801.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.282723903656006,
|
|
"epoch": 0.15416929216551145,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999445559034214,
|
|
"loss": 6.2248,
|
|
"mean_token_accuracy": 0.12709890604019164,
|
|
"num_tokens": 3386666.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.3540504455566404,
|
|
"epoch": 0.15458937198067632,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999438891410181,
|
|
"loss": 6.3599,
|
|
"mean_token_accuracy": 0.12122973501682281,
|
|
"num_tokens": 3396086.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.2125379085540775,
|
|
"epoch": 0.15500945179584122,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999432183938496,
|
|
"loss": 6.2646,
|
|
"mean_token_accuracy": 0.1275039754807949,
|
|
"num_tokens": 3404894.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.214909315109253,
|
|
"epoch": 0.1554295316110061,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999425436619279,
|
|
"loss": 6.2499,
|
|
"mean_token_accuracy": 0.12167986705899239,
|
|
"num_tokens": 3414172.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.310878896713257,
|
|
"epoch": 0.15584961142617096,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499941864945265,
|
|
"loss": 6.2176,
|
|
"mean_token_accuracy": 0.11906537339091301,
|
|
"num_tokens": 3423409.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 6.134654092788696,
|
|
"epoch": 0.15626969124133586,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999411822438726,
|
|
"loss": 6.1799,
|
|
"mean_token_accuracy": 0.12394418343901634,
|
|
"num_tokens": 3433047.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 6.2948554992675785,
|
|
"epoch": 0.15668977105650073,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499940495557763,
|
|
"loss": 6.173,
|
|
"mean_token_accuracy": 0.12352384477853776,
|
|
"num_tokens": 3442490.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.233772277832031,
|
|
"epoch": 0.15710985087166562,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999398048869485,
|
|
"loss": 6.2356,
|
|
"mean_token_accuracy": 0.1239772841334343,
|
|
"num_tokens": 3451804.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.296554517745972,
|
|
"epoch": 0.1575299306868305,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499939110231441,
|
|
"loss": 6.2223,
|
|
"mean_token_accuracy": 0.12610766440629959,
|
|
"num_tokens": 3461481.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.218039226531983,
|
|
"epoch": 0.1579500105019954,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999384115912531,
|
|
"loss": 6.2673,
|
|
"mean_token_accuracy": 0.1208581991493702,
|
|
"num_tokens": 3471798.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.088755655288696,
|
|
"epoch": 0.15837009031716026,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499937708966397,
|
|
"loss": 6.1755,
|
|
"mean_token_accuracy": 0.12277546525001526,
|
|
"num_tokens": 3481386.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.257310009002685,
|
|
"epoch": 0.15879017013232513,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999370023568853,
|
|
"loss": 6.1643,
|
|
"mean_token_accuracy": 0.12328559309244155,
|
|
"num_tokens": 3489981.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 6.140112638473511,
|
|
"epoch": 0.15921024994749003,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999362917627304,
|
|
"loss": 6.1438,
|
|
"mean_token_accuracy": 0.12805134281516076,
|
|
"num_tokens": 3498551.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.224145746231079,
|
|
"epoch": 0.1596303297626549,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999355771839448,
|
|
"loss": 6.1267,
|
|
"mean_token_accuracy": 0.1276252895593643,
|
|
"num_tokens": 3507921.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 6.316604804992676,
|
|
"epoch": 0.1600504095778198,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999348586205414,
|
|
"loss": 6.2984,
|
|
"mean_token_accuracy": 0.12361158952116966,
|
|
"num_tokens": 3517570.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.265382909774781,
|
|
"epoch": 0.16047048939298467,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999341360725327,
|
|
"loss": 6.2786,
|
|
"mean_token_accuracy": 0.11925147697329522,
|
|
"num_tokens": 3526774.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.244428873062134,
|
|
"epoch": 0.16089056920814954,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999334095399317,
|
|
"loss": 6.2167,
|
|
"mean_token_accuracy": 0.1289656363427639,
|
|
"num_tokens": 3535319.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.091944026947021,
|
|
"epoch": 0.16131064902331443,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999326790227512,
|
|
"loss": 6.1819,
|
|
"mean_token_accuracy": 0.12599623277783395,
|
|
"num_tokens": 3544468.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 6.069698667526245,
|
|
"epoch": 0.1617307288384793,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999319445210041,
|
|
"loss": 6.0574,
|
|
"mean_token_accuracy": 0.13135963827371597,
|
|
"num_tokens": 3553529.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.176232147216797,
|
|
"epoch": 0.1621508086536442,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999312060347034,
|
|
"loss": 6.1206,
|
|
"mean_token_accuracy": 0.12521466836333275,
|
|
"num_tokens": 3563053.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.155474901199341,
|
|
"epoch": 0.16257088846880907,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999304635638621,
|
|
"loss": 6.0713,
|
|
"mean_token_accuracy": 0.13156753256917,
|
|
"num_tokens": 3571877.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.117454576492309,
|
|
"epoch": 0.16299096828397394,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999297171084935,
|
|
"loss": 6.1211,
|
|
"mean_token_accuracy": 0.12843042388558387,
|
|
"num_tokens": 3581496.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.246276712417602,
|
|
"epoch": 0.16341104809913884,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999289666686109,
|
|
"loss": 6.1408,
|
|
"mean_token_accuracy": 0.12944318503141403,
|
|
"num_tokens": 3590752.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.026504850387573,
|
|
"epoch": 0.1638311279143037,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999282122442274,
|
|
"loss": 6.1427,
|
|
"mean_token_accuracy": 0.12940528690814973,
|
|
"num_tokens": 3599885.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 6.306515789031982,
|
|
"epoch": 0.1642512077294686,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999274538353564,
|
|
"loss": 6.2127,
|
|
"mean_token_accuracy": 0.12124313414096832,
|
|
"num_tokens": 3610039.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.1400439739227295,
|
|
"epoch": 0.16467128754463348,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999266914420114,
|
|
"loss": 6.1432,
|
|
"mean_token_accuracy": 0.12274663522839546,
|
|
"num_tokens": 3619954.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 6.1886210441589355,
|
|
"epoch": 0.16509136735979837,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499925925064206,
|
|
"loss": 6.0913,
|
|
"mean_token_accuracy": 0.13008279874920844,
|
|
"num_tokens": 3628164.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.256851673126221,
|
|
"epoch": 0.16551144717496324,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999251547019535,
|
|
"loss": 6.2411,
|
|
"mean_token_accuracy": 0.1288958877325058,
|
|
"num_tokens": 3636778.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.259689378738403,
|
|
"epoch": 0.16593152699012811,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999243803552678,
|
|
"loss": 6.2104,
|
|
"mean_token_accuracy": 0.1265132576227188,
|
|
"num_tokens": 3647046.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 6.134534025192261,
|
|
"epoch": 0.166351606805293,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999236020241625,
|
|
"loss": 6.1237,
|
|
"mean_token_accuracy": 0.1289564423263073,
|
|
"num_tokens": 3656130.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 6.189244413375855,
|
|
"epoch": 0.16677168662045788,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999228197086514,
|
|
"loss": 6.2018,
|
|
"mean_token_accuracy": 0.11904976442456246,
|
|
"num_tokens": 3666145.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 6.2379295349121096,
|
|
"epoch": 0.16719176643562278,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999220334087484,
|
|
"loss": 6.2356,
|
|
"mean_token_accuracy": 0.12509587332606315,
|
|
"num_tokens": 3676722.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 6.233392667770386,
|
|
"epoch": 0.16761184625078765,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999212431244673,
|
|
"loss": 6.2382,
|
|
"mean_token_accuracy": 0.1240171104669571,
|
|
"num_tokens": 3685880.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 6.1124889850616455,
|
|
"epoch": 0.16803192606595252,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999204488558222,
|
|
"loss": 6.0582,
|
|
"mean_token_accuracy": 0.13227254450321196,
|
|
"num_tokens": 3695167.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 6.222057247161866,
|
|
"epoch": 0.16845200588111742,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999196506028273,
|
|
"loss": 6.1797,
|
|
"mean_token_accuracy": 0.12606113404035568,
|
|
"num_tokens": 3703700.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 6.204267930984497,
|
|
"epoch": 0.1688720856962823,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999188483654965,
|
|
"loss": 6.1263,
|
|
"mean_token_accuracy": 0.12780678346753122,
|
|
"num_tokens": 3712825.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 6.068148231506347,
|
|
"epoch": 0.16929216551144718,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999180421438442,
|
|
"loss": 6.0953,
|
|
"mean_token_accuracy": 0.12944422513246537,
|
|
"num_tokens": 3721807.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 6.252347660064697,
|
|
"epoch": 0.16971224532661205,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999172319378846,
|
|
"loss": 6.2617,
|
|
"mean_token_accuracy": 0.12066083624958993,
|
|
"num_tokens": 3730502.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 6.223606538772583,
|
|
"epoch": 0.17013232514177692,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999164177476319,
|
|
"loss": 6.1457,
|
|
"mean_token_accuracy": 0.13003366217017173,
|
|
"num_tokens": 3739696.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 6.0265522480010985,
|
|
"epoch": 0.17055240495694182,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999155995731009,
|
|
"loss": 6.1404,
|
|
"mean_token_accuracy": 0.1299336552619934,
|
|
"num_tokens": 3748675.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 6.380355882644653,
|
|
"epoch": 0.1709724847721067,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999147774143057,
|
|
"loss": 6.2221,
|
|
"mean_token_accuracy": 0.12048738449811935,
|
|
"num_tokens": 3757714.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 6.067580938339233,
|
|
"epoch": 0.1713925645872716,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499913951271261,
|
|
"loss": 6.0375,
|
|
"mean_token_accuracy": 0.13202561810612679,
|
|
"num_tokens": 3767589.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 6.142302322387695,
|
|
"epoch": 0.17181264440243646,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004999131211439816,
|
|
"loss": 6.1596,
|
|
"mean_token_accuracy": 0.12828587144613265,
|
|
"num_tokens": 3777261.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 6.232779121398925,
|
|
"epoch": 0.17223272421760136,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499912287032482,
|
|
"loss": 6.1001,
|
|
"mean_token_accuracy": 0.1372594192624092,
|
|
"num_tokens": 3786658.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 6.025224256515503,
|
|
"epoch": 0.17265280403276623,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499911448936777,
|
|
"loss": 6.1026,
|
|
"mean_token_accuracy": 0.13396917879581452,
|
|
"num_tokens": 3794977.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 6.084959363937378,
|
|
"epoch": 0.1730728838479311,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999106068568816,
|
|
"loss": 6.1787,
|
|
"mean_token_accuracy": 0.12529570311307908,
|
|
"num_tokens": 3805138.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 6.263661098480225,
|
|
"epoch": 0.173492963663096,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999097607928106,
|
|
"loss": 6.1258,
|
|
"mean_token_accuracy": 0.13813115134835244,
|
|
"num_tokens": 3814444.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 6.166193580627441,
|
|
"epoch": 0.17391304347826086,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999089107445788,
|
|
"loss": 6.0785,
|
|
"mean_token_accuracy": 0.12874337583780288,
|
|
"num_tokens": 3822859.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 6.0040192127227785,
|
|
"epoch": 0.17433312329342576,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999080567122016,
|
|
"loss": 6.102,
|
|
"mean_token_accuracy": 0.1266925446689129,
|
|
"num_tokens": 3833159.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 6.185031747817993,
|
|
"epoch": 0.17475320310859063,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999071986956941,
|
|
"loss": 6.1269,
|
|
"mean_token_accuracy": 0.1295515276491642,
|
|
"num_tokens": 3842136.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 6.116478013992309,
|
|
"epoch": 0.1751732829237555,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999063366950713,
|
|
"loss": 6.1939,
|
|
"mean_token_accuracy": 0.1253967322409153,
|
|
"num_tokens": 3851406.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 6.1408590316772464,
|
|
"epoch": 0.1755933627389204,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999054707103486,
|
|
"loss": 6.1026,
|
|
"mean_token_accuracy": 0.1274511694908142,
|
|
"num_tokens": 3861061.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 6.164148044586182,
|
|
"epoch": 0.17601344255408527,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999046007415412,
|
|
"loss": 6.067,
|
|
"mean_token_accuracy": 0.12591860070824623,
|
|
"num_tokens": 3870357.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 6.192416858673096,
|
|
"epoch": 0.17643352236925017,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999037267886646,
|
|
"loss": 6.0964,
|
|
"mean_token_accuracy": 0.1299741767346859,
|
|
"num_tokens": 3879393.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 6.0785363674163815,
|
|
"epoch": 0.17685360218441504,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999028488517343,
|
|
"loss": 6.1037,
|
|
"mean_token_accuracy": 0.12889744639396666,
|
|
"num_tokens": 3888030.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 6.11736216545105,
|
|
"epoch": 0.1772736819995799,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999019669307659,
|
|
"loss": 6.1275,
|
|
"mean_token_accuracy": 0.13039418011903764,
|
|
"num_tokens": 3897430.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 6.1809111595153805,
|
|
"epoch": 0.1776937618147448,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999010810257749,
|
|
"loss": 6.1428,
|
|
"mean_token_accuracy": 0.1269817218184471,
|
|
"num_tokens": 3907711.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 6.062447786331177,
|
|
"epoch": 0.17811384162990967,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999001911367771,
|
|
"loss": 6.0668,
|
|
"mean_token_accuracy": 0.1323694571852684,
|
|
"num_tokens": 3915816.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 6.1604491710662845,
|
|
"epoch": 0.17853392144507457,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998992972637883,
|
|
"loss": 6.1943,
|
|
"mean_token_accuracy": 0.1183660313487053,
|
|
"num_tokens": 3925162.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 6.203741979598999,
|
|
"epoch": 0.17895400126023944,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998983994068242,
|
|
"loss": 6.0864,
|
|
"mean_token_accuracy": 0.1282353989779949,
|
|
"num_tokens": 3934476.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 6.044822025299072,
|
|
"epoch": 0.17937408107540434,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998974975659006,
|
|
"loss": 6.124,
|
|
"mean_token_accuracy": 0.12441963106393814,
|
|
"num_tokens": 3943501.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 6.184865283966064,
|
|
"epoch": 0.1797941608905692,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998965917410338,
|
|
"loss": 6.1111,
|
|
"mean_token_accuracy": 0.12969196289777757,
|
|
"num_tokens": 3953663.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 6.129238748550415,
|
|
"epoch": 0.18021424070573408,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998956819322397,
|
|
"loss": 6.0839,
|
|
"mean_token_accuracy": 0.13072072938084603,
|
|
"num_tokens": 3962634.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 6.135206937789917,
|
|
"epoch": 0.18063432052089898,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998947681395343,
|
|
"loss": 6.0859,
|
|
"mean_token_accuracy": 0.1366378679871559,
|
|
"num_tokens": 3972496.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 6.271072053909302,
|
|
"epoch": 0.18105440033606385,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499893850362934,
|
|
"loss": 6.3296,
|
|
"mean_token_accuracy": 0.12187584564089775,
|
|
"num_tokens": 3980724.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 6.224115467071533,
|
|
"epoch": 0.18147448015122875,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998929286024548,
|
|
"loss": 6.1594,
|
|
"mean_token_accuracy": 0.12844373360276223,
|
|
"num_tokens": 3989842.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 6.123717546463013,
|
|
"epoch": 0.18189455996639362,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998920028581133,
|
|
"loss": 6.0814,
|
|
"mean_token_accuracy": 0.13656101748347282,
|
|
"num_tokens": 3998534.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 6.150679874420166,
|
|
"epoch": 0.18231463978155849,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998910731299258,
|
|
"loss": 6.1088,
|
|
"mean_token_accuracy": 0.12456604689359665,
|
|
"num_tokens": 4007677.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 6.126907587051392,
|
|
"epoch": 0.18273471959672338,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998901394179085,
|
|
"loss": 6.1638,
|
|
"mean_token_accuracy": 0.12525054216384887,
|
|
"num_tokens": 4016347.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 6.135372829437256,
|
|
"epoch": 0.18315479941188825,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998892017220784,
|
|
"loss": 6.0213,
|
|
"mean_token_accuracy": 0.13323480933904647,
|
|
"num_tokens": 4025199.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 6.137722158432007,
|
|
"epoch": 0.18357487922705315,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998882600424519,
|
|
"loss": 6.0876,
|
|
"mean_token_accuracy": 0.12551357075572014,
|
|
"num_tokens": 4033933.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 6.108227968215942,
|
|
"epoch": 0.18399495904221802,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004998873143790455,
|
|
"loss": 6.0183,
|
|
"mean_token_accuracy": 0.1379354938864708,
|
|
"num_tokens": 4042891.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 6.1591612815856935,
|
|
"epoch": 0.1844150388573829,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998863647318763,
|
|
"loss": 6.1366,
|
|
"mean_token_accuracy": 0.1241612270474434,
|
|
"num_tokens": 4051123.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 6.089571523666382,
|
|
"epoch": 0.1848351186725478,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998854111009608,
|
|
"loss": 6.113,
|
|
"mean_token_accuracy": 0.12376126572489739,
|
|
"num_tokens": 4060025.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 6.11730580329895,
|
|
"epoch": 0.18525519848771266,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998844534863161,
|
|
"loss": 6.0217,
|
|
"mean_token_accuracy": 0.12926619052886962,
|
|
"num_tokens": 4069363.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 6.176160907745361,
|
|
"epoch": 0.18567527830287756,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998834918879592,
|
|
"loss": 6.1692,
|
|
"mean_token_accuracy": 0.12947654128074645,
|
|
"num_tokens": 4078855.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 6.131696176528931,
|
|
"epoch": 0.18609535811804243,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499882526305907,
|
|
"loss": 6.1424,
|
|
"mean_token_accuracy": 0.12837494984269143,
|
|
"num_tokens": 4087801.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 6.191353893280029,
|
|
"epoch": 0.18651543793320732,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998815567401765,
|
|
"loss": 6.1351,
|
|
"mean_token_accuracy": 0.12790770679712296,
|
|
"num_tokens": 4096949.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 6.171415328979492,
|
|
"epoch": 0.1869355177483722,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998805831907851,
|
|
"loss": 6.084,
|
|
"mean_token_accuracy": 0.1275387942790985,
|
|
"num_tokens": 4105399.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 6.12052903175354,
|
|
"epoch": 0.18735559756353706,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998796056577501,
|
|
"loss": 6.0391,
|
|
"mean_token_accuracy": 0.1234730213880539,
|
|
"num_tokens": 4113873.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 6.033805179595947,
|
|
"epoch": 0.18777567737870196,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998786241410886,
|
|
"loss": 6.1003,
|
|
"mean_token_accuracy": 0.12796764224767684,
|
|
"num_tokens": 4123528.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 6.244566345214844,
|
|
"epoch": 0.18819575719386683,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499877638640818,
|
|
"loss": 6.1131,
|
|
"mean_token_accuracy": 0.12414761930704117,
|
|
"num_tokens": 4133370.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 6.0351306915283205,
|
|
"epoch": 0.18861583700903173,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499876649156956,
|
|
"loss": 6.0237,
|
|
"mean_token_accuracy": 0.13068948239088057,
|
|
"num_tokens": 4142370.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 6.075446557998657,
|
|
"epoch": 0.1890359168241966,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998756556895196,
|
|
"loss": 6.1176,
|
|
"mean_token_accuracy": 0.12780525609850885,
|
|
"num_tokens": 4152367.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 6.182886552810669,
|
|
"epoch": 0.18945599663936147,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499874658238527,
|
|
"loss": 6.0979,
|
|
"mean_token_accuracy": 0.1277949795126915,
|
|
"num_tokens": 4161126.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 6.106898975372315,
|
|
"epoch": 0.18987607645452637,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998736568039957,
|
|
"loss": 6.0094,
|
|
"mean_token_accuracy": 0.13100193440914154,
|
|
"num_tokens": 4169910.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 6.133787775039673,
|
|
"epoch": 0.19029615626969124,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998726513859432,
|
|
"loss": 6.1599,
|
|
"mean_token_accuracy": 0.12446666359901429,
|
|
"num_tokens": 4179893.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 6.202354001998901,
|
|
"epoch": 0.19071623608485613,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998716419843875,
|
|
"loss": 6.1617,
|
|
"mean_token_accuracy": 0.1319762259721756,
|
|
"num_tokens": 4190065.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 6.011490678787231,
|
|
"epoch": 0.191136315900021,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998706285993465,
|
|
"loss": 6.069,
|
|
"mean_token_accuracy": 0.13331144750118257,
|
|
"num_tokens": 4198395.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 6.173086833953858,
|
|
"epoch": 0.19155639571518587,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998696112308381,
|
|
"loss": 6.093,
|
|
"mean_token_accuracy": 0.1271330051124096,
|
|
"num_tokens": 4207555.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 6.0555767059326175,
|
|
"epoch": 0.19197647553035077,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998685898788803,
|
|
"loss": 6.0375,
|
|
"mean_token_accuracy": 0.1309538424015045,
|
|
"num_tokens": 4216533.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 6.211866235733032,
|
|
"epoch": 0.19239655534551564,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998675645434914,
|
|
"loss": 6.1419,
|
|
"mean_token_accuracy": 0.1353093557059765,
|
|
"num_tokens": 4225575.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 6.018606328964234,
|
|
"epoch": 0.19281663516068054,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998665352246891,
|
|
"loss": 5.9193,
|
|
"mean_token_accuracy": 0.13810657039284707,
|
|
"num_tokens": 4234306.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 6.014672660827637,
|
|
"epoch": 0.1932367149758454,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998655019224921,
|
|
"loss": 6.1267,
|
|
"mean_token_accuracy": 0.12904786244034766,
|
|
"num_tokens": 4243998.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 6.134347867965698,
|
|
"epoch": 0.19365679479101028,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998644646369185,
|
|
"loss": 6.0238,
|
|
"mean_token_accuracy": 0.12680166810750962,
|
|
"num_tokens": 4253653.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 6.066501617431641,
|
|
"epoch": 0.19407687460617518,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998634233679865,
|
|
"loss": 6.0895,
|
|
"mean_token_accuracy": 0.12311211153864861,
|
|
"num_tokens": 4263305.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 6.049868440628051,
|
|
"epoch": 0.19449695442134005,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499862378115715,
|
|
"loss": 5.983,
|
|
"mean_token_accuracy": 0.13395097106695175,
|
|
"num_tokens": 4272212.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 6.165916633605957,
|
|
"epoch": 0.19491703423650494,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998613288801221,
|
|
"loss": 6.1922,
|
|
"mean_token_accuracy": 0.1247316338121891,
|
|
"num_tokens": 4281445.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 6.179806041717529,
|
|
"epoch": 0.1953371140516698,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004998602756612267,
|
|
"loss": 6.0898,
|
|
"mean_token_accuracy": 0.12693395391106604,
|
|
"num_tokens": 4290938.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 6.070136451721192,
|
|
"epoch": 0.1957571938668347,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998592184590471,
|
|
"loss": 6.1397,
|
|
"mean_token_accuracy": 0.12676772177219392,
|
|
"num_tokens": 4300022.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 6.06673412322998,
|
|
"epoch": 0.19617727368199958,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998581572736024,
|
|
"loss": 6.0179,
|
|
"mean_token_accuracy": 0.13165862262248992,
|
|
"num_tokens": 4308910.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 5.994941234588623,
|
|
"epoch": 0.19659735349716445,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998570921049112,
|
|
"loss": 5.9863,
|
|
"mean_token_accuracy": 0.135918989777565,
|
|
"num_tokens": 4317136.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 6.102301931381225,
|
|
"epoch": 0.19701743331232935,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998560229529924,
|
|
"loss": 6.0425,
|
|
"mean_token_accuracy": 0.13503788635134698,
|
|
"num_tokens": 4326163.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 6.227736186981201,
|
|
"epoch": 0.19743751312749422,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998549498178649,
|
|
"loss": 6.1881,
|
|
"mean_token_accuracy": 0.13264173418283462,
|
|
"num_tokens": 4335837.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 6.1506922245025635,
|
|
"epoch": 0.19785759294265912,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998538726995477,
|
|
"loss": 6.1094,
|
|
"mean_token_accuracy": 0.13223380818963051,
|
|
"num_tokens": 4345108.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 6.144142389297485,
|
|
"epoch": 0.198277672757824,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00049985279159806,
|
|
"loss": 6.1229,
|
|
"mean_token_accuracy": 0.1271647334098816,
|
|
"num_tokens": 4353761.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 6.1053972244262695,
|
|
"epoch": 0.19869775257298886,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998517065134208,
|
|
"loss": 6.0771,
|
|
"mean_token_accuracy": 0.1304875746369362,
|
|
"num_tokens": 4363244.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 6.125473690032959,
|
|
"epoch": 0.19911783238815375,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998506174456494,
|
|
"loss": 6.0856,
|
|
"mean_token_accuracy": 0.1269718214869499,
|
|
"num_tokens": 4373034.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 6.056502437591552,
|
|
"epoch": 0.19953791220331862,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998495243947653,
|
|
"loss": 6.0113,
|
|
"mean_token_accuracy": 0.12611002326011658,
|
|
"num_tokens": 4382554.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 6.116158485412598,
|
|
"epoch": 0.19995799201848352,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998484273607875,
|
|
"loss": 6.0324,
|
|
"mean_token_accuracy": 0.13722692728042601,
|
|
"num_tokens": 4391001.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 5.908738136291504,
|
|
"epoch": 0.2003780718336484,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998473263437356,
|
|
"loss": 5.9468,
|
|
"mean_token_accuracy": 0.1328367456793785,
|
|
"num_tokens": 4400632.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 6.068370723724366,
|
|
"epoch": 0.20079815164881326,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499846221343629,
|
|
"loss": 6.0486,
|
|
"mean_token_accuracy": 0.12969876527786256,
|
|
"num_tokens": 4409565.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 6.078929996490478,
|
|
"epoch": 0.20121823146397816,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998451123604875,
|
|
"loss": 5.9972,
|
|
"mean_token_accuracy": 0.13624220937490464,
|
|
"num_tokens": 4418384.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 6.103708171844483,
|
|
"epoch": 0.20163831127914303,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998439993943306,
|
|
"loss": 6.11,
|
|
"mean_token_accuracy": 0.13608327358961106,
|
|
"num_tokens": 4427581.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 6.2018999576568605,
|
|
"epoch": 0.20205839109430793,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998428824451779,
|
|
"loss": 6.1047,
|
|
"mean_token_accuracy": 0.1272777199745178,
|
|
"num_tokens": 4436572.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 6.056638908386231,
|
|
"epoch": 0.2024784709094728,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998417615130495,
|
|
"loss": 6.1099,
|
|
"mean_token_accuracy": 0.12568870037794114,
|
|
"num_tokens": 4445230.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 6.192966461181641,
|
|
"epoch": 0.2028985507246377,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998406365979649,
|
|
"loss": 6.1712,
|
|
"mean_token_accuracy": 0.12947247475385665,
|
|
"num_tokens": 4454251.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 6.0738544940948485,
|
|
"epoch": 0.20331863053980256,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998395076999443,
|
|
"loss": 6.0246,
|
|
"mean_token_accuracy": 0.1331735722720623,
|
|
"num_tokens": 4463949.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 6.164913845062256,
|
|
"epoch": 0.20373871035496743,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998383748190076,
|
|
"loss": 6.2178,
|
|
"mean_token_accuracy": 0.12642809972167016,
|
|
"num_tokens": 4473373.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 6.169246625900269,
|
|
"epoch": 0.20415879017013233,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998372379551748,
|
|
"loss": 6.0443,
|
|
"mean_token_accuracy": 0.13512365892529488,
|
|
"num_tokens": 4482303.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 6.000651454925537,
|
|
"epoch": 0.2045788699852972,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998360971084663,
|
|
"loss": 6.0248,
|
|
"mean_token_accuracy": 0.1257840245962143,
|
|
"num_tokens": 4491214.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 6.060888242721558,
|
|
"epoch": 0.2049989498004621,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998349522789019,
|
|
"loss": 5.9365,
|
|
"mean_token_accuracy": 0.14086327105760574,
|
|
"num_tokens": 4500099.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 6.020166492462158,
|
|
"epoch": 0.20541902961562697,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998338034665021,
|
|
"loss": 6.0199,
|
|
"mean_token_accuracy": 0.13966668471693994,
|
|
"num_tokens": 4509893.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 6.064390420913696,
|
|
"epoch": 0.20583910943079184,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998326506712872,
|
|
"loss": 5.9974,
|
|
"mean_token_accuracy": 0.13378938734531404,
|
|
"num_tokens": 4518606.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 6.097909021377563,
|
|
"epoch": 0.20625918924595674,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998314938932778,
|
|
"loss": 6.0759,
|
|
"mean_token_accuracy": 0.1298009656369686,
|
|
"num_tokens": 4528392.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 6.1035826206207275,
|
|
"epoch": 0.2066792690611216,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998303331324943,
|
|
"loss": 6.0416,
|
|
"mean_token_accuracy": 0.13463694974780083,
|
|
"num_tokens": 4536983.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 5.9858495712280275,
|
|
"epoch": 0.2070993488762865,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998291683889571,
|
|
"loss": 5.9442,
|
|
"mean_token_accuracy": 0.13662122339010238,
|
|
"num_tokens": 4544967.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 6.056029415130615,
|
|
"epoch": 0.20751942869145137,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000499827999662687,
|
|
"loss": 6.0242,
|
|
"mean_token_accuracy": 0.12964650020003318,
|
|
"num_tokens": 4554646.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 6.118838214874268,
|
|
"epoch": 0.20793950850661624,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998268269537046,
|
|
"loss": 6.0401,
|
|
"mean_token_accuracy": 0.13539641574025155,
|
|
"num_tokens": 4564040.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 6.022972631454468,
|
|
"epoch": 0.20835958832178114,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998256502620308,
|
|
"loss": 6.0624,
|
|
"mean_token_accuracy": 0.13345976546406746,
|
|
"num_tokens": 4573758.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 6.193491125106812,
|
|
"epoch": 0.208779668136946,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998244695876864,
|
|
"loss": 6.0874,
|
|
"mean_token_accuracy": 0.13196430653333663,
|
|
"num_tokens": 4582097.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 6.018001937866211,
|
|
"epoch": 0.2091997479521109,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004998232849306921,
|
|
"loss": 6.064,
|
|
"mean_token_accuracy": 0.1368905283510685,
|
|
"num_tokens": 4590687.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 6.152202367782593,
|
|
"epoch": 0.20961982776727578,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998220962910693,
|
|
"loss": 6.0475,
|
|
"mean_token_accuracy": 0.12533890679478646,
|
|
"num_tokens": 4599497.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 6.059301280975342,
|
|
"epoch": 0.21003990758244068,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998209036688386,
|
|
"loss": 6.0091,
|
|
"mean_token_accuracy": 0.12979092076420784,
|
|
"num_tokens": 4607958.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 6.12682089805603,
|
|
"epoch": 0.21045998739760555,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998197070640216,
|
|
"loss": 6.1445,
|
|
"mean_token_accuracy": 0.12323907017707825,
|
|
"num_tokens": 4617515.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 6.13975419998169,
|
|
"epoch": 0.21088006721277042,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998185064766391,
|
|
"loss": 6.028,
|
|
"mean_token_accuracy": 0.13126113414764404,
|
|
"num_tokens": 4627037.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 5.999127197265625,
|
|
"epoch": 0.21130014702793531,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998173019067127,
|
|
"loss": 6.0335,
|
|
"mean_token_accuracy": 0.13387575298547744,
|
|
"num_tokens": 4637393.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 6.049172449111938,
|
|
"epoch": 0.21172022684310018,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998160933542633,
|
|
"loss": 6.0685,
|
|
"mean_token_accuracy": 0.12128801420331001,
|
|
"num_tokens": 4646832.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 6.16112699508667,
|
|
"epoch": 0.21214030665826508,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004998148808193128,
|
|
"loss": 6.095,
|
|
"mean_token_accuracy": 0.1346332848072052,
|
|
"num_tokens": 4655719.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 6.126083850860596,
|
|
"epoch": 0.21256038647342995,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998136643018823,
|
|
"loss": 6.0477,
|
|
"mean_token_accuracy": 0.12910717576742173,
|
|
"num_tokens": 4665364.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 6.087383460998535,
|
|
"epoch": 0.21298046628859482,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998124438019935,
|
|
"loss": 6.0166,
|
|
"mean_token_accuracy": 0.1316668502986431,
|
|
"num_tokens": 4674760.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 5.993421936035157,
|
|
"epoch": 0.21340054610375972,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998112193196681,
|
|
"loss": 5.9488,
|
|
"mean_token_accuracy": 0.13391186147928238,
|
|
"num_tokens": 4683900.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 5.969591331481934,
|
|
"epoch": 0.2138206259189246,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998099908549277,
|
|
"loss": 5.9886,
|
|
"mean_token_accuracy": 0.1273488573729992,
|
|
"num_tokens": 4693915.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 5.9875883102417,
|
|
"epoch": 0.2142407057340895,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499808758407794,
|
|
"loss": 5.8619,
|
|
"mean_token_accuracy": 0.13991126343607901,
|
|
"num_tokens": 4703102.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 6.031775951385498,
|
|
"epoch": 0.21466078554925436,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998075219782889,
|
|
"loss": 6.0787,
|
|
"mean_token_accuracy": 0.1323968604207039,
|
|
"num_tokens": 4712925.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 6.099209594726562,
|
|
"epoch": 0.21508086536441923,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998062815664344,
|
|
"loss": 6.0069,
|
|
"mean_token_accuracy": 0.12949655801057816,
|
|
"num_tokens": 4722641.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 6.046544742584229,
|
|
"epoch": 0.21550094517958412,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998050371722524,
|
|
"loss": 6.0781,
|
|
"mean_token_accuracy": 0.12990766763687134,
|
|
"num_tokens": 4732603.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 5.932075929641724,
|
|
"epoch": 0.215921024994749,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998037887957649,
|
|
"loss": 5.9211,
|
|
"mean_token_accuracy": 0.13785294219851493,
|
|
"num_tokens": 4742644.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 6.21406192779541,
|
|
"epoch": 0.2163411048099139,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998025364369939,
|
|
"loss": 6.2335,
|
|
"mean_token_accuracy": 0.1234040841460228,
|
|
"num_tokens": 4751482.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 6.237205886840821,
|
|
"epoch": 0.21676118462507876,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998012800959619,
|
|
"loss": 6.0891,
|
|
"mean_token_accuracy": 0.12757375389337539,
|
|
"num_tokens": 4760593.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 6.093921661376953,
|
|
"epoch": 0.21718126444024366,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998000197726909,
|
|
"loss": 6.0827,
|
|
"mean_token_accuracy": 0.13335589170455933,
|
|
"num_tokens": 4769294.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 6.031546688079834,
|
|
"epoch": 0.21760134425540853,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004997987554672033,
|
|
"loss": 6.0081,
|
|
"mean_token_accuracy": 0.13305121287703514,
|
|
"num_tokens": 4779239.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 6.059205436706543,
|
|
"epoch": 0.2180214240705734,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004997974871795215,
|
|
"loss": 6.0716,
|
|
"mean_token_accuracy": 0.13057481795549392,
|
|
"num_tokens": 4788211.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 6.109251928329468,
|
|
"epoch": 0.2184415038857383,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499796214909668,
|
|
"loss": 6.0447,
|
|
"mean_token_accuracy": 0.13531798869371414,
|
|
"num_tokens": 4797921.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 6.092241191864014,
|
|
"epoch": 0.21886158370090317,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997949386576653,
|
|
"loss": 6.0378,
|
|
"mean_token_accuracy": 0.13213689997792244,
|
|
"num_tokens": 4807772.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 6.042962265014649,
|
|
"epoch": 0.21928166351606806,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499793658423536,
|
|
"loss": 6.0593,
|
|
"mean_token_accuracy": 0.13149860948324205,
|
|
"num_tokens": 4817999.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 6.057756137847901,
|
|
"epoch": 0.21970174333123293,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997923742073028,
|
|
"loss": 6.0136,
|
|
"mean_token_accuracy": 0.13949006497859956,
|
|
"num_tokens": 4826679.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 5.998235082626342,
|
|
"epoch": 0.2201218231463978,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997910860089884,
|
|
"loss": 6.0157,
|
|
"mean_token_accuracy": 0.13456794619560242,
|
|
"num_tokens": 4834998.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 6.064208889007569,
|
|
"epoch": 0.2205419029615627,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997897938286156,
|
|
"loss": 5.9717,
|
|
"mean_token_accuracy": 0.1337368108332157,
|
|
"num_tokens": 4843635.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 6.085119295120239,
|
|
"epoch": 0.22096198277672757,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997884976662075,
|
|
"loss": 6.0919,
|
|
"mean_token_accuracy": 0.12607687711715698,
|
|
"num_tokens": 4852027.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 6.183318328857422,
|
|
"epoch": 0.22138206259189247,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997871975217868,
|
|
"loss": 6.0165,
|
|
"mean_token_accuracy": 0.1429324761033058,
|
|
"num_tokens": 4861244.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 5.912706756591797,
|
|
"epoch": 0.22180214240705734,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997858933953768,
|
|
"loss": 5.9326,
|
|
"mean_token_accuracy": 0.1404939979314804,
|
|
"num_tokens": 4869902.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 5.963629674911499,
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997845852870004,
|
|
"loss": 5.8982,
|
|
"mean_token_accuracy": 0.14085923954844476,
|
|
"num_tokens": 4878502.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 5.986082458496094,
|
|
"epoch": 0.2226423020373871,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997832731966806,
|
|
"loss": 5.964,
|
|
"mean_token_accuracy": 0.14047276899218558,
|
|
"num_tokens": 4888348.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 6.051373815536499,
|
|
"epoch": 0.22306238185255198,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997819571244411,
|
|
"loss": 6.0172,
|
|
"mean_token_accuracy": 0.13845039829611777,
|
|
"num_tokens": 4897302.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 6.01381549835205,
|
|
"epoch": 0.22348246166771688,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997806370703049,
|
|
"loss": 6.0476,
|
|
"mean_token_accuracy": 0.13289312049746513,
|
|
"num_tokens": 4907078.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 5.983912467956543,
|
|
"epoch": 0.22390254148288175,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997793130342954,
|
|
"loss": 5.8784,
|
|
"mean_token_accuracy": 0.1382697917521,
|
|
"num_tokens": 4917489.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 5.94772891998291,
|
|
"epoch": 0.22432262129804661,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004997779850164363,
|
|
"loss": 5.9836,
|
|
"mean_token_accuracy": 0.13369291126728058,
|
|
"num_tokens": 4927073.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 6.121642923355102,
|
|
"epoch": 0.2247427011132115,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997766530167508,
|
|
"loss": 6.0821,
|
|
"mean_token_accuracy": 0.1270790107548237,
|
|
"num_tokens": 4935464.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 6.221409273147583,
|
|
"epoch": 0.22516278092837638,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997753170352627,
|
|
"loss": 6.1649,
|
|
"mean_token_accuracy": 0.12717002481222153,
|
|
"num_tokens": 4944718.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 6.084948205947876,
|
|
"epoch": 0.22558286074354128,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997739770719955,
|
|
"loss": 6.0396,
|
|
"mean_token_accuracy": 0.1332695096731186,
|
|
"num_tokens": 4954223.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 6.003955984115601,
|
|
"epoch": 0.22600294055870615,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499772633126973,
|
|
"loss": 6.0733,
|
|
"mean_token_accuracy": 0.1317312702536583,
|
|
"num_tokens": 4963371.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 6.013844203948975,
|
|
"epoch": 0.22642302037387105,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997712852002192,
|
|
"loss": 5.9358,
|
|
"mean_token_accuracy": 0.14093514010310174,
|
|
"num_tokens": 4972973.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 6.059261226654053,
|
|
"epoch": 0.22684310018903592,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997699332917578,
|
|
"loss": 6.1739,
|
|
"mean_token_accuracy": 0.12389883399009705,
|
|
"num_tokens": 4982808.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 6.180717802047729,
|
|
"epoch": 0.2272631800042008,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997685774016127,
|
|
"loss": 6.0444,
|
|
"mean_token_accuracy": 0.13330344706773758,
|
|
"num_tokens": 4992427.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 6.1143828392028805,
|
|
"epoch": 0.22768325981936569,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000499767217529808,
|
|
"loss": 6.2262,
|
|
"mean_token_accuracy": 0.12522902861237525,
|
|
"num_tokens": 5003562.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 6.120408248901367,
|
|
"epoch": 0.22810333963453056,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997658536763678,
|
|
"loss": 5.9207,
|
|
"mean_token_accuracy": 0.13713482916355133,
|
|
"num_tokens": 5013429.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 6.080751562118531,
|
|
"epoch": 0.22852341944969545,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004997644858413163,
|
|
"loss": 6.046,
|
|
"mean_token_accuracy": 0.13544052764773368,
|
|
"num_tokens": 5022045.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 5.984566640853882,
|
|
"epoch": 0.22894349926486032,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997631140246775,
|
|
"loss": 5.8853,
|
|
"mean_token_accuracy": 0.14113514721393586,
|
|
"num_tokens": 5032260.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 5.9389331340789795,
|
|
"epoch": 0.2293635790800252,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499761738226476,
|
|
"loss": 5.9276,
|
|
"mean_token_accuracy": 0.13583676218986512,
|
|
"num_tokens": 5041688.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 6.007482099533081,
|
|
"epoch": 0.2297836588951901,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499760358446736,
|
|
"loss": 6.0417,
|
|
"mean_token_accuracy": 0.1291549324989319,
|
|
"num_tokens": 5051005.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 6.1208288192749025,
|
|
"epoch": 0.23020373871035496,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499758974685482,
|
|
"loss": 5.9698,
|
|
"mean_token_accuracy": 0.13492617905139923,
|
|
"num_tokens": 5060084.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 6.010481119155884,
|
|
"epoch": 0.23062381852551986,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004997575869427385,
|
|
"loss": 5.9731,
|
|
"mean_token_accuracy": 0.14254927188158034,
|
|
"num_tokens": 5069081.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 6.021266603469849,
|
|
"epoch": 0.23104389834068473,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00049975619521853,
|
|
"loss": 5.9703,
|
|
"mean_token_accuracy": 0.13409337997436524,
|
|
"num_tokens": 5078597.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 5.943169069290161,
|
|
"epoch": 0.2314639781558496,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997547995128814,
|
|
"loss": 6.0084,
|
|
"mean_token_accuracy": 0.13727526888251304,
|
|
"num_tokens": 5087607.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 6.111000204086304,
|
|
"epoch": 0.2318840579710145,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004997533998258171,
|
|
"loss": 6.0123,
|
|
"mean_token_accuracy": 0.1351937808096409,
|
|
"num_tokens": 5097412.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 6.129235696792603,
|
|
"epoch": 0.23230413778617937,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997519961573622,
|
|
"loss": 6.0735,
|
|
"mean_token_accuracy": 0.1282409645617008,
|
|
"num_tokens": 5105817.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 6.1673665046691895,
|
|
"epoch": 0.23272421760134426,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997505885075414,
|
|
"loss": 6.1269,
|
|
"mean_token_accuracy": 0.12907201573252677,
|
|
"num_tokens": 5114958.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 6.069322109222412,
|
|
"epoch": 0.23314429741650913,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997491768763795,
|
|
"loss": 6.0425,
|
|
"mean_token_accuracy": 0.13409897387027742,
|
|
"num_tokens": 5123728.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 6.003434944152832,
|
|
"epoch": 0.23356437723167403,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997477612639018,
|
|
"loss": 6.0871,
|
|
"mean_token_accuracy": 0.12734304070472718,
|
|
"num_tokens": 5134099.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 6.186435317993164,
|
|
"epoch": 0.2339844570468389,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997463416701332,
|
|
"loss": 6.094,
|
|
"mean_token_accuracy": 0.1274227410554886,
|
|
"num_tokens": 5142934.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 6.043578577041626,
|
|
"epoch": 0.23440453686200377,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997449180950989,
|
|
"loss": 5.9298,
|
|
"mean_token_accuracy": 0.1532392293214798,
|
|
"num_tokens": 5151835.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 5.953121995925903,
|
|
"epoch": 0.23482461667716867,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997434905388241,
|
|
"loss": 5.9842,
|
|
"mean_token_accuracy": 0.1413706734776497,
|
|
"num_tokens": 5161136.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 6.0334107875823975,
|
|
"epoch": 0.23524469649233354,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499742059001334,
|
|
"loss": 5.9191,
|
|
"mean_token_accuracy": 0.1378956101834774,
|
|
"num_tokens": 5170741.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 5.991379880905152,
|
|
"epoch": 0.23566477630749844,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997406234826541,
|
|
"loss": 5.9539,
|
|
"mean_token_accuracy": 0.14059103950858115,
|
|
"num_tokens": 5180549.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 5.995284509658814,
|
|
"epoch": 0.2360848561226633,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004997391839828098,
|
|
"loss": 5.9249,
|
|
"mean_token_accuracy": 0.14390118718147277,
|
|
"num_tokens": 5189486.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 6.030531978607177,
|
|
"epoch": 0.23650493593782818,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997377405018266,
|
|
"loss": 6.0032,
|
|
"mean_token_accuracy": 0.13120983093976973,
|
|
"num_tokens": 5198525.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 6.0725666046142575,
|
|
"epoch": 0.23692501575299307,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00049973629303973,
|
|
"loss": 6.0662,
|
|
"mean_token_accuracy": 0.1294946141541004,
|
|
"num_tokens": 5207124.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 5.958557415008545,
|
|
"epoch": 0.23734509556815794,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997348415965457,
|
|
"loss": 5.878,
|
|
"mean_token_accuracy": 0.13335178643465043,
|
|
"num_tokens": 5216529.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 6.007561159133911,
|
|
"epoch": 0.23776517538332284,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997333861722995,
|
|
"loss": 6.0169,
|
|
"mean_token_accuracy": 0.13635273203253745,
|
|
"num_tokens": 5225796.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 6.125902462005615,
|
|
"epoch": 0.2381852551984877,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499731926767017,
|
|
"loss": 6.0359,
|
|
"mean_token_accuracy": 0.1375264048576355,
|
|
"num_tokens": 5233876.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 5.989985036849975,
|
|
"epoch": 0.23860533501365258,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997304633807242,
|
|
"loss": 6.0396,
|
|
"mean_token_accuracy": 0.12682786211371422,
|
|
"num_tokens": 5244782.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 6.019674825668335,
|
|
"epoch": 0.23902541482881748,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997289960134468,
|
|
"loss": 5.9886,
|
|
"mean_token_accuracy": 0.13695719763636588,
|
|
"num_tokens": 5253453.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 6.0026778221130375,
|
|
"epoch": 0.23944549464398235,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997275246652111,
|
|
"loss": 6.0149,
|
|
"mean_token_accuracy": 0.13926383331418038,
|
|
"num_tokens": 5262355.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 5.99656400680542,
|
|
"epoch": 0.23986557445914725,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499726049336043,
|
|
"loss": 5.9374,
|
|
"mean_token_accuracy": 0.13838583379983901,
|
|
"num_tokens": 5271959.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 6.058608770370483,
|
|
"epoch": 0.24028565427431212,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997245700259686,
|
|
"loss": 5.9673,
|
|
"mean_token_accuracy": 0.1403045229613781,
|
|
"num_tokens": 5281393.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 6.061829471588135,
|
|
"epoch": 0.240705734089477,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997230867350141,
|
|
"loss": 6.0878,
|
|
"mean_token_accuracy": 0.1320396728813648,
|
|
"num_tokens": 5290979.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 6.128190040588379,
|
|
"epoch": 0.24112581390464188,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997215994632059,
|
|
"loss": 6.0392,
|
|
"mean_token_accuracy": 0.13521442338824272,
|
|
"num_tokens": 5300263.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 6.065250301361084,
|
|
"epoch": 0.24154589371980675,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997201082105704,
|
|
"loss": 6.0654,
|
|
"mean_token_accuracy": 0.12793515026569366,
|
|
"num_tokens": 5309522.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 6.059223175048828,
|
|
"epoch": 0.24196597353497165,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997186129771338,
|
|
"loss": 6.0625,
|
|
"mean_token_accuracy": 0.13326726630330085,
|
|
"num_tokens": 5319770.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 6.18207311630249,
|
|
"epoch": 0.24238605335013652,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997171137629226,
|
|
"loss": 6.0695,
|
|
"mean_token_accuracy": 0.13562847971916198,
|
|
"num_tokens": 5328400.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 5.968668270111084,
|
|
"epoch": 0.24280613316530142,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004997156105679636,
|
|
"loss": 5.8716,
|
|
"mean_token_accuracy": 0.14514228701591492,
|
|
"num_tokens": 5336338.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 5.89683952331543,
|
|
"epoch": 0.2432262129804663,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997141033922832,
|
|
"loss": 5.9748,
|
|
"mean_token_accuracy": 0.1309155747294426,
|
|
"num_tokens": 5345391.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 6.103964805603027,
|
|
"epoch": 0.24364629279563116,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997125922359081,
|
|
"loss": 6.0044,
|
|
"mean_token_accuracy": 0.12651756703853606,
|
|
"num_tokens": 5354709.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 6.039173555374146,
|
|
"epoch": 0.24406637261079606,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997110770988652,
|
|
"loss": 5.9187,
|
|
"mean_token_accuracy": 0.13533097133040428,
|
|
"num_tokens": 5363738.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 6.009365177154541,
|
|
"epoch": 0.24448645242596093,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004997095579811813,
|
|
"loss": 6.0492,
|
|
"mean_token_accuracy": 0.13356854170560836,
|
|
"num_tokens": 5373583.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 6.10346941947937,
|
|
"epoch": 0.24490653224112582,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997080348828833,
|
|
"loss": 6.0964,
|
|
"mean_token_accuracy": 0.1329493686556816,
|
|
"num_tokens": 5383486.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 6.022554492950439,
|
|
"epoch": 0.2453266120562907,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997065078039981,
|
|
"loss": 5.995,
|
|
"mean_token_accuracy": 0.1254143126308918,
|
|
"num_tokens": 5391974.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 6.089977025985718,
|
|
"epoch": 0.24574669187145556,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997049767445529,
|
|
"loss": 6.0288,
|
|
"mean_token_accuracy": 0.12984034791588783,
|
|
"num_tokens": 5400882.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 6.110510158538818,
|
|
"epoch": 0.24616677168662046,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997034417045746,
|
|
"loss": 5.9927,
|
|
"mean_token_accuracy": 0.1267140880227089,
|
|
"num_tokens": 5410538.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 5.971307563781738,
|
|
"epoch": 0.24658685150178533,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997019026840907,
|
|
"loss": 5.8743,
|
|
"mean_token_accuracy": 0.13612414821982383,
|
|
"num_tokens": 5419406.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 5.88221755027771,
|
|
"epoch": 0.24700693131695023,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997003596831282,
|
|
"loss": 5.9978,
|
|
"mean_token_accuracy": 0.13463943675160409,
|
|
"num_tokens": 5428817.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 6.0984635829925535,
|
|
"epoch": 0.2474270111321151,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996988127017145,
|
|
"loss": 6.0253,
|
|
"mean_token_accuracy": 0.13181837573647498,
|
|
"num_tokens": 5438277.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 6.0544061183929445,
|
|
"epoch": 0.24784709094728,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004996972617398772,
|
|
"loss": 6.042,
|
|
"mean_token_accuracy": 0.13205936923623085,
|
|
"num_tokens": 5447440.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 6.0680958271026615,
|
|
"epoch": 0.24826717076244487,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996957067976435,
|
|
"loss": 5.9541,
|
|
"mean_token_accuracy": 0.1357963502407074,
|
|
"num_tokens": 5455988.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 6.0058001518249515,
|
|
"epoch": 0.24868725057760974,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004996941478750411,
|
|
"loss": 5.9769,
|
|
"mean_token_accuracy": 0.1373401865363121,
|
|
"num_tokens": 5464996.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 6.083559465408325,
|
|
"epoch": 0.24910733039277463,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996925849720975,
|
|
"loss": 6.1025,
|
|
"mean_token_accuracy": 0.12863337025046348,
|
|
"num_tokens": 5474174.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 6.146986627578736,
|
|
"epoch": 0.2495274102079395,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004996910180888405,
|
|
"loss": 5.9994,
|
|
"mean_token_accuracy": 0.13324794694781303,
|
|
"num_tokens": 5482838.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 6.005090427398682,
|
|
"epoch": 0.2499474900231044,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996894472252977,
|
|
"loss": 6.0195,
|
|
"mean_token_accuracy": 0.13370491713285446,
|
|
"num_tokens": 5491616.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 5.99453763961792,
|
|
"epoch": 0.25036756983826924,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996878723814973,
|
|
"loss": 5.9972,
|
|
"mean_token_accuracy": 0.12933446019887923,
|
|
"num_tokens": 5500942.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 6.035016107559204,
|
|
"epoch": 0.25078764965343414,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996862935574667,
|
|
"loss": 5.9539,
|
|
"mean_token_accuracy": 0.13152176290750503,
|
|
"num_tokens": 5510078.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 5.9494434833526615,
|
|
"epoch": 0.25120772946859904,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996847107532342,
|
|
"loss": 5.9763,
|
|
"mean_token_accuracy": 0.13343006893992423,
|
|
"num_tokens": 5518924.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 6.115957880020142,
|
|
"epoch": 0.25162780928376394,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996831239688277,
|
|
"loss": 5.9896,
|
|
"mean_token_accuracy": 0.12950923070311546,
|
|
"num_tokens": 5527385.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 5.96525821685791,
|
|
"epoch": 0.2520478890989288,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996815332042754,
|
|
"loss": 5.8456,
|
|
"mean_token_accuracy": 0.14307771176099776,
|
|
"num_tokens": 5536781.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.2520478890989288,
|
|
"eval_entropy": 5.826104599310177,
|
|
"eval_loss": 6.01594352722168,
|
|
"eval_mean_token_accuracy": 0.13980411247313787,
|
|
"eval_num_tokens": 5536781.0,
|
|
"eval_runtime": 27.3461,
|
|
"eval_samples_per_second": 1366.412,
|
|
"eval_steps_per_second": 170.811,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 6.008435201644898,
|
|
"epoch": 0.2524679689140937,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996799384596054,
|
|
"loss": 6.0261,
|
|
"mean_token_accuracy": 0.1376914620399475,
|
|
"num_tokens": 5545893.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 6.02188720703125,
|
|
"epoch": 0.2528880487292586,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996783397348461,
|
|
"loss": 5.9762,
|
|
"mean_token_accuracy": 0.1329520359635353,
|
|
"num_tokens": 5555818.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 6.045353794097901,
|
|
"epoch": 0.2533081285444234,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996767370300256,
|
|
"loss": 5.9502,
|
|
"mean_token_accuracy": 0.13486573100090027,
|
|
"num_tokens": 5565331.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 6.056732606887818,
|
|
"epoch": 0.2537282083595883,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996751303451724,
|
|
"loss": 5.9577,
|
|
"mean_token_accuracy": 0.13709068223834037,
|
|
"num_tokens": 5574003.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 5.993344259262085,
|
|
"epoch": 0.2541482881747532,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996735196803149,
|
|
"loss": 5.8551,
|
|
"mean_token_accuracy": 0.1428755633533001,
|
|
"num_tokens": 5582517.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 5.977582693099976,
|
|
"epoch": 0.2545683679899181,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996719050354818,
|
|
"loss": 6.0686,
|
|
"mean_token_accuracy": 0.13471986055374147,
|
|
"num_tokens": 5591952.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 6.0037376403808596,
|
|
"epoch": 0.25498844780508295,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996702864107015,
|
|
"loss": 5.9609,
|
|
"mean_token_accuracy": 0.1396644115447998,
|
|
"num_tokens": 5601460.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 6.176335668563842,
|
|
"epoch": 0.25540852762024785,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996686638060028,
|
|
"loss": 6.0902,
|
|
"mean_token_accuracy": 0.1306911051273346,
|
|
"num_tokens": 5610776.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 5.970763540267944,
|
|
"epoch": 0.25582860743541275,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996670372214144,
|
|
"loss": 5.9871,
|
|
"mean_token_accuracy": 0.13826777338981627,
|
|
"num_tokens": 5619627.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 5.914526128768921,
|
|
"epoch": 0.2562486872505776,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996654066569651,
|
|
"loss": 5.8622,
|
|
"mean_token_accuracy": 0.14179132953286172,
|
|
"num_tokens": 5628969.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 5.981579828262329,
|
|
"epoch": 0.2566687670657425,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996637721126839,
|
|
"loss": 5.9332,
|
|
"mean_token_accuracy": 0.13520999103784562,
|
|
"num_tokens": 5638629.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 6.005596733093261,
|
|
"epoch": 0.2570888468809074,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004996621335885996,
|
|
"loss": 5.9991,
|
|
"mean_token_accuracy": 0.13599340468645096,
|
|
"num_tokens": 5647571.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 6.013420534133911,
|
|
"epoch": 0.2575089266960722,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004996604910847413,
|
|
"loss": 5.916,
|
|
"mean_token_accuracy": 0.14960622489452363,
|
|
"num_tokens": 5656709.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 6.038319206237793,
|
|
"epoch": 0.2579290065112371,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499658844601138,
|
|
"loss": 6.1017,
|
|
"mean_token_accuracy": 0.13502436354756356,
|
|
"num_tokens": 5665714.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 6.07736644744873,
|
|
"epoch": 0.258349086326402,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499657194137819,
|
|
"loss": 6.0546,
|
|
"mean_token_accuracy": 0.13854038044810296,
|
|
"num_tokens": 5675854.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 6.074629402160644,
|
|
"epoch": 0.2587691661415669,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996555396948136,
|
|
"loss": 5.8721,
|
|
"mean_token_accuracy": 0.13419756293296814,
|
|
"num_tokens": 5685690.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 5.940470170974732,
|
|
"epoch": 0.25918924595673176,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996538812721509,
|
|
"loss": 5.9341,
|
|
"mean_token_accuracy": 0.14152218475937844,
|
|
"num_tokens": 5695766.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 6.018071937561035,
|
|
"epoch": 0.25960932577189666,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004996522188698603,
|
|
"loss": 5.9909,
|
|
"mean_token_accuracy": 0.13503170683979987,
|
|
"num_tokens": 5704365.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 6.13015513420105,
|
|
"epoch": 0.26002940558706156,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004996505524879714,
|
|
"loss": 6.0965,
|
|
"mean_token_accuracy": 0.13045159131288528,
|
|
"num_tokens": 5713345.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 6.053025817871093,
|
|
"epoch": 0.2604494854022264,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996488821265137,
|
|
"loss": 5.8921,
|
|
"mean_token_accuracy": 0.14050639048218727,
|
|
"num_tokens": 5722907.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 5.928135585784912,
|
|
"epoch": 0.2608695652173913,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996472077855166,
|
|
"loss": 5.9387,
|
|
"mean_token_accuracy": 0.13793488591909409,
|
|
"num_tokens": 5731589.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 5.923902750015259,
|
|
"epoch": 0.2612896450325562,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00049964552946501,
|
|
"loss": 5.9237,
|
|
"mean_token_accuracy": 0.1389499545097351,
|
|
"num_tokens": 5739922.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 5.905591726303101,
|
|
"epoch": 0.2617097248477211,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996438471650235,
|
|
"loss": 5.8397,
|
|
"mean_token_accuracy": 0.145526784658432,
|
|
"num_tokens": 5749206.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 6.01796875,
|
|
"epoch": 0.26212980466288593,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996421608855869,
|
|
"loss": 5.8992,
|
|
"mean_token_accuracy": 0.1419477328658104,
|
|
"num_tokens": 5758803.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 5.962277746200561,
|
|
"epoch": 0.26254988447805083,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996404706267301,
|
|
"loss": 5.9991,
|
|
"mean_token_accuracy": 0.1301351211965084,
|
|
"num_tokens": 5768368.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 5.935734415054322,
|
|
"epoch": 0.26296996429321573,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499638776388483,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14718177318572997,
|
|
"num_tokens": 5776707.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 5.992966365814209,
|
|
"epoch": 0.26339004410838057,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004996370781708757,
|
|
"loss": 6.0208,
|
|
"mean_token_accuracy": 0.13097626715898514,
|
|
"num_tokens": 5787037.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 6.120069789886474,
|
|
"epoch": 0.26381012392354547,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004996353759739382,
|
|
"loss": 5.9819,
|
|
"mean_token_accuracy": 0.140574112534523,
|
|
"num_tokens": 5796630.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 5.9368353366851805,
|
|
"epoch": 0.26423020373871037,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004996336697977007,
|
|
"loss": 5.978,
|
|
"mean_token_accuracy": 0.13346768617630006,
|
|
"num_tokens": 5806402.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 5.97723422050476,
|
|
"epoch": 0.2646502835538752,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004996319596421933,
|
|
"loss": 5.9278,
|
|
"mean_token_accuracy": 0.13734676092863082,
|
|
"num_tokens": 5815742.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 5.945355033874511,
|
|
"epoch": 0.2650703633690401,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996302455074466,
|
|
"loss": 5.9322,
|
|
"mean_token_accuracy": 0.1382609039545059,
|
|
"num_tokens": 5824915.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 6.0514014720916744,
|
|
"epoch": 0.265490443184205,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004996285273934906,
|
|
"loss": 5.9852,
|
|
"mean_token_accuracy": 0.13715496361255647,
|
|
"num_tokens": 5834978.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 6.052202987670898,
|
|
"epoch": 0.2659105229993699,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499626805300356,
|
|
"loss": 6.1228,
|
|
"mean_token_accuracy": 0.1326017878949642,
|
|
"num_tokens": 5845684.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 6.146022653579712,
|
|
"epoch": 0.26633060281453474,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004996250792280732,
|
|
"loss": 5.9964,
|
|
"mean_token_accuracy": 0.13485243916511536,
|
|
"num_tokens": 5854905.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 6.040951061248779,
|
|
"epoch": 0.26675068262969964,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004996233491766727,
|
|
"loss": 6.0164,
|
|
"mean_token_accuracy": 0.1350037656724453,
|
|
"num_tokens": 5863654.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 6.058253955841065,
|
|
"epoch": 0.26717076244486454,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004996216151461854,
|
|
"loss": 6.0152,
|
|
"mean_token_accuracy": 0.13996267989277839,
|
|
"num_tokens": 5872442.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 6.012804937362671,
|
|
"epoch": 0.2675908422600294,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004996198771366417,
|
|
"loss": 5.9378,
|
|
"mean_token_accuracy": 0.13716716319322586,
|
|
"num_tokens": 5882372.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 5.8219091415405275,
|
|
"epoch": 0.2680109220751943,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996181351480726,
|
|
"loss": 5.7487,
|
|
"mean_token_accuracy": 0.14560527056455613,
|
|
"num_tokens": 5891113.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 5.941916608810425,
|
|
"epoch": 0.2684310018903592,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996163891805089,
|
|
"loss": 5.9892,
|
|
"mean_token_accuracy": 0.14109294563531877,
|
|
"num_tokens": 5899582.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 6.037355852127075,
|
|
"epoch": 0.2688510817055241,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996146392339815,
|
|
"loss": 5.9353,
|
|
"mean_token_accuracy": 0.1392637461423874,
|
|
"num_tokens": 5908938.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 5.9513650894165036,
|
|
"epoch": 0.2692711615206889,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996128853085215,
|
|
"loss": 5.9041,
|
|
"mean_token_accuracy": 0.13895752876996995,
|
|
"num_tokens": 5918055.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 5.997664451599121,
|
|
"epoch": 0.2696912413358538,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996111274041598,
|
|
"loss": 5.8986,
|
|
"mean_token_accuracy": 0.13369553461670874,
|
|
"num_tokens": 5926744.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 5.959716939926148,
|
|
"epoch": 0.2701113211510187,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996093655209277,
|
|
"loss": 5.9958,
|
|
"mean_token_accuracy": 0.1349453993141651,
|
|
"num_tokens": 5936521.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 6.088764905929565,
|
|
"epoch": 0.27053140096618356,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004996075996588563,
|
|
"loss": 6.0616,
|
|
"mean_token_accuracy": 0.13318859413266182,
|
|
"num_tokens": 5945010.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 6.052014112472534,
|
|
"epoch": 0.27095148078134845,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499605829817977,
|
|
"loss": 5.9638,
|
|
"mean_token_accuracy": 0.14223103746771812,
|
|
"num_tokens": 5953766.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 5.979779624938965,
|
|
"epoch": 0.27137156059651335,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499604055998321,
|
|
"loss": 5.875,
|
|
"mean_token_accuracy": 0.13957174718379975,
|
|
"num_tokens": 5962168.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 5.906911420822143,
|
|
"epoch": 0.2717916404116782,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996022781999198,
|
|
"loss": 5.9063,
|
|
"mean_token_accuracy": 0.13852998465299607,
|
|
"num_tokens": 5971627.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 5.9631248950958256,
|
|
"epoch": 0.2722117202268431,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499600496422805,
|
|
"loss": 5.9925,
|
|
"mean_token_accuracy": 0.13308593779802322,
|
|
"num_tokens": 5981775.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 5.993693208694458,
|
|
"epoch": 0.272631800042008,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499598710667008,
|
|
"loss": 5.9061,
|
|
"mean_token_accuracy": 0.1379516489803791,
|
|
"num_tokens": 5991097.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 5.984791469573975,
|
|
"epoch": 0.2730518798571729,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004995969209325604,
|
|
"loss": 5.9693,
|
|
"mean_token_accuracy": 0.13060558065772057,
|
|
"num_tokens": 5999517.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 5.930228567123413,
|
|
"epoch": 0.2734719596723377,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004995951272194941,
|
|
"loss": 5.9479,
|
|
"mean_token_accuracy": 0.12969653084874153,
|
|
"num_tokens": 6008545.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 6.119350004196167,
|
|
"epoch": 0.2738920394875026,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995933295278407,
|
|
"loss": 5.9365,
|
|
"mean_token_accuracy": 0.1350548431277275,
|
|
"num_tokens": 6017366.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 5.9179764747619625,
|
|
"epoch": 0.2743121193026675,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004995915278576321,
|
|
"loss": 5.8875,
|
|
"mean_token_accuracy": 0.14413413256406785,
|
|
"num_tokens": 6025597.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 5.981735897064209,
|
|
"epoch": 0.27473219911783237,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995897222089004,
|
|
"loss": 5.9867,
|
|
"mean_token_accuracy": 0.13929954469203948,
|
|
"num_tokens": 6034239.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 6.11962890625,
|
|
"epoch": 0.27515227893299726,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995879125816772,
|
|
"loss": 6.0068,
|
|
"mean_token_accuracy": 0.13686064183712005,
|
|
"num_tokens": 6043837.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 5.9640697002410885,
|
|
"epoch": 0.27557235874816216,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995860989759949,
|
|
"loss": 5.956,
|
|
"mean_token_accuracy": 0.1416999839246273,
|
|
"num_tokens": 6053217.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 6.0521222114562985,
|
|
"epoch": 0.27599243856332706,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995842813918855,
|
|
"loss": 5.9551,
|
|
"mean_token_accuracy": 0.13722361102700234,
|
|
"num_tokens": 6061553.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 5.9697545051574705,
|
|
"epoch": 0.2764125183784919,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995824598293812,
|
|
"loss": 5.8601,
|
|
"mean_token_accuracy": 0.14069184213876723,
|
|
"num_tokens": 6070080.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 5.995730686187744,
|
|
"epoch": 0.2768325981936568,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995806342885142,
|
|
"loss": 5.9852,
|
|
"mean_token_accuracy": 0.14142092764377595,
|
|
"num_tokens": 6078438.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 6.019344282150269,
|
|
"epoch": 0.2772526780088217,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499578804769317,
|
|
"loss": 5.9771,
|
|
"mean_token_accuracy": 0.13406604304909706,
|
|
"num_tokens": 6087794.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 6.085688066482544,
|
|
"epoch": 0.27767275782398654,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995769712718218,
|
|
"loss": 6.0065,
|
|
"mean_token_accuracy": 0.13597604855895043,
|
|
"num_tokens": 6096709.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 5.9711473941802975,
|
|
"epoch": 0.27809283763915144,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004995751337960613,
|
|
"loss": 5.9269,
|
|
"mean_token_accuracy": 0.13786234930157662,
|
|
"num_tokens": 6105866.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 6.074538946151733,
|
|
"epoch": 0.27851291745431633,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995732923420679,
|
|
"loss": 5.8813,
|
|
"mean_token_accuracy": 0.13884977921843528,
|
|
"num_tokens": 6114882.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 5.857705545425415,
|
|
"epoch": 0.2789329972694812,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004995714469098743,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.13618046417832375,
|
|
"num_tokens": 6123978.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 5.886438226699829,
|
|
"epoch": 0.2793530770846461,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499569597499513,
|
|
"loss": 5.9946,
|
|
"mean_token_accuracy": 0.1375075623393059,
|
|
"num_tokens": 6133246.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 5.993762636184693,
|
|
"epoch": 0.27977315689981097,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004995677441110172,
|
|
"loss": 5.8559,
|
|
"mean_token_accuracy": 0.14045721143484116,
|
|
"num_tokens": 6142865.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 6.025714874267578,
|
|
"epoch": 0.28019323671497587,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995658867444192,
|
|
"loss": 5.9512,
|
|
"mean_token_accuracy": 0.13522876128554345,
|
|
"num_tokens": 6152492.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 5.981087923049927,
|
|
"epoch": 0.2806133165301407,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004995640253997523,
|
|
"loss": 5.959,
|
|
"mean_token_accuracy": 0.1329936422407627,
|
|
"num_tokens": 6161953.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 5.841523504257202,
|
|
"epoch": 0.2810333963453056,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995621600770492,
|
|
"loss": 5.8129,
|
|
"mean_token_accuracy": 0.1412846788764,
|
|
"num_tokens": 6171467.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 5.90531325340271,
|
|
"epoch": 0.2814534761604705,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995602907763431,
|
|
"loss": 5.8859,
|
|
"mean_token_accuracy": 0.13736898675560952,
|
|
"num_tokens": 6180646.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 5.981820106506348,
|
|
"epoch": 0.28187355597563535,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004995584174976672,
|
|
"loss": 5.9116,
|
|
"mean_token_accuracy": 0.13150710314512254,
|
|
"num_tokens": 6189832.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 5.980225324630737,
|
|
"epoch": 0.28229363579080025,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995565402410544,
|
|
"loss": 5.7994,
|
|
"mean_token_accuracy": 0.14472294151782988,
|
|
"num_tokens": 6198339.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 5.924914312362671,
|
|
"epoch": 0.28271371560596514,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004995546590065383,
|
|
"loss": 5.8935,
|
|
"mean_token_accuracy": 0.1394026793539524,
|
|
"num_tokens": 6207564.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 5.931164789199829,
|
|
"epoch": 0.28313379542113004,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004995527737941518,
|
|
"loss": 5.9781,
|
|
"mean_token_accuracy": 0.13914698138833045,
|
|
"num_tokens": 6216056.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 5.968091154098511,
|
|
"epoch": 0.2835538752362949,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995508846039287,
|
|
"loss": 5.9114,
|
|
"mean_token_accuracy": 0.13818917274475098,
|
|
"num_tokens": 6225573.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 6.069493198394776,
|
|
"epoch": 0.2839739550514598,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004995489914359023,
|
|
"loss": 6.0417,
|
|
"mean_token_accuracy": 0.13078732788562775,
|
|
"num_tokens": 6235057.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 6.030756092071533,
|
|
"epoch": 0.2843940348666247,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004995470942901061,
|
|
"loss": 5.9557,
|
|
"mean_token_accuracy": 0.13645285964012147,
|
|
"num_tokens": 6244164.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 6.068174362182617,
|
|
"epoch": 0.2848141146817895,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004995451931665738,
|
|
"loss": 5.9588,
|
|
"mean_token_accuracy": 0.13424528315663337,
|
|
"num_tokens": 6253095.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 5.918725109100341,
|
|
"epoch": 0.2852341944969544,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499543288065339,
|
|
"loss": 5.9038,
|
|
"mean_token_accuracy": 0.13533290028572081,
|
|
"num_tokens": 6261134.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 5.926444101333618,
|
|
"epoch": 0.2856542743121193,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004995413789864354,
|
|
"loss": 5.9066,
|
|
"mean_token_accuracy": 0.1413659855723381,
|
|
"num_tokens": 6270384.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 5.974505090713501,
|
|
"epoch": 0.28607435412728416,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995394659298971,
|
|
"loss": 5.842,
|
|
"mean_token_accuracy": 0.14783402383327485,
|
|
"num_tokens": 6279702.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 5.924916839599609,
|
|
"epoch": 0.28649443394244906,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004995375488957576,
|
|
"loss": 5.8871,
|
|
"mean_token_accuracy": 0.1403558671474457,
|
|
"num_tokens": 6288297.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 5.979348230361938,
|
|
"epoch": 0.28691451375761395,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499535627884051,
|
|
"loss": 5.983,
|
|
"mean_token_accuracy": 0.12937102988362312,
|
|
"num_tokens": 6297288.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 6.12882170677185,
|
|
"epoch": 0.28733459357277885,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995337028948115,
|
|
"loss": 6.0094,
|
|
"mean_token_accuracy": 0.13142260611057283,
|
|
"num_tokens": 6306719.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 5.93622145652771,
|
|
"epoch": 0.2877546733879437,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995317739280731,
|
|
"loss": 5.8256,
|
|
"mean_token_accuracy": 0.14748729318380355,
|
|
"num_tokens": 6316639.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 5.951609373092651,
|
|
"epoch": 0.2881747532031086,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004995298409838699,
|
|
"loss": 5.9555,
|
|
"mean_token_accuracy": 0.1391440898180008,
|
|
"num_tokens": 6326879.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 5.9383097171783445,
|
|
"epoch": 0.2885948330182735,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499527904062236,
|
|
"loss": 5.8671,
|
|
"mean_token_accuracy": 0.139659284055233,
|
|
"num_tokens": 6335729.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 5.971969127655029,
|
|
"epoch": 0.28901491283343833,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995259631632061,
|
|
"loss": 5.9185,
|
|
"mean_token_accuracy": 0.1310904636979103,
|
|
"num_tokens": 6345154.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 5.977327823638916,
|
|
"epoch": 0.28943499264860323,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995240182868143,
|
|
"loss": 5.8858,
|
|
"mean_token_accuracy": 0.14063168689608574,
|
|
"num_tokens": 6354309.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 5.8834575653076175,
|
|
"epoch": 0.2898550724637681,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995220694330951,
|
|
"loss": 5.8586,
|
|
"mean_token_accuracy": 0.14082162082195282,
|
|
"num_tokens": 6363389.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 5.92822527885437,
|
|
"epoch": 0.290275152278933,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995201166020832,
|
|
"loss": 5.9065,
|
|
"mean_token_accuracy": 0.13562884032726288,
|
|
"num_tokens": 6372475.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 6.024522161483764,
|
|
"epoch": 0.29069523209409787,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499518159793813,
|
|
"loss": 5.8677,
|
|
"mean_token_accuracy": 0.14305904358625413,
|
|
"num_tokens": 6380906.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 5.884508085250855,
|
|
"epoch": 0.29111531190926276,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499516199008319,
|
|
"loss": 5.8659,
|
|
"mean_token_accuracy": 0.14293192625045775,
|
|
"num_tokens": 6390085.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 6.008301162719727,
|
|
"epoch": 0.29153539172442766,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995142342456364,
|
|
"loss": 5.9391,
|
|
"mean_token_accuracy": 0.13623592853546143,
|
|
"num_tokens": 6399441.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 6.066584539413452,
|
|
"epoch": 0.2919554715395925,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995122655057997,
|
|
"loss": 6.0208,
|
|
"mean_token_accuracy": 0.13953343629837037,
|
|
"num_tokens": 6408995.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 5.888063764572143,
|
|
"epoch": 0.2923755513547574,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995102927888437,
|
|
"loss": 5.7722,
|
|
"mean_token_accuracy": 0.1459358014166355,
|
|
"num_tokens": 6418080.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 5.952468156814575,
|
|
"epoch": 0.2927956311699223,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004995083160948036,
|
|
"loss": 5.9318,
|
|
"mean_token_accuracy": 0.14023924767971038,
|
|
"num_tokens": 6426732.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 5.971553039550781,
|
|
"epoch": 0.29321571098508714,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004995063354237141,
|
|
"loss": 5.9538,
|
|
"mean_token_accuracy": 0.14043337404727935,
|
|
"num_tokens": 6435957.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 5.94589900970459,
|
|
"epoch": 0.29363579080025204,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004995043507756107,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.133124540746212,
|
|
"num_tokens": 6445642.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 5.974902820587158,
|
|
"epoch": 0.29405587061541694,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004995023621505282,
|
|
"loss": 5.9363,
|
|
"mean_token_accuracy": 0.1418766610324383,
|
|
"num_tokens": 6454664.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 5.940143728256226,
|
|
"epoch": 0.29447595043058183,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000499500369548502,
|
|
"loss": 5.8583,
|
|
"mean_token_accuracy": 0.1379205584526062,
|
|
"num_tokens": 6463224.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 6.120481824874878,
|
|
"epoch": 0.2948960302457467,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004994983729695674,
|
|
"loss": 6.0926,
|
|
"mean_token_accuracy": 0.1296972803771496,
|
|
"num_tokens": 6473112.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 5.980841064453125,
|
|
"epoch": 0.2953161100609116,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004994963724137595,
|
|
"loss": 5.9214,
|
|
"mean_token_accuracy": 0.1389226034283638,
|
|
"num_tokens": 6482062.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 5.932737588882446,
|
|
"epoch": 0.29573618987607647,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004994943678811142,
|
|
"loss": 5.9004,
|
|
"mean_token_accuracy": 0.13374803215265274,
|
|
"num_tokens": 6490568.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 5.997820091247559,
|
|
"epoch": 0.2961562696912413,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994923593716667,
|
|
"loss": 5.963,
|
|
"mean_token_accuracy": 0.14052257165312768,
|
|
"num_tokens": 6500815.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 5.916243839263916,
|
|
"epoch": 0.2965763495064062,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004994903468854527,
|
|
"loss": 5.8376,
|
|
"mean_token_accuracy": 0.14926647543907165,
|
|
"num_tokens": 6509529.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 5.922206735610962,
|
|
"epoch": 0.2969964293215711,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994883304225077,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.13852014467120172,
|
|
"num_tokens": 6517934.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 5.9876025199890135,
|
|
"epoch": 0.297416509136736,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994863099828675,
|
|
"loss": 5.8695,
|
|
"mean_token_accuracy": 0.14087166935205458,
|
|
"num_tokens": 6526098.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 5.935700082778931,
|
|
"epoch": 0.29783658895190085,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499484285566568,
|
|
"loss": 5.906,
|
|
"mean_token_accuracy": 0.13566448390483857,
|
|
"num_tokens": 6535831.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 5.939550399780273,
|
|
"epoch": 0.29825666876706575,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004994822571736449,
|
|
"loss": 5.8255,
|
|
"mean_token_accuracy": 0.13489115089178086,
|
|
"num_tokens": 6545704.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 5.947116851806641,
|
|
"epoch": 0.29867674858223064,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994802248041342,
|
|
"loss": 5.8548,
|
|
"mean_token_accuracy": 0.14142827019095422,
|
|
"num_tokens": 6554423.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 5.969081258773803,
|
|
"epoch": 0.2990968283973955,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000499478188458072,
|
|
"loss": 5.9073,
|
|
"mean_token_accuracy": 0.13533755540847778,
|
|
"num_tokens": 6563989.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 5.9689305305480955,
|
|
"epoch": 0.2995169082125604,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004994761481354943,
|
|
"loss": 6.0328,
|
|
"mean_token_accuracy": 0.13800237625837325,
|
|
"num_tokens": 6572745.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 6.133339929580688,
|
|
"epoch": 0.2999369880277253,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004994741038364371,
|
|
"loss": 6.0333,
|
|
"mean_token_accuracy": 0.13616435453295708,
|
|
"num_tokens": 6581723.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 5.896167135238647,
|
|
"epoch": 0.3003570678428901,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004994720555609369,
|
|
"loss": 5.7604,
|
|
"mean_token_accuracy": 0.1434899814426899,
|
|
"num_tokens": 6590342.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 5.878182983398437,
|
|
"epoch": 0.300777147658055,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004994700033090297,
|
|
"loss": 5.8344,
|
|
"mean_token_accuracy": 0.14836035221815108,
|
|
"num_tokens": 6599206.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 6.036917591094971,
|
|
"epoch": 0.3011972274732199,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000499467947080752,
|
|
"loss": 6.1289,
|
|
"mean_token_accuracy": 0.13054108917713164,
|
|
"num_tokens": 6608947.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 6.017320966720581,
|
|
"epoch": 0.3016173072883848,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004994658868761402,
|
|
"loss": 5.9128,
|
|
"mean_token_accuracy": 0.14748418629169463,
|
|
"num_tokens": 6618378.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 5.987727546691895,
|
|
"epoch": 0.30203738710354966,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994638226952307,
|
|
"loss": 5.9681,
|
|
"mean_token_accuracy": 0.13054394274950026,
|
|
"num_tokens": 6627527.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 5.996758890151978,
|
|
"epoch": 0.30245746691871456,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004994617545380604,
|
|
"loss": 5.8919,
|
|
"mean_token_accuracy": 0.13826094195246696,
|
|
"num_tokens": 6636964.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 5.905787420272827,
|
|
"epoch": 0.30287754673387945,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004994596824046656,
|
|
"loss": 5.8569,
|
|
"mean_token_accuracy": 0.141887067258358,
|
|
"num_tokens": 6646074.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 5.99219708442688,
|
|
"epoch": 0.3032976265490443,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000499457606295083,
|
|
"loss": 5.9311,
|
|
"mean_token_accuracy": 0.13836071118712426,
|
|
"num_tokens": 6655027.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 5.7845015048980715,
|
|
"epoch": 0.3037177063642092,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004994555262093495,
|
|
"loss": 5.713,
|
|
"mean_token_accuracy": 0.15755455046892167,
|
|
"num_tokens": 6663747.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 6.036468362808227,
|
|
"epoch": 0.3041377861793741,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000499453442147502,
|
|
"loss": 6.0392,
|
|
"mean_token_accuracy": 0.13115543723106385,
|
|
"num_tokens": 6672922.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 5.979010963439942,
|
|
"epoch": 0.304557865994539,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004994513541095773,
|
|
"loss": 5.8654,
|
|
"mean_token_accuracy": 0.14586904942989348,
|
|
"num_tokens": 6682233.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 5.928103733062744,
|
|
"epoch": 0.30497794580970383,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004994492620956126,
|
|
"loss": 5.9125,
|
|
"mean_token_accuracy": 0.14258120208978653,
|
|
"num_tokens": 6691593.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 5.953917217254639,
|
|
"epoch": 0.30539802562486873,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994471661056445,
|
|
"loss": 5.9125,
|
|
"mean_token_accuracy": 0.14142323583364486,
|
|
"num_tokens": 6701318.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 5.986124277114868,
|
|
"epoch": 0.3058181054400336,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994450661397106,
|
|
"loss": 5.9176,
|
|
"mean_token_accuracy": 0.14466760009527208,
|
|
"num_tokens": 6710059.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 6.110535717010498,
|
|
"epoch": 0.30623818525519847,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499442962197848,
|
|
"loss": 6.0091,
|
|
"mean_token_accuracy": 0.1349786825478077,
|
|
"num_tokens": 6719811.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 5.885643482208252,
|
|
"epoch": 0.30665826507036337,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994408542800937,
|
|
"loss": 5.8848,
|
|
"mean_token_accuracy": 0.13900379538536073,
|
|
"num_tokens": 6728789.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 5.929373550415039,
|
|
"epoch": 0.30707834488552826,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004994387423864855,
|
|
"loss": 5.8632,
|
|
"mean_token_accuracy": 0.1396006353199482,
|
|
"num_tokens": 6737706.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 5.928421974182129,
|
|
"epoch": 0.3074984247006931,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994366265170603,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.1530800625681877,
|
|
"num_tokens": 6746861.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 6.01959867477417,
|
|
"epoch": 0.307918504515858,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994345066718558,
|
|
"loss": 6.0207,
|
|
"mean_token_accuracy": 0.13322951793670654,
|
|
"num_tokens": 6755242.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 6.026466798782349,
|
|
"epoch": 0.3083385843310229,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994323828509098,
|
|
"loss": 5.954,
|
|
"mean_token_accuracy": 0.13347591310739518,
|
|
"num_tokens": 6764549.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 5.915293598175049,
|
|
"epoch": 0.3087586641461878,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004994302550542596,
|
|
"loss": 5.9418,
|
|
"mean_token_accuracy": 0.14316236823797227,
|
|
"num_tokens": 6774123.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 5.850841808319092,
|
|
"epoch": 0.30917874396135264,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000499428123281943,
|
|
"loss": 5.7122,
|
|
"mean_token_accuracy": 0.1474112629890442,
|
|
"num_tokens": 6782922.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 5.9184730052948,
|
|
"epoch": 0.30959882377651754,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994259875339978,
|
|
"loss": 5.9611,
|
|
"mean_token_accuracy": 0.13746373876929283,
|
|
"num_tokens": 6792042.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 6.05865330696106,
|
|
"epoch": 0.31001890359168244,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004994238478104617,
|
|
"loss": 5.9598,
|
|
"mean_token_accuracy": 0.1366279661655426,
|
|
"num_tokens": 6800994.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 5.93690128326416,
|
|
"epoch": 0.3104389834068473,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994217041113727,
|
|
"loss": 5.8868,
|
|
"mean_token_accuracy": 0.14316150173544884,
|
|
"num_tokens": 6809938.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 6.014241790771484,
|
|
"epoch": 0.3108590632220122,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994195564367688,
|
|
"loss": 6.0213,
|
|
"mean_token_accuracy": 0.13116879239678383,
|
|
"num_tokens": 6820289.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 6.002475690841675,
|
|
"epoch": 0.3112791430371771,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994174047866882,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14203700423240662,
|
|
"num_tokens": 6830068.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 5.788861274719238,
|
|
"epoch": 0.3116992228523419,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994152491611686,
|
|
"loss": 5.8813,
|
|
"mean_token_accuracy": 0.13960717990994453,
|
|
"num_tokens": 6838591.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 5.89765567779541,
|
|
"epoch": 0.3121193026675068,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994130895602485,
|
|
"loss": 5.8505,
|
|
"mean_token_accuracy": 0.13729089125990868,
|
|
"num_tokens": 6847796.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 6.010899591445923,
|
|
"epoch": 0.3125393824826717,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499410925983966,
|
|
"loss": 5.941,
|
|
"mean_token_accuracy": 0.13994767293334007,
|
|
"num_tokens": 6856585.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 5.889919090270996,
|
|
"epoch": 0.3129594622978366,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004994087584323596,
|
|
"loss": 5.8502,
|
|
"mean_token_accuracy": 0.14524889141321182,
|
|
"num_tokens": 6865757.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 5.9244975566864015,
|
|
"epoch": 0.31337954211300145,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004994065869054676,
|
|
"loss": 5.9051,
|
|
"mean_token_accuracy": 0.13346855491399764,
|
|
"num_tokens": 6875371.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 5.990236139297485,
|
|
"epoch": 0.31379962192816635,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004994044114033283,
|
|
"loss": 5.9445,
|
|
"mean_token_accuracy": 0.13406403809785844,
|
|
"num_tokens": 6884050.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 6.023118162155152,
|
|
"epoch": 0.31421970174333125,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994022319259806,
|
|
"loss": 5.9236,
|
|
"mean_token_accuracy": 0.1428280971944332,
|
|
"num_tokens": 6893079.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 5.977470397949219,
|
|
"epoch": 0.3146397815584961,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004994000484734629,
|
|
"loss": 6.0157,
|
|
"mean_token_accuracy": 0.14197005555033684,
|
|
"num_tokens": 6903100.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 5.968418455123901,
|
|
"epoch": 0.315059861373661,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004993978610458137,
|
|
"loss": 5.8564,
|
|
"mean_token_accuracy": 0.1436561480164528,
|
|
"num_tokens": 6912164.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 5.8913768291473385,
|
|
"epoch": 0.3154799411888259,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993956696430721,
|
|
"loss": 5.8793,
|
|
"mean_token_accuracy": 0.13736136257648468,
|
|
"num_tokens": 6921183.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 6.017658281326294,
|
|
"epoch": 0.3159000210039908,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993934742652768,
|
|
"loss": 5.9616,
|
|
"mean_token_accuracy": 0.1389385998249054,
|
|
"num_tokens": 6931325.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 6.002210426330566,
|
|
"epoch": 0.3163201008191556,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004993912749124665,
|
|
"loss": 5.8433,
|
|
"mean_token_accuracy": 0.1487124353647232,
|
|
"num_tokens": 6940234.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 5.929537010192871,
|
|
"epoch": 0.3167401806343205,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004993890715846804,
|
|
"loss": 5.9507,
|
|
"mean_token_accuracy": 0.14044182747602463,
|
|
"num_tokens": 6949067.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 5.998405647277832,
|
|
"epoch": 0.3171602604494854,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004993868642819574,
|
|
"loss": 5.9194,
|
|
"mean_token_accuracy": 0.13718469440937042,
|
|
"num_tokens": 6959085.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 5.961022281646729,
|
|
"epoch": 0.31758034026465026,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004993846530043367,
|
|
"loss": 5.9451,
|
|
"mean_token_accuracy": 0.13289572075009345,
|
|
"num_tokens": 6967392.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 5.938811779022217,
|
|
"epoch": 0.31800042007981516,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993824377518574,
|
|
"loss": 5.8794,
|
|
"mean_token_accuracy": 0.14492053985595704,
|
|
"num_tokens": 6976369.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 6.007278203964233,
|
|
"epoch": 0.31842049989498006,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004993802185245587,
|
|
"loss": 5.8979,
|
|
"mean_token_accuracy": 0.14349642321467398,
|
|
"num_tokens": 6985889.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 5.902310371398926,
|
|
"epoch": 0.3188405797101449,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00049937799532248,
|
|
"loss": 5.9155,
|
|
"mean_token_accuracy": 0.13254671469330787,
|
|
"num_tokens": 6995396.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 6.108139371871948,
|
|
"epoch": 0.3192606595253098,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993757681456607,
|
|
"loss": 5.974,
|
|
"mean_token_accuracy": 0.13683522641658782,
|
|
"num_tokens": 7004666.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 5.993764448165893,
|
|
"epoch": 0.3196807393404747,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004993735369941401,
|
|
"loss": 6.0094,
|
|
"mean_token_accuracy": 0.13341464176774026,
|
|
"num_tokens": 7014608.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 5.958604240417481,
|
|
"epoch": 0.3201008191556396,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004993713018679579,
|
|
"loss": 5.866,
|
|
"mean_token_accuracy": 0.14026129618287086,
|
|
"num_tokens": 7023671.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 5.995219898223877,
|
|
"epoch": 0.32052089897080444,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993690627671536,
|
|
"loss": 5.9253,
|
|
"mean_token_accuracy": 0.13401568681001663,
|
|
"num_tokens": 7033786.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 5.926336812973022,
|
|
"epoch": 0.32094097878596933,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004993668196917669,
|
|
"loss": 5.8311,
|
|
"mean_token_accuracy": 0.14573807418346404,
|
|
"num_tokens": 7042162.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 5.96917757987976,
|
|
"epoch": 0.32136105860113423,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993645726418375,
|
|
"loss": 5.981,
|
|
"mean_token_accuracy": 0.13832971975207328,
|
|
"num_tokens": 7051903.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 5.879901790618897,
|
|
"epoch": 0.3217811384162991,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993623216174053,
|
|
"loss": 5.8013,
|
|
"mean_token_accuracy": 0.15186585038900374,
|
|
"num_tokens": 7060229.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 5.918556547164917,
|
|
"epoch": 0.32220121823146397,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00049936006661851,
|
|
"loss": 5.8909,
|
|
"mean_token_accuracy": 0.13876768276095391,
|
|
"num_tokens": 7069040.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 5.9392224788665775,
|
|
"epoch": 0.32262129804662887,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993578076451917,
|
|
"loss": 5.7726,
|
|
"mean_token_accuracy": 0.14143876731395721,
|
|
"num_tokens": 7078409.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 5.779048347473145,
|
|
"epoch": 0.32304137786179377,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004993555446974903,
|
|
"loss": 5.8733,
|
|
"mean_token_accuracy": 0.13716461956501008,
|
|
"num_tokens": 7087983.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 5.941289329528809,
|
|
"epoch": 0.3234614576769586,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499353277775446,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.14281788170337678,
|
|
"num_tokens": 7097277.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 5.894749402999878,
|
|
"epoch": 0.3238815374921235,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004993510068790989,
|
|
"loss": 5.7164,
|
|
"mean_token_accuracy": 0.15665216147899627,
|
|
"num_tokens": 7105918.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 5.773345851898194,
|
|
"epoch": 0.3243016173072884,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004993487320084892,
|
|
"loss": 5.7838,
|
|
"mean_token_accuracy": 0.15064965635538102,
|
|
"num_tokens": 7115049.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 5.944450235366821,
|
|
"epoch": 0.32472169712245325,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993464531636573,
|
|
"loss": 5.8883,
|
|
"mean_token_accuracy": 0.13874924927949905,
|
|
"num_tokens": 7124862.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 5.947724437713623,
|
|
"epoch": 0.32514177693761814,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993441703446435,
|
|
"loss": 5.7816,
|
|
"mean_token_accuracy": 0.1445206731557846,
|
|
"num_tokens": 7133280.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 6.020012712478637,
|
|
"epoch": 0.32556185675278304,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004993418835514882,
|
|
"loss": 5.9743,
|
|
"mean_token_accuracy": 0.1368774726986885,
|
|
"num_tokens": 7142446.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 5.944014692306519,
|
|
"epoch": 0.3259819365679479,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004993395927842321,
|
|
"loss": 5.8824,
|
|
"mean_token_accuracy": 0.1359010323882103,
|
|
"num_tokens": 7152143.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 5.993379163742065,
|
|
"epoch": 0.3264020163831128,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004993372980429155,
|
|
"loss": 5.9617,
|
|
"mean_token_accuracy": 0.13282209262251854,
|
|
"num_tokens": 7162046.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 5.989493370056152,
|
|
"epoch": 0.3268220961982777,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004993349993275792,
|
|
"loss": 5.8488,
|
|
"mean_token_accuracy": 0.14026510193943978,
|
|
"num_tokens": 7171557.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 5.754479789733887,
|
|
"epoch": 0.3272421760134426,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004993326966382639,
|
|
"loss": 5.7423,
|
|
"mean_token_accuracy": 0.14871106296777725,
|
|
"num_tokens": 7180927.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 5.8972282886505125,
|
|
"epoch": 0.3276622558286074,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004993303899750104,
|
|
"loss": 5.8311,
|
|
"mean_token_accuracy": 0.1395234152674675,
|
|
"num_tokens": 7189552.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 6.021924352645874,
|
|
"epoch": 0.3280823356437723,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004993280793378595,
|
|
"loss": 5.8549,
|
|
"mean_token_accuracy": 0.13788855373859404,
|
|
"num_tokens": 7197857.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 5.914785861968994,
|
|
"epoch": 0.3285024154589372,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004993257647268522,
|
|
"loss": 5.8281,
|
|
"mean_token_accuracy": 0.14489276185631753,
|
|
"num_tokens": 7206785.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 5.945201826095581,
|
|
"epoch": 0.32892249527410206,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004993234461420295,
|
|
"loss": 5.9003,
|
|
"mean_token_accuracy": 0.1415283761918545,
|
|
"num_tokens": 7216360.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 5.844962692260742,
|
|
"epoch": 0.32934257508926695,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004993211235834326,
|
|
"loss": 5.7122,
|
|
"mean_token_accuracy": 0.15939737260341644,
|
|
"num_tokens": 7224890.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 5.77975697517395,
|
|
"epoch": 0.32976265490443185,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004993187970511023,
|
|
"loss": 5.7707,
|
|
"mean_token_accuracy": 0.16336829960346222,
|
|
"num_tokens": 7234442.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 5.964393234252929,
|
|
"epoch": 0.33018273471959675,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004993164665450801,
|
|
"loss": 5.9279,
|
|
"mean_token_accuracy": 0.1439814858138561,
|
|
"num_tokens": 7244023.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 5.916021871566772,
|
|
"epoch": 0.3306028145347616,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993141320654072,
|
|
"loss": 5.7793,
|
|
"mean_token_accuracy": 0.14671456664800644,
|
|
"num_tokens": 7253548.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 5.898174810409546,
|
|
"epoch": 0.3310228943499265,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000499311793612125,
|
|
"loss": 5.8402,
|
|
"mean_token_accuracy": 0.1421785496175289,
|
|
"num_tokens": 7262962.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 5.964570426940918,
|
|
"epoch": 0.3314429741650914,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993094511852748,
|
|
"loss": 5.863,
|
|
"mean_token_accuracy": 0.14184453189373017,
|
|
"num_tokens": 7272234.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 5.929952716827392,
|
|
"epoch": 0.33186305398025623,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993071047848983,
|
|
"loss": 5.8493,
|
|
"mean_token_accuracy": 0.1383821338415146,
|
|
"num_tokens": 7281524.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 5.838898372650147,
|
|
"epoch": 0.3322831337954211,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004993047544110368,
|
|
"loss": 5.7384,
|
|
"mean_token_accuracy": 0.14712240919470787,
|
|
"num_tokens": 7289601.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 5.791057062149048,
|
|
"epoch": 0.332703213610586,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004993024000637321,
|
|
"loss": 5.7137,
|
|
"mean_token_accuracy": 0.15096415132284163,
|
|
"num_tokens": 7298508.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 5.892502069473267,
|
|
"epoch": 0.33312329342575087,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993000417430259,
|
|
"loss": 5.9339,
|
|
"mean_token_accuracy": 0.1390118695795536,
|
|
"num_tokens": 7309065.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 6.066646718978882,
|
|
"epoch": 0.33354337324091576,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00049929767944896,
|
|
"loss": 5.953,
|
|
"mean_token_accuracy": 0.1411003813147545,
|
|
"num_tokens": 7319669.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 6.000399112701416,
|
|
"epoch": 0.33396345305608066,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004992953131815761,
|
|
"loss": 5.9022,
|
|
"mean_token_accuracy": 0.1418354742228985,
|
|
"num_tokens": 7328425.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 5.8749700546264645,
|
|
"epoch": 0.33438353287124556,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992929429409164,
|
|
"loss": 5.775,
|
|
"mean_token_accuracy": 0.1469979852437973,
|
|
"num_tokens": 7337369.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 5.913109064102173,
|
|
"epoch": 0.3348036126864104,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004992905687270225,
|
|
"loss": 5.8411,
|
|
"mean_token_accuracy": 0.1466023862361908,
|
|
"num_tokens": 7346829.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 5.973616456985473,
|
|
"epoch": 0.3352236925015753,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992881905399368,
|
|
"loss": 5.9044,
|
|
"mean_token_accuracy": 0.14303565323352813,
|
|
"num_tokens": 7355976.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 5.9362890243530275,
|
|
"epoch": 0.3356437723167402,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004992858083797013,
|
|
"loss": 5.8555,
|
|
"mean_token_accuracy": 0.13833607137203216,
|
|
"num_tokens": 7365210.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 5.910732650756836,
|
|
"epoch": 0.33606385213190504,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992834222463581,
|
|
"loss": 5.9097,
|
|
"mean_token_accuracy": 0.13066598325967788,
|
|
"num_tokens": 7374175.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 6.022627830505371,
|
|
"epoch": 0.33648393194706994,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992810321399496,
|
|
"loss": 5.936,
|
|
"mean_token_accuracy": 0.13869498372077943,
|
|
"num_tokens": 7383302.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 6.006158876419067,
|
|
"epoch": 0.33690401176223483,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004992786380605182,
|
|
"loss": 5.9162,
|
|
"mean_token_accuracy": 0.13912810906767845,
|
|
"num_tokens": 7392746.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 5.839102506637573,
|
|
"epoch": 0.33732409157739973,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004992762400081062,
|
|
"loss": 5.7562,
|
|
"mean_token_accuracy": 0.1469271421432495,
|
|
"num_tokens": 7401604.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 5.856449317932129,
|
|
"epoch": 0.3377441713925646,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004992738379827559,
|
|
"loss": 5.8677,
|
|
"mean_token_accuracy": 0.13804834261536597,
|
|
"num_tokens": 7410594.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 5.922429132461548,
|
|
"epoch": 0.33816425120772947,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004992714319845101,
|
|
"loss": 5.7704,
|
|
"mean_token_accuracy": 0.15343396067619325,
|
|
"num_tokens": 7418831.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.8475088596344,
|
|
"epoch": 0.33858433102289437,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992690220134116,
|
|
"loss": 5.8188,
|
|
"mean_token_accuracy": 0.144370898604393,
|
|
"num_tokens": 7427731.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 6.030502510070801,
|
|
"epoch": 0.3390044108380592,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992666080695027,
|
|
"loss": 5.9373,
|
|
"mean_token_accuracy": 0.13586149737238884,
|
|
"num_tokens": 7436447.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 5.901221179962159,
|
|
"epoch": 0.3394244906532241,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992641901528262,
|
|
"loss": 5.8156,
|
|
"mean_token_accuracy": 0.14270046576857567,
|
|
"num_tokens": 7445352.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 5.946398782730102,
|
|
"epoch": 0.339844570468389,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992617682634252,
|
|
"loss": 5.8858,
|
|
"mean_token_accuracy": 0.1441212549805641,
|
|
"num_tokens": 7454298.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 5.920703315734864,
|
|
"epoch": 0.34026465028355385,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004992593424013424,
|
|
"loss": 5.8948,
|
|
"mean_token_accuracy": 0.13869627565145493,
|
|
"num_tokens": 7463543.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 5.9791840553283695,
|
|
"epoch": 0.34068473009871875,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992569125666209,
|
|
"loss": 5.9195,
|
|
"mean_token_accuracy": 0.14178480133414267,
|
|
"num_tokens": 7472701.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 6.054230260848999,
|
|
"epoch": 0.34110480991388364,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992544787593037,
|
|
"loss": 5.9062,
|
|
"mean_token_accuracy": 0.13785406127572059,
|
|
"num_tokens": 7481123.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 5.989615964889526,
|
|
"epoch": 0.34152488972904854,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004992520409794338,
|
|
"loss": 5.9555,
|
|
"mean_token_accuracy": 0.14264528974890708,
|
|
"num_tokens": 7490439.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 5.894261217117309,
|
|
"epoch": 0.3419449695442134,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992495992270544,
|
|
"loss": 5.8444,
|
|
"mean_token_accuracy": 0.1425054393708706,
|
|
"num_tokens": 7499326.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 5.95070858001709,
|
|
"epoch": 0.3423650493593783,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004992471535022089,
|
|
"loss": 5.8947,
|
|
"mean_token_accuracy": 0.14209673926234245,
|
|
"num_tokens": 7509407.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 5.978242111206055,
|
|
"epoch": 0.3427851291745432,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992447038049405,
|
|
"loss": 5.9368,
|
|
"mean_token_accuracy": 0.1432798534631729,
|
|
"num_tokens": 7518443.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 5.854420137405396,
|
|
"epoch": 0.343205208989708,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004992422501352927,
|
|
"loss": 5.7979,
|
|
"mean_token_accuracy": 0.15148040205240249,
|
|
"num_tokens": 7527609.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 5.958763885498047,
|
|
"epoch": 0.3436252888048729,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992397924933089,
|
|
"loss": 5.8829,
|
|
"mean_token_accuracy": 0.14002160280942916,
|
|
"num_tokens": 7536890.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 5.984218978881836,
|
|
"epoch": 0.3440453686200378,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992373308790325,
|
|
"loss": 5.8445,
|
|
"mean_token_accuracy": 0.14879057705402374,
|
|
"num_tokens": 7546509.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 5.8121418952941895,
|
|
"epoch": 0.3444654484352027,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992348652925074,
|
|
"loss": 5.8814,
|
|
"mean_token_accuracy": 0.13877593278884887,
|
|
"num_tokens": 7555336.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 5.959460878372193,
|
|
"epoch": 0.34488552825036756,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992323957337771,
|
|
"loss": 5.8217,
|
|
"mean_token_accuracy": 0.14075680449604988,
|
|
"num_tokens": 7565210.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 5.997728681564331,
|
|
"epoch": 0.34530560806553245,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992299222028855,
|
|
"loss": 5.9177,
|
|
"mean_token_accuracy": 0.14632946625351906,
|
|
"num_tokens": 7574516.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 5.837478542327881,
|
|
"epoch": 0.34572568788069735,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004992274446998761,
|
|
"loss": 5.7701,
|
|
"mean_token_accuracy": 0.14613791555166245,
|
|
"num_tokens": 7583219.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 5.990570783615112,
|
|
"epoch": 0.3461457676958622,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992249632247929,
|
|
"loss": 5.9898,
|
|
"mean_token_accuracy": 0.13541294783353805,
|
|
"num_tokens": 7592050.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 6.017976236343384,
|
|
"epoch": 0.3465658475110271,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992224777776802,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.1406927302479744,
|
|
"num_tokens": 7600718.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 5.928384780883789,
|
|
"epoch": 0.346985927326192,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004992199883585816,
|
|
"loss": 5.8623,
|
|
"mean_token_accuracy": 0.14485160112380982,
|
|
"num_tokens": 7609191.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 5.958423805236817,
|
|
"epoch": 0.34740600714135683,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004992174949675413,
|
|
"loss": 5.8819,
|
|
"mean_token_accuracy": 0.14174177944660188,
|
|
"num_tokens": 7618509.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 5.890047216415406,
|
|
"epoch": 0.34782608695652173,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004992149976046037,
|
|
"loss": 5.8117,
|
|
"mean_token_accuracy": 0.14391598626971244,
|
|
"num_tokens": 7627851.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 5.892529726028442,
|
|
"epoch": 0.3482461667716866,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992124962698128,
|
|
"loss": 5.8894,
|
|
"mean_token_accuracy": 0.13846235871315002,
|
|
"num_tokens": 7636748.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 5.952128744125366,
|
|
"epoch": 0.3486662465868515,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499209990963213,
|
|
"loss": 5.7996,
|
|
"mean_token_accuracy": 0.14363356158137322,
|
|
"num_tokens": 7645436.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 5.9340009689331055,
|
|
"epoch": 0.34908632640201637,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004992074816848487,
|
|
"loss": 5.9287,
|
|
"mean_token_accuracy": 0.13951508998870848,
|
|
"num_tokens": 7655414.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 5.832207345962525,
|
|
"epoch": 0.34950640621718126,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992049684347642,
|
|
"loss": 5.7094,
|
|
"mean_token_accuracy": 0.14780430346727372,
|
|
"num_tokens": 7664295.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 5.929846525192261,
|
|
"epoch": 0.34992648603234616,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004992024512130042,
|
|
"loss": 5.8569,
|
|
"mean_token_accuracy": 0.14193690866231917,
|
|
"num_tokens": 7673295.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 5.905185222625732,
|
|
"epoch": 0.350346565847511,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991999300196132,
|
|
"loss": 5.8475,
|
|
"mean_token_accuracy": 0.13919475451111793,
|
|
"num_tokens": 7682932.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 6.005189561843872,
|
|
"epoch": 0.3507666456626759,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004991974048546359,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.13765867426991463,
|
|
"num_tokens": 7692105.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 5.873351955413819,
|
|
"epoch": 0.3511867254778408,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499194875718117,
|
|
"loss": 5.859,
|
|
"mean_token_accuracy": 0.1459092453122139,
|
|
"num_tokens": 7701294.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 5.976405239105224,
|
|
"epoch": 0.3516068052930057,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991923426101013,
|
|
"loss": 5.8556,
|
|
"mean_token_accuracy": 0.14097452014684678,
|
|
"num_tokens": 7710964.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 5.988002777099609,
|
|
"epoch": 0.35202688510817054,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004991898055306337,
|
|
"loss": 5.9768,
|
|
"mean_token_accuracy": 0.13131897300481796,
|
|
"num_tokens": 7719938.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 5.942753410339355,
|
|
"epoch": 0.35244696492333544,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004991872644797591,
|
|
"loss": 5.8921,
|
|
"mean_token_accuracy": 0.13939437940716742,
|
|
"num_tokens": 7729129.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 5.955871152877807,
|
|
"epoch": 0.35286704473850034,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004991847194575226,
|
|
"loss": 5.8881,
|
|
"mean_token_accuracy": 0.13834249898791312,
|
|
"num_tokens": 7738506.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 6.041079711914063,
|
|
"epoch": 0.3532871245536652,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991821704639693,
|
|
"loss": 5.9968,
|
|
"mean_token_accuracy": 0.13867756947875023,
|
|
"num_tokens": 7749320.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 6.0422234535217285,
|
|
"epoch": 0.3537072043688301,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991796174991443,
|
|
"loss": 5.8516,
|
|
"mean_token_accuracy": 0.14419358000159263,
|
|
"num_tokens": 7758735.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 5.810104942321777,
|
|
"epoch": 0.354127284183995,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004991770605630927,
|
|
"loss": 5.8115,
|
|
"mean_token_accuracy": 0.14199010655283928,
|
|
"num_tokens": 7767556.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 5.862843370437622,
|
|
"epoch": 0.3545473639991598,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004991744996558599,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.14548772126436232,
|
|
"num_tokens": 7776615.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 5.955168771743774,
|
|
"epoch": 0.3549674438143247,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004991719347774913,
|
|
"loss": 5.8885,
|
|
"mean_token_accuracy": 0.14509620741009713,
|
|
"num_tokens": 7785288.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 5.897441482543945,
|
|
"epoch": 0.3553875236294896,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004991693659280324,
|
|
"loss": 5.7878,
|
|
"mean_token_accuracy": 0.1456679493188858,
|
|
"num_tokens": 7794381.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 5.895413112640381,
|
|
"epoch": 0.3558076034446545,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004991667931075284,
|
|
"loss": 5.7548,
|
|
"mean_token_accuracy": 0.14165765419602394,
|
|
"num_tokens": 7803265.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 5.8606267929077145,
|
|
"epoch": 0.35622768325981935,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991642163160252,
|
|
"loss": 5.8796,
|
|
"mean_token_accuracy": 0.13830938637256623,
|
|
"num_tokens": 7812445.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 5.941714191436768,
|
|
"epoch": 0.35664776307498425,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991616355535684,
|
|
"loss": 5.8695,
|
|
"mean_token_accuracy": 0.1441208615899086,
|
|
"num_tokens": 7822073.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 6.004122114181518,
|
|
"epoch": 0.35706784289014915,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004991590508202036,
|
|
"loss": 5.8472,
|
|
"mean_token_accuracy": 0.13856493979692458,
|
|
"num_tokens": 7831193.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 5.952021503448487,
|
|
"epoch": 0.357487922705314,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004991564621159766,
|
|
"loss": 5.8909,
|
|
"mean_token_accuracy": 0.1399833530187607,
|
|
"num_tokens": 7840311.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 5.902349615097046,
|
|
"epoch": 0.3579080025204789,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004991538694409334,
|
|
"loss": 5.8981,
|
|
"mean_token_accuracy": 0.13640205860137938,
|
|
"num_tokens": 7849622.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 5.93274884223938,
|
|
"epoch": 0.3583280823356438,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004991512727951198,
|
|
"loss": 5.8639,
|
|
"mean_token_accuracy": 0.1423584371805191,
|
|
"num_tokens": 7859494.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 6.066871976852417,
|
|
"epoch": 0.3587481621508087,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991486721785818,
|
|
"loss": 5.9611,
|
|
"mean_token_accuracy": 0.13798293545842172,
|
|
"num_tokens": 7868526.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 5.916080617904663,
|
|
"epoch": 0.3591682419659735,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991460675913655,
|
|
"loss": 5.7946,
|
|
"mean_token_accuracy": 0.1431095890700817,
|
|
"num_tokens": 7877631.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 5.9288982391357425,
|
|
"epoch": 0.3595883217811384,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000499143459033517,
|
|
"loss": 5.8525,
|
|
"mean_token_accuracy": 0.14929330348968506,
|
|
"num_tokens": 7886814.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 5.835088777542114,
|
|
"epoch": 0.3600084015963033,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004991408465050825,
|
|
"loss": 5.6819,
|
|
"mean_token_accuracy": 0.15145567432045937,
|
|
"num_tokens": 7896337.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 5.841267919540405,
|
|
"epoch": 0.36042848141146816,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004991382300061084,
|
|
"loss": 5.9429,
|
|
"mean_token_accuracy": 0.13477055355906487,
|
|
"num_tokens": 7906071.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 6.013036108016967,
|
|
"epoch": 0.36084856122663306,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004991356095366409,
|
|
"loss": 5.9236,
|
|
"mean_token_accuracy": 0.14087440073490143,
|
|
"num_tokens": 7915003.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 5.964684629440308,
|
|
"epoch": 0.36126864104179796,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991329850967266,
|
|
"loss": 5.7748,
|
|
"mean_token_accuracy": 0.14612130969762802,
|
|
"num_tokens": 7924408.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 5.857362222671509,
|
|
"epoch": 0.3616887208569628,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991303566864118,
|
|
"loss": 5.752,
|
|
"mean_token_accuracy": 0.14585833102464676,
|
|
"num_tokens": 7934717.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 5.800111103057861,
|
|
"epoch": 0.3621088006721277,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004991277243057431,
|
|
"loss": 5.8176,
|
|
"mean_token_accuracy": 0.14245440661907197,
|
|
"num_tokens": 7944278.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 5.853901958465576,
|
|
"epoch": 0.3625288804872926,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004991250879547673,
|
|
"loss": 5.8345,
|
|
"mean_token_accuracy": 0.14364267513155937,
|
|
"num_tokens": 7953344.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 5.9053857803344725,
|
|
"epoch": 0.3629489603024575,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991224476335309,
|
|
"loss": 5.8601,
|
|
"mean_token_accuracy": 0.1401130437850952,
|
|
"num_tokens": 7962869.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 5.988316392898559,
|
|
"epoch": 0.36336904011762233,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004991198033420807,
|
|
"loss": 5.8527,
|
|
"mean_token_accuracy": 0.14232899993658066,
|
|
"num_tokens": 7971981.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 5.870962715148925,
|
|
"epoch": 0.36378911993278723,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004991171550804636,
|
|
"loss": 5.8073,
|
|
"mean_token_accuracy": 0.139846058934927,
|
|
"num_tokens": 7980979.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 5.898285436630249,
|
|
"epoch": 0.36420919974795213,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004991145028487266,
|
|
"loss": 5.8963,
|
|
"mean_token_accuracy": 0.14070027470588684,
|
|
"num_tokens": 7989607.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 5.864823675155639,
|
|
"epoch": 0.36462927956311697,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991118466469165,
|
|
"loss": 5.713,
|
|
"mean_token_accuracy": 0.14677212983369828,
|
|
"num_tokens": 7998356.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 5.8904320240020756,
|
|
"epoch": 0.36504935937828187,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004991091864750805,
|
|
"loss": 5.818,
|
|
"mean_token_accuracy": 0.14362581819295883,
|
|
"num_tokens": 8007596.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 5.893006706237793,
|
|
"epoch": 0.36546943919344677,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004991065223332655,
|
|
"loss": 5.8754,
|
|
"mean_token_accuracy": 0.13881655633449555,
|
|
"num_tokens": 8016493.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 5.957713174819946,
|
|
"epoch": 0.36588951900861166,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991038542215191,
|
|
"loss": 5.8451,
|
|
"mean_token_accuracy": 0.1374589078128338,
|
|
"num_tokens": 8025867.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 5.831826066970825,
|
|
"epoch": 0.3663095988237765,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004991011821398882,
|
|
"loss": 5.8861,
|
|
"mean_token_accuracy": 0.1465972438454628,
|
|
"num_tokens": 8036251.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 6.003261423110962,
|
|
"epoch": 0.3667296786389414,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990985060884202,
|
|
"loss": 5.8444,
|
|
"mean_token_accuracy": 0.1452535480260849,
|
|
"num_tokens": 8045647.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 5.943668365478516,
|
|
"epoch": 0.3671497584541063,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004990958260671627,
|
|
"loss": 5.8987,
|
|
"mean_token_accuracy": 0.13597789257764817,
|
|
"num_tokens": 8056025.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 5.898333263397217,
|
|
"epoch": 0.36756983826927114,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004990931420761629,
|
|
"loss": 5.8364,
|
|
"mean_token_accuracy": 0.14677493423223495,
|
|
"num_tokens": 8065029.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 5.953028678894043,
|
|
"epoch": 0.36798991808443604,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004990904541154685,
|
|
"loss": 5.7841,
|
|
"mean_token_accuracy": 0.15241612046957015,
|
|
"num_tokens": 8073249.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 5.914327716827392,
|
|
"epoch": 0.36840999789960094,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004990877621851271,
|
|
"loss": 5.9274,
|
|
"mean_token_accuracy": 0.13789283782243728,
|
|
"num_tokens": 8082039.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 5.818746089935303,
|
|
"epoch": 0.3688300777147658,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004990850662851863,
|
|
"loss": 5.7546,
|
|
"mean_token_accuracy": 0.14923306405544282,
|
|
"num_tokens": 8090011.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 5.97280101776123,
|
|
"epoch": 0.3692501575299307,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990823664156941,
|
|
"loss": 5.8789,
|
|
"mean_token_accuracy": 0.1489357531070709,
|
|
"num_tokens": 8099934.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 5.970620107650757,
|
|
"epoch": 0.3696702373450956,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004990796625766981,
|
|
"loss": 5.8822,
|
|
"mean_token_accuracy": 0.13866196647286416,
|
|
"num_tokens": 8108969.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 5.857716226577759,
|
|
"epoch": 0.3700903171602605,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004990769547682462,
|
|
"loss": 5.798,
|
|
"mean_token_accuracy": 0.14401047080755233,
|
|
"num_tokens": 8117372.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 6.015813732147217,
|
|
"epoch": 0.3705103969754253,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004990742429903866,
|
|
"loss": 5.9812,
|
|
"mean_token_accuracy": 0.13605612963438035,
|
|
"num_tokens": 8127108.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 6.0110640048980715,
|
|
"epoch": 0.3709304767905902,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000499071527243167,
|
|
"loss": 5.9774,
|
|
"mean_token_accuracy": 0.13931988626718522,
|
|
"num_tokens": 8137392.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 5.916806697845459,
|
|
"epoch": 0.3713505566057551,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004990688075266357,
|
|
"loss": 5.8172,
|
|
"mean_token_accuracy": 0.14630230888724327,
|
|
"num_tokens": 8146257.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 5.90497236251831,
|
|
"epoch": 0.37177063642091995,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004990660838408409,
|
|
"loss": 5.7894,
|
|
"mean_token_accuracy": 0.14007715433835982,
|
|
"num_tokens": 8154952.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 5.948085355758667,
|
|
"epoch": 0.37219071623608485,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004990633561858308,
|
|
"loss": 5.8263,
|
|
"mean_token_accuracy": 0.14142653867602348,
|
|
"num_tokens": 8164365.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 5.9057210922241214,
|
|
"epoch": 0.37261079605124975,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990606245616537,
|
|
"loss": 5.8405,
|
|
"mean_token_accuracy": 0.13960912972688674,
|
|
"num_tokens": 8172614.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 6.0053239345550535,
|
|
"epoch": 0.37303087586641465,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990578889683579,
|
|
"loss": 5.8993,
|
|
"mean_token_accuracy": 0.13672763109207153,
|
|
"num_tokens": 8182445.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 5.912483501434326,
|
|
"epoch": 0.3734509556815795,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990551494059921,
|
|
"loss": 5.7912,
|
|
"mean_token_accuracy": 0.14882408380508422,
|
|
"num_tokens": 8191871.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 5.91331787109375,
|
|
"epoch": 0.3738710354967444,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004990524058746047,
|
|
"loss": 5.9292,
|
|
"mean_token_accuracy": 0.14731585383415222,
|
|
"num_tokens": 8200658.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 5.922462463378906,
|
|
"epoch": 0.3742911153119093,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004990496583742443,
|
|
"loss": 5.8609,
|
|
"mean_token_accuracy": 0.13896840661764145,
|
|
"num_tokens": 8209776.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 5.8580132007598875,
|
|
"epoch": 0.3747111951270741,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004990469069049596,
|
|
"loss": 5.7933,
|
|
"mean_token_accuracy": 0.14876351952552797,
|
|
"num_tokens": 8219401.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 5.9017116069793705,
|
|
"epoch": 0.375131274942239,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004990441514667993,
|
|
"loss": 5.8399,
|
|
"mean_token_accuracy": 0.1457892268896103,
|
|
"num_tokens": 8228762.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 5.960052967071533,
|
|
"epoch": 0.3755513547574039,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990413920598121,
|
|
"loss": 5.8364,
|
|
"mean_token_accuracy": 0.1444413885474205,
|
|
"num_tokens": 8236612.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 5.957969760894775,
|
|
"epoch": 0.37597143457256876,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004990386286840471,
|
|
"loss": 5.8452,
|
|
"mean_token_accuracy": 0.14290711134672165,
|
|
"num_tokens": 8245043.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 6.0023870944976805,
|
|
"epoch": 0.37639151438773366,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990358613395532,
|
|
"loss": 5.9381,
|
|
"mean_token_accuracy": 0.13609616905450822,
|
|
"num_tokens": 8255270.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 5.976658725738526,
|
|
"epoch": 0.37681159420289856,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004990330900263792,
|
|
"loss": 5.896,
|
|
"mean_token_accuracy": 0.13675653785467148,
|
|
"num_tokens": 8264761.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 5.991942405700684,
|
|
"epoch": 0.37723167401806346,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990303147445745,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.14412947744131088,
|
|
"num_tokens": 8274308.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 5.831737422943116,
|
|
"epoch": 0.3776517538332283,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990275354941881,
|
|
"loss": 5.751,
|
|
"mean_token_accuracy": 0.15253113806247712,
|
|
"num_tokens": 8283323.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 5.965500402450561,
|
|
"epoch": 0.3780718336483932,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990247522752694,
|
|
"loss": 6.0719,
|
|
"mean_token_accuracy": 0.12804851979017257,
|
|
"num_tokens": 8293452.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 5.9973039627075195,
|
|
"epoch": 0.3784919134635581,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990219650878674,
|
|
"loss": 5.7459,
|
|
"mean_token_accuracy": 0.14813876897096634,
|
|
"num_tokens": 8302941.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 5.840318632125855,
|
|
"epoch": 0.37891199327872294,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004990191739320318,
|
|
"loss": 5.7706,
|
|
"mean_token_accuracy": 0.15119873285293578,
|
|
"num_tokens": 8311811.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 5.808368587493897,
|
|
"epoch": 0.37933207309388783,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004990163788078117,
|
|
"loss": 5.6889,
|
|
"mean_token_accuracy": 0.1518329106271267,
|
|
"num_tokens": 8321130.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 5.834763097763061,
|
|
"epoch": 0.37975215290905273,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990135797152569,
|
|
"loss": 5.7997,
|
|
"mean_token_accuracy": 0.14402930140495301,
|
|
"num_tokens": 8330233.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 5.881337881088257,
|
|
"epoch": 0.3801722327242176,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004990107766544169,
|
|
"loss": 5.7852,
|
|
"mean_token_accuracy": 0.144415046274662,
|
|
"num_tokens": 8338585.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 5.83257737159729,
|
|
"epoch": 0.38059231253938247,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990079696253413,
|
|
"loss": 5.8118,
|
|
"mean_token_accuracy": 0.14888912737369536,
|
|
"num_tokens": 8346618.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 5.908400917053223,
|
|
"epoch": 0.38101239235454737,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990051586280799,
|
|
"loss": 5.7942,
|
|
"mean_token_accuracy": 0.14552049711346626,
|
|
"num_tokens": 8356273.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 5.918098402023316,
|
|
"epoch": 0.38143247216971227,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004990023436626824,
|
|
"loss": 5.7951,
|
|
"mean_token_accuracy": 0.14602155163884162,
|
|
"num_tokens": 8366668.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 5.982459354400635,
|
|
"epoch": 0.3818525519848771,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004989995247291988,
|
|
"loss": 5.9163,
|
|
"mean_token_accuracy": 0.14120357036590575,
|
|
"num_tokens": 8375610.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 5.895563316345215,
|
|
"epoch": 0.382272631800042,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004989967018276789,
|
|
"loss": 5.774,
|
|
"mean_token_accuracy": 0.15064741671085358,
|
|
"num_tokens": 8384455.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 5.79692234992981,
|
|
"epoch": 0.3826927116152069,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004989938749581727,
|
|
"loss": 5.8123,
|
|
"mean_token_accuracy": 0.14297219812870027,
|
|
"num_tokens": 8393868.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 5.923454284667969,
|
|
"epoch": 0.38311279143037175,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004989910441207305,
|
|
"loss": 5.8328,
|
|
"mean_token_accuracy": 0.1404195971786976,
|
|
"num_tokens": 8402916.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 5.898684453964234,
|
|
"epoch": 0.38353287124553664,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004989882093154023,
|
|
"loss": 5.7638,
|
|
"mean_token_accuracy": 0.14875229001045226,
|
|
"num_tokens": 8411649.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 5.880671072006225,
|
|
"epoch": 0.38395295106070154,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004989853705422381,
|
|
"loss": 5.8801,
|
|
"mean_token_accuracy": 0.13631365299224854,
|
|
"num_tokens": 8420393.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 5.883023405075074,
|
|
"epoch": 0.38437303087586644,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004989825278012886,
|
|
"loss": 5.7743,
|
|
"mean_token_accuracy": 0.14661871045827865,
|
|
"num_tokens": 8429404.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 5.882754182815551,
|
|
"epoch": 0.3847931106910313,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000498979681092604,
|
|
"loss": 5.8106,
|
|
"mean_token_accuracy": 0.14257726520299913,
|
|
"num_tokens": 8438299.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 5.837142848968506,
|
|
"epoch": 0.3852131905061962,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004989768304162345,
|
|
"loss": 5.7554,
|
|
"mean_token_accuracy": 0.14974153488874437,
|
|
"num_tokens": 8447392.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 5.9916746616363525,
|
|
"epoch": 0.3856332703213611,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004989739757722308,
|
|
"loss": 5.8625,
|
|
"mean_token_accuracy": 0.13722902536392212,
|
|
"num_tokens": 8456361.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 5.905898475646973,
|
|
"epoch": 0.3860533501365259,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004989711171606436,
|
|
"loss": 5.7858,
|
|
"mean_token_accuracy": 0.14541147351264955,
|
|
"num_tokens": 8465548.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 5.921667671203613,
|
|
"epoch": 0.3864734299516908,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004989682545815232,
|
|
"loss": 5.8109,
|
|
"mean_token_accuracy": 0.1411545142531395,
|
|
"num_tokens": 8474454.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 5.837777233123779,
|
|
"epoch": 0.3868935097668557,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004989653880349207,
|
|
"loss": 5.7277,
|
|
"mean_token_accuracy": 0.14593051224946976,
|
|
"num_tokens": 8482694.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 5.864150905609131,
|
|
"epoch": 0.38731358958202056,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004989625175208864,
|
|
"loss": 5.8308,
|
|
"mean_token_accuracy": 0.14381687343120575,
|
|
"num_tokens": 8491162.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 5.819499731063843,
|
|
"epoch": 0.38773366939718545,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004989596430394717,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.1608663707971573,
|
|
"num_tokens": 8500716.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 5.8265057563781735,
|
|
"epoch": 0.38815374921235035,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000498956764590727,
|
|
"loss": 5.7384,
|
|
"mean_token_accuracy": 0.14157627001404763,
|
|
"num_tokens": 8508871.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 5.979275703430176,
|
|
"epoch": 0.38857382902751525,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004989538821747037,
|
|
"loss": 5.9482,
|
|
"mean_token_accuracy": 0.1420240134000778,
|
|
"num_tokens": 8518450.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 5.9397321224212645,
|
|
"epoch": 0.3889939088426801,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004989509957914527,
|
|
"loss": 5.8528,
|
|
"mean_token_accuracy": 0.1380702592432499,
|
|
"num_tokens": 8528238.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 5.852479600906372,
|
|
"epoch": 0.389413988657845,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989481054410251,
|
|
"loss": 5.7431,
|
|
"mean_token_accuracy": 0.14131385385990142,
|
|
"num_tokens": 8537587.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 5.9004875183105465,
|
|
"epoch": 0.3898340684730099,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004989452111234721,
|
|
"loss": 5.854,
|
|
"mean_token_accuracy": 0.14011769965291024,
|
|
"num_tokens": 8547703.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 5.860686302185059,
|
|
"epoch": 0.39025414828817473,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000498942312838845,
|
|
"loss": 5.7958,
|
|
"mean_token_accuracy": 0.14458008110523224,
|
|
"num_tokens": 8557001.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 5.8804422378540036,
|
|
"epoch": 0.3906742281033396,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989394105871952,
|
|
"loss": 5.692,
|
|
"mean_token_accuracy": 0.15489965081214904,
|
|
"num_tokens": 8565638.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 5.966875410079956,
|
|
"epoch": 0.3910943079185045,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.000498936504368574,
|
|
"loss": 5.866,
|
|
"mean_token_accuracy": 0.14225341156125068,
|
|
"num_tokens": 8574428.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 5.759807777404785,
|
|
"epoch": 0.3915143877336694,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004989335941830329,
|
|
"loss": 5.816,
|
|
"mean_token_accuracy": 0.14541401863098144,
|
|
"num_tokens": 8583157.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 5.834117889404297,
|
|
"epoch": 0.39193446754883426,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004989306800306236,
|
|
"loss": 5.7781,
|
|
"mean_token_accuracy": 0.14344885647296907,
|
|
"num_tokens": 8592382.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 5.8663976192474365,
|
|
"epoch": 0.39235454736399916,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004989277619113975,
|
|
"loss": 5.7604,
|
|
"mean_token_accuracy": 0.15097892433404922,
|
|
"num_tokens": 8601058.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 5.956953763961792,
|
|
"epoch": 0.39277462717916406,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004989248398254065,
|
|
"loss": 5.8591,
|
|
"mean_token_accuracy": 0.1437965750694275,
|
|
"num_tokens": 8609479.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 5.92048830986023,
|
|
"epoch": 0.3931947069943289,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004989219137727021,
|
|
"loss": 5.8058,
|
|
"mean_token_accuracy": 0.14700522273778915,
|
|
"num_tokens": 8618860.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 5.8700724124908445,
|
|
"epoch": 0.3936147868094938,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989189837533365,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14664537757635115,
|
|
"num_tokens": 8627462.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 5.981065273284912,
|
|
"epoch": 0.3940348666246587,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989160497673613,
|
|
"loss": 5.9387,
|
|
"mean_token_accuracy": 0.13696896955370902,
|
|
"num_tokens": 8637569.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 5.918409252166748,
|
|
"epoch": 0.39445494643982354,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004989131118148286,
|
|
"loss": 5.7353,
|
|
"mean_token_accuracy": 0.14450196400284768,
|
|
"num_tokens": 8645440.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 5.836373901367187,
|
|
"epoch": 0.39487502625498844,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004989101698957904,
|
|
"loss": 5.9023,
|
|
"mean_token_accuracy": 0.14248489439487458,
|
|
"num_tokens": 8655077.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 5.941747808456421,
|
|
"epoch": 0.39529510607015333,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004989072240102988,
|
|
"loss": 5.8142,
|
|
"mean_token_accuracy": 0.14740578532218934,
|
|
"num_tokens": 8663126.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 5.973061513900757,
|
|
"epoch": 0.39571518588531823,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989042741584061,
|
|
"loss": 5.7952,
|
|
"mean_token_accuracy": 0.14338430240750313,
|
|
"num_tokens": 8672386.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 5.720412731170654,
|
|
"epoch": 0.3961352657004831,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004989013203401645,
|
|
"loss": 5.7388,
|
|
"mean_token_accuracy": 0.1476906917989254,
|
|
"num_tokens": 8681930.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 5.883289384841919,
|
|
"epoch": 0.396555345515648,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004988983625556264,
|
|
"loss": 5.7919,
|
|
"mean_token_accuracy": 0.14368573501706122,
|
|
"num_tokens": 8690993.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 5.890859937667846,
|
|
"epoch": 0.39697542533081287,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004988954008048438,
|
|
"loss": 5.7809,
|
|
"mean_token_accuracy": 0.14698703289031984,
|
|
"num_tokens": 8699497.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 6.004160451889038,
|
|
"epoch": 0.3973955051459777,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988924350878697,
|
|
"loss": 5.986,
|
|
"mean_token_accuracy": 0.1333600528538227,
|
|
"num_tokens": 8709274.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 5.947705507278442,
|
|
"epoch": 0.3978155849611426,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988894654047563,
|
|
"loss": 5.8378,
|
|
"mean_token_accuracy": 0.13920372053980828,
|
|
"num_tokens": 8718158.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 5.82051944732666,
|
|
"epoch": 0.3982356647763075,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004988864917555562,
|
|
"loss": 5.7239,
|
|
"mean_token_accuracy": 0.14391618072986603,
|
|
"num_tokens": 8727459.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 5.940366458892822,
|
|
"epoch": 0.3986557445914724,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004988835141403224,
|
|
"loss": 5.8538,
|
|
"mean_token_accuracy": 0.14721113741397857,
|
|
"num_tokens": 8737614.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 5.819404935836792,
|
|
"epoch": 0.39907582440663725,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004988805325591073,
|
|
"loss": 5.6874,
|
|
"mean_token_accuracy": 0.14453882575035096,
|
|
"num_tokens": 8746799.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 5.84985032081604,
|
|
"epoch": 0.39949590422180214,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004988775470119639,
|
|
"loss": 5.8628,
|
|
"mean_token_accuracy": 0.14014028683304786,
|
|
"num_tokens": 8756555.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 5.867576169967651,
|
|
"epoch": 0.39991598403696704,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004988745574989451,
|
|
"loss": 5.8851,
|
|
"mean_token_accuracy": 0.1480340264737606,
|
|
"num_tokens": 8765849.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 6.094280099868774,
|
|
"epoch": 0.4003360638521319,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988715640201036,
|
|
"loss": 5.954,
|
|
"mean_token_accuracy": 0.13378295823931693,
|
|
"num_tokens": 8775713.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 5.884061288833618,
|
|
"epoch": 0.4007561436672968,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004988685665754928,
|
|
"loss": 5.7775,
|
|
"mean_token_accuracy": 0.14666623920202254,
|
|
"num_tokens": 8784717.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 5.8814960479736325,
|
|
"epoch": 0.4011762234824617,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988655651651656,
|
|
"loss": 5.7911,
|
|
"mean_token_accuracy": 0.14413672238588332,
|
|
"num_tokens": 8794388.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 5.836367225646972,
|
|
"epoch": 0.4015963032976265,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004988625597891751,
|
|
"loss": 5.8093,
|
|
"mean_token_accuracy": 0.14697518199682236,
|
|
"num_tokens": 8802436.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 5.912711811065674,
|
|
"epoch": 0.4020163831127914,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988595504475746,
|
|
"loss": 5.7636,
|
|
"mean_token_accuracy": 0.1465681880712509,
|
|
"num_tokens": 8811184.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 5.9507347583770756,
|
|
"epoch": 0.4024364629279563,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004988565371404175,
|
|
"loss": 5.8423,
|
|
"mean_token_accuracy": 0.14505148231983184,
|
|
"num_tokens": 8820525.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 5.830136728286743,
|
|
"epoch": 0.4028565427431212,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004988535198677571,
|
|
"loss": 5.7011,
|
|
"mean_token_accuracy": 0.153212571144104,
|
|
"num_tokens": 8828928.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 5.90922179222107,
|
|
"epoch": 0.40327662255828606,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004988504986296469,
|
|
"loss": 5.907,
|
|
"mean_token_accuracy": 0.1371180810034275,
|
|
"num_tokens": 8838615.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 5.942590522766113,
|
|
"epoch": 0.40369670237345096,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004988474734261404,
|
|
"loss": 5.9047,
|
|
"mean_token_accuracy": 0.13416762948036193,
|
|
"num_tokens": 8848709.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 5.973557710647583,
|
|
"epoch": 0.40411678218861585,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004988444442572911,
|
|
"loss": 5.8479,
|
|
"mean_token_accuracy": 0.1310623273253441,
|
|
"num_tokens": 8858277.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 5.891769552230835,
|
|
"epoch": 0.4045368620037807,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988414111231528,
|
|
"loss": 5.8161,
|
|
"mean_token_accuracy": 0.14670211374759673,
|
|
"num_tokens": 8868436.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 5.925015592575074,
|
|
"epoch": 0.4049569418189456,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000498838374023779,
|
|
"loss": 5.7888,
|
|
"mean_token_accuracy": 0.13960602283477783,
|
|
"num_tokens": 8877740.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 5.908780908584594,
|
|
"epoch": 0.4053770216341105,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004988353329592239,
|
|
"loss": 5.7761,
|
|
"mean_token_accuracy": 0.14475535228848457,
|
|
"num_tokens": 8887408.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 5.893645095825195,
|
|
"epoch": 0.4057971014492754,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004988322879295409,
|
|
"loss": 5.929,
|
|
"mean_token_accuracy": 0.13994188457727433,
|
|
"num_tokens": 8897141.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 5.865872049331665,
|
|
"epoch": 0.40621718126444023,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004988292389347844,
|
|
"loss": 5.7105,
|
|
"mean_token_accuracy": 0.15417256727814674,
|
|
"num_tokens": 8905747.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 5.965148115158081,
|
|
"epoch": 0.40663726107960513,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000498826185975008,
|
|
"loss": 5.8673,
|
|
"mean_token_accuracy": 0.14333693608641623,
|
|
"num_tokens": 8914926.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 5.872843933105469,
|
|
"epoch": 0.40705734089477,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004988231290502662,
|
|
"loss": 5.8806,
|
|
"mean_token_accuracy": 0.14108002185821533,
|
|
"num_tokens": 8923956.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 5.925130224227905,
|
|
"epoch": 0.40747742070993487,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004988200681606127,
|
|
"loss": 5.7542,
|
|
"mean_token_accuracy": 0.1388688787817955,
|
|
"num_tokens": 8932654.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 5.9108325958251955,
|
|
"epoch": 0.40789750052509977,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000498817003306102,
|
|
"loss": 5.7364,
|
|
"mean_token_accuracy": 0.1501722030341625,
|
|
"num_tokens": 8941716.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 5.846788120269776,
|
|
"epoch": 0.40831758034026466,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004988139344867884,
|
|
"loss": 5.8122,
|
|
"mean_token_accuracy": 0.14448407515883446,
|
|
"num_tokens": 8950377.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 5.848782968521118,
|
|
"epoch": 0.4087376601554295,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004988108617027261,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.14761658608913422,
|
|
"num_tokens": 8959857.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 5.834667444229126,
|
|
"epoch": 0.4091577399705944,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988077849539698,
|
|
"loss": 5.7183,
|
|
"mean_token_accuracy": 0.1485067203640938,
|
|
"num_tokens": 8968272.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 5.923686075210571,
|
|
"epoch": 0.4095778197857593,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004988047042405736,
|
|
"loss": 5.7969,
|
|
"mean_token_accuracy": 0.14762237221002578,
|
|
"num_tokens": 8977445.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 5.964400959014893,
|
|
"epoch": 0.4099978996009242,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988016195625924,
|
|
"loss": 5.8644,
|
|
"mean_token_accuracy": 0.13916484266519547,
|
|
"num_tokens": 8987315.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 5.8641290187835695,
|
|
"epoch": 0.41041797941608904,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004987985309200807,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.1417423367500305,
|
|
"num_tokens": 8998119.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 5.7576408863067625,
|
|
"epoch": 0.41083805923125394,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004987954383130934,
|
|
"loss": 5.7477,
|
|
"mean_token_accuracy": 0.1535985603928566,
|
|
"num_tokens": 9007167.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 5.866803312301636,
|
|
"epoch": 0.41125813904641884,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000498792341741685,
|
|
"loss": 5.8006,
|
|
"mean_token_accuracy": 0.13756236732006072,
|
|
"num_tokens": 9016690.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 5.996728754043579,
|
|
"epoch": 0.4116782188615837,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004987892412059106,
|
|
"loss": 5.8881,
|
|
"mean_token_accuracy": 0.1421562008559704,
|
|
"num_tokens": 9026117.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 5.823458862304688,
|
|
"epoch": 0.4120982986767486,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004987861367058251,
|
|
"loss": 5.7583,
|
|
"mean_token_accuracy": 0.1456121936440468,
|
|
"num_tokens": 9035754.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 5.91724009513855,
|
|
"epoch": 0.4125183784919135,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004987830282414833,
|
|
"loss": 5.7614,
|
|
"mean_token_accuracy": 0.15125717446208,
|
|
"num_tokens": 9045453.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 5.882875871658325,
|
|
"epoch": 0.41293845830707837,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004987799158129404,
|
|
"loss": 5.8736,
|
|
"mean_token_accuracy": 0.14322762489318847,
|
|
"num_tokens": 9056045.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 5.822021722793579,
|
|
"epoch": 0.4133585381222432,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004987767994202516,
|
|
"loss": 5.7652,
|
|
"mean_token_accuracy": 0.14132684618234634,
|
|
"num_tokens": 9065728.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 5.874257898330688,
|
|
"epoch": 0.4137786179374081,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987736790634719,
|
|
"loss": 5.7867,
|
|
"mean_token_accuracy": 0.14259056150913238,
|
|
"num_tokens": 9075522.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 5.868446731567383,
|
|
"epoch": 0.414198697752573,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004987705547426568,
|
|
"loss": 5.7633,
|
|
"mean_token_accuracy": 0.14451717659831048,
|
|
"num_tokens": 9084412.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 5.86938099861145,
|
|
"epoch": 0.41461877756773785,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004987674264578615,
|
|
"loss": 5.8382,
|
|
"mean_token_accuracy": 0.1410167396068573,
|
|
"num_tokens": 9094289.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 5.902176809310913,
|
|
"epoch": 0.41503885738290275,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004987642942091414,
|
|
"loss": 5.7413,
|
|
"mean_token_accuracy": 0.14698186367750168,
|
|
"num_tokens": 9103124.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 5.898521900177002,
|
|
"epoch": 0.41545893719806765,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004987611579965523,
|
|
"loss": 5.6945,
|
|
"mean_token_accuracy": 0.1453884869813919,
|
|
"num_tokens": 9112794.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 5.867249441146851,
|
|
"epoch": 0.4158790170132325,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004987580178201492,
|
|
"loss": 5.8508,
|
|
"mean_token_accuracy": 0.15215325057506562,
|
|
"num_tokens": 9122718.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 5.877714014053344,
|
|
"epoch": 0.4162990968283974,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004987548736799882,
|
|
"loss": 5.8851,
|
|
"mean_token_accuracy": 0.13938734084367752,
|
|
"num_tokens": 9131855.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 5.866538429260254,
|
|
"epoch": 0.4167191766435623,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004987517255761248,
|
|
"loss": 5.7248,
|
|
"mean_token_accuracy": 0.14940666258335114,
|
|
"num_tokens": 9141102.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 5.806973934173584,
|
|
"epoch": 0.4171392564587272,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004987485735086148,
|
|
"loss": 5.8043,
|
|
"mean_token_accuracy": 0.14497776329517365,
|
|
"num_tokens": 9150552.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 5.940771627426147,
|
|
"epoch": 0.417559336273892,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000498745417477514,
|
|
"loss": 5.7927,
|
|
"mean_token_accuracy": 0.14460284858942032,
|
|
"num_tokens": 9160105.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 5.864925670623779,
|
|
"epoch": 0.4179794160890569,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004987422574828784,
|
|
"loss": 5.7728,
|
|
"mean_token_accuracy": 0.14519683197140693,
|
|
"num_tokens": 9169367.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 5.846901607513428,
|
|
"epoch": 0.4183994959042218,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004987390935247639,
|
|
"loss": 5.6568,
|
|
"mean_token_accuracy": 0.15195999220013617,
|
|
"num_tokens": 9177872.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 5.892278623580933,
|
|
"epoch": 0.41881957571938666,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004987359256032265,
|
|
"loss": 5.8728,
|
|
"mean_token_accuracy": 0.1392049200832844,
|
|
"num_tokens": 9187879.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 5.834523773193359,
|
|
"epoch": 0.41923965553455156,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004987327537183225,
|
|
"loss": 5.7865,
|
|
"mean_token_accuracy": 0.14359964653849602,
|
|
"num_tokens": 9198281.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 5.898417997360229,
|
|
"epoch": 0.41965973534971646,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004987295778701078,
|
|
"loss": 5.7784,
|
|
"mean_token_accuracy": 0.1480983316898346,
|
|
"num_tokens": 9207670.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 5.903277587890625,
|
|
"epoch": 0.42007981516488135,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000498726398058639,
|
|
"loss": 5.7986,
|
|
"mean_token_accuracy": 0.1475730612874031,
|
|
"num_tokens": 9216995.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 5.920054292678833,
|
|
"epoch": 0.4204998949800462,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004987232142839723,
|
|
"loss": 5.8785,
|
|
"mean_token_accuracy": 0.13731264397501947,
|
|
"num_tokens": 9227330.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 5.861970615386963,
|
|
"epoch": 0.4209199747952111,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987200265461638,
|
|
"loss": 5.7885,
|
|
"mean_token_accuracy": 0.15134866386651993,
|
|
"num_tokens": 9236666.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 5.934697484970092,
|
|
"epoch": 0.421340054610376,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987168348452705,
|
|
"loss": 5.7864,
|
|
"mean_token_accuracy": 0.144124399125576,
|
|
"num_tokens": 9246388.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 5.8499044418334964,
|
|
"epoch": 0.42176013442554083,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004987136391813485,
|
|
"loss": 5.7404,
|
|
"mean_token_accuracy": 0.15391666144132615,
|
|
"num_tokens": 9255239.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 5.773643350601196,
|
|
"epoch": 0.42218021424070573,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987104395544547,
|
|
"loss": 5.7252,
|
|
"mean_token_accuracy": 0.14332954734563827,
|
|
"num_tokens": 9264468.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 5.859898376464844,
|
|
"epoch": 0.42260029405587063,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987072359646455,
|
|
"loss": 5.7927,
|
|
"mean_token_accuracy": 0.15058641731739045,
|
|
"num_tokens": 9274140.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 5.917972660064697,
|
|
"epoch": 0.42302037387103547,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987040284119778,
|
|
"loss": 5.7586,
|
|
"mean_token_accuracy": 0.1428128033876419,
|
|
"num_tokens": 9283539.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 5.781129264831543,
|
|
"epoch": 0.42344045368620037,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004987008168965087,
|
|
"loss": 5.7728,
|
|
"mean_token_accuracy": 0.14332580342888832,
|
|
"num_tokens": 9292664.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 5.946068525314331,
|
|
"epoch": 0.42386053350136527,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004986976014182946,
|
|
"loss": 5.8657,
|
|
"mean_token_accuracy": 0.14432715028524398,
|
|
"num_tokens": 9302814.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 5.980961608886719,
|
|
"epoch": 0.42428061331653016,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986943819773927,
|
|
"loss": 5.858,
|
|
"mean_token_accuracy": 0.14330325573682784,
|
|
"num_tokens": 9312654.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 5.9505743980407715,
|
|
"epoch": 0.424700693131695,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00049869115857386,
|
|
"loss": 5.8737,
|
|
"mean_token_accuracy": 0.13669376373291015,
|
|
"num_tokens": 9322271.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 5.951388359069824,
|
|
"epoch": 0.4251207729468599,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004986879312077536,
|
|
"loss": 5.8193,
|
|
"mean_token_accuracy": 0.14102528542280196,
|
|
"num_tokens": 9331341.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 5.834031820297241,
|
|
"epoch": 0.4255408527620248,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004986846998791308,
|
|
"loss": 5.7561,
|
|
"mean_token_accuracy": 0.1436670668423176,
|
|
"num_tokens": 9339863.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 5.811039066314697,
|
|
"epoch": 0.42596093257718964,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986814645880485,
|
|
"loss": 5.7236,
|
|
"mean_token_accuracy": 0.14669884666800498,
|
|
"num_tokens": 9349488.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 5.830924463272095,
|
|
"epoch": 0.42638101239235454,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986782253345645,
|
|
"loss": 5.7333,
|
|
"mean_token_accuracy": 0.14323149994015694,
|
|
"num_tokens": 9357977.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 5.839050388336181,
|
|
"epoch": 0.42680109220751944,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004986749821187358,
|
|
"loss": 5.8394,
|
|
"mean_token_accuracy": 0.14253177791833876,
|
|
"num_tokens": 9367449.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 5.939317226409912,
|
|
"epoch": 0.42722117202268434,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00049867173494062,
|
|
"loss": 5.8681,
|
|
"mean_token_accuracy": 0.14768607616424562,
|
|
"num_tokens": 9377070.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 5.813904285430908,
|
|
"epoch": 0.4276412518378492,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986684838002744,
|
|
"loss": 5.6526,
|
|
"mean_token_accuracy": 0.14204483926296235,
|
|
"num_tokens": 9385881.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 5.823819637298584,
|
|
"epoch": 0.4280613316530141,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004986652286977569,
|
|
"loss": 5.7905,
|
|
"mean_token_accuracy": 0.14255458265542983,
|
|
"num_tokens": 9395159.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 5.877113628387451,
|
|
"epoch": 0.428481411468179,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004986619696331252,
|
|
"loss": 5.7486,
|
|
"mean_token_accuracy": 0.14601895585656166,
|
|
"num_tokens": 9404590.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 5.856746768951416,
|
|
"epoch": 0.4289014912833438,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004986587066064367,
|
|
"loss": 5.7708,
|
|
"mean_token_accuracy": 0.1473971426486969,
|
|
"num_tokens": 9414452.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 5.868241453170777,
|
|
"epoch": 0.4293215710985087,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004986554396177494,
|
|
"loss": 5.894,
|
|
"mean_token_accuracy": 0.1396991342306137,
|
|
"num_tokens": 9424004.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 5.933579587936402,
|
|
"epoch": 0.4297416509136736,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004986521686671212,
|
|
"loss": 5.7713,
|
|
"mean_token_accuracy": 0.1551983118057251,
|
|
"num_tokens": 9433487.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 5.856822824478149,
|
|
"epoch": 0.43016173072883845,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00049864889375461,
|
|
"loss": 5.8359,
|
|
"mean_token_accuracy": 0.13958305045962333,
|
|
"num_tokens": 9442742.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 5.880755043029785,
|
|
"epoch": 0.43058181054400335,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004986456148802738,
|
|
"loss": 5.8957,
|
|
"mean_token_accuracy": 0.14121335968375207,
|
|
"num_tokens": 9452550.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 6.039326620101929,
|
|
"epoch": 0.43100189035916825,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004986423320441707,
|
|
"loss": 5.8546,
|
|
"mean_token_accuracy": 0.13762183710932732,
|
|
"num_tokens": 9461920.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 5.904562616348267,
|
|
"epoch": 0.43142197017433315,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004986390452463588,
|
|
"loss": 5.7682,
|
|
"mean_token_accuracy": 0.14276604056358339,
|
|
"num_tokens": 9470817.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 5.710296773910523,
|
|
"epoch": 0.431842049989498,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986357544868964,
|
|
"loss": 5.7258,
|
|
"mean_token_accuracy": 0.15019231289625168,
|
|
"num_tokens": 9479936.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 5.892205905914307,
|
|
"epoch": 0.4322621298046629,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004986324597658418,
|
|
"loss": 5.7581,
|
|
"mean_token_accuracy": 0.15196042209863664,
|
|
"num_tokens": 9489818.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 5.733763742446899,
|
|
"epoch": 0.4326822096198278,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004986291610832533,
|
|
"loss": 5.7455,
|
|
"mean_token_accuracy": 0.14281522929668428,
|
|
"num_tokens": 9499688.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 5.960237169265747,
|
|
"epoch": 0.4331022894349926,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004986258584391892,
|
|
"loss": 5.8063,
|
|
"mean_token_accuracy": 0.14208860471844673,
|
|
"num_tokens": 9509581.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 6.0035475730896,
|
|
"epoch": 0.4335223692501575,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004986225518337084,
|
|
"loss": 5.89,
|
|
"mean_token_accuracy": 0.143732051551342,
|
|
"num_tokens": 9518556.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 5.81024432182312,
|
|
"epoch": 0.4339424490653224,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004986192412668692,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.14318298548460007,
|
|
"num_tokens": 9527612.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 5.847835922241211,
|
|
"epoch": 0.4343625288804873,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004986159267387302,
|
|
"loss": 5.6856,
|
|
"mean_token_accuracy": 0.1560652643442154,
|
|
"num_tokens": 9535882.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 5.862061595916748,
|
|
"epoch": 0.43478260869565216,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004986126082493502,
|
|
"loss": 5.7914,
|
|
"mean_token_accuracy": 0.14822041988372803,
|
|
"num_tokens": 9544799.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 5.794046545028687,
|
|
"epoch": 0.43520268851081706,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004986092857987881,
|
|
"loss": 5.6968,
|
|
"mean_token_accuracy": 0.15352533906698226,
|
|
"num_tokens": 9553805.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 5.832414722442627,
|
|
"epoch": 0.43562276832598196,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004986059593871026,
|
|
"loss": 5.7414,
|
|
"mean_token_accuracy": 0.14509093537926673,
|
|
"num_tokens": 9563493.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 5.899970149993896,
|
|
"epoch": 0.4360428481411468,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004986026290143527,
|
|
"loss": 5.8201,
|
|
"mean_token_accuracy": 0.14310061410069466,
|
|
"num_tokens": 9572297.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 5.985169315338135,
|
|
"epoch": 0.4364629279563117,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004985992946805973,
|
|
"loss": 5.9499,
|
|
"mean_token_accuracy": 0.1373360723257065,
|
|
"num_tokens": 9581967.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 5.853709316253662,
|
|
"epoch": 0.4368830077714766,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004985959563858955,
|
|
"loss": 5.8611,
|
|
"mean_token_accuracy": 0.14648908525705337,
|
|
"num_tokens": 9590885.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 5.920672750473022,
|
|
"epoch": 0.43730308758664144,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004985926141303066,
|
|
"loss": 5.7766,
|
|
"mean_token_accuracy": 0.14383909106254578,
|
|
"num_tokens": 9599247.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 5.823170852661133,
|
|
"epoch": 0.43772316740180633,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004985892679138896,
|
|
"loss": 5.709,
|
|
"mean_token_accuracy": 0.15263715162873268,
|
|
"num_tokens": 9608296.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 5.922242307662964,
|
|
"epoch": 0.43814324721697123,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985859177367038,
|
|
"loss": 5.7539,
|
|
"mean_token_accuracy": 0.14295759946107864,
|
|
"num_tokens": 9616734.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 5.933417272567749,
|
|
"epoch": 0.43856332703213613,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004985825635988087,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.14136623740196227,
|
|
"num_tokens": 9626246.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 5.840227174758911,
|
|
"epoch": 0.43898340684730097,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004985792055002635,
|
|
"loss": 5.7156,
|
|
"mean_token_accuracy": 0.1447908401489258,
|
|
"num_tokens": 9634963.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 5.864311695098877,
|
|
"epoch": 0.43940348666246587,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004985758434411278,
|
|
"loss": 5.7954,
|
|
"mean_token_accuracy": 0.1492132991552353,
|
|
"num_tokens": 9643615.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.824445819854736,
|
|
"epoch": 0.43982356647763077,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004985724774214613,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14679911136627197,
|
|
"num_tokens": 9653306.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 5.8889368057250975,
|
|
"epoch": 0.4402436462927956,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004985691074413233,
|
|
"loss": 5.7966,
|
|
"mean_token_accuracy": 0.1408935308456421,
|
|
"num_tokens": 9662389.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 5.806066703796387,
|
|
"epoch": 0.4406637261079605,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004985657335007739,
|
|
"loss": 5.7659,
|
|
"mean_token_accuracy": 0.14551339596509932,
|
|
"num_tokens": 9671183.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 5.852633047103882,
|
|
"epoch": 0.4410838059231254,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004985623555998725,
|
|
"loss": 5.778,
|
|
"mean_token_accuracy": 0.1539351999759674,
|
|
"num_tokens": 9680544.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 5.867886209487915,
|
|
"epoch": 0.4415038857382903,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985589737386791,
|
|
"loss": 5.8053,
|
|
"mean_token_accuracy": 0.1449089080095291,
|
|
"num_tokens": 9690137.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 5.847021532058716,
|
|
"epoch": 0.44192396555345514,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004985555879172535,
|
|
"loss": 5.7433,
|
|
"mean_token_accuracy": 0.14687602072954178,
|
|
"num_tokens": 9699149.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.898943853378296,
|
|
"epoch": 0.44234404536862004,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000498552198135656,
|
|
"loss": 5.8097,
|
|
"mean_token_accuracy": 0.15019679218530654,
|
|
"num_tokens": 9709308.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 5.844637632369995,
|
|
"epoch": 0.44276412518378494,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004985488043939462,
|
|
"loss": 5.7573,
|
|
"mean_token_accuracy": 0.1442711167037487,
|
|
"num_tokens": 9718462.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 5.853937387466431,
|
|
"epoch": 0.4431842049989498,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004985454066921846,
|
|
"loss": 5.6905,
|
|
"mean_token_accuracy": 0.1537187710404396,
|
|
"num_tokens": 9727626.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 5.747472763061523,
|
|
"epoch": 0.4436042848141147,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004985420050304312,
|
|
"loss": 5.7068,
|
|
"mean_token_accuracy": 0.1498991407454014,
|
|
"num_tokens": 9737091.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 5.846937942504883,
|
|
"epoch": 0.4440243646292796,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004985385994087462,
|
|
"loss": 5.7867,
|
|
"mean_token_accuracy": 0.14585647359490395,
|
|
"num_tokens": 9746135.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 5.949729108810425,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004985351898271901,
|
|
"loss": 5.719,
|
|
"mean_token_accuracy": 0.1520434781908989,
|
|
"num_tokens": 9754549.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 5.887947463989258,
|
|
"epoch": 0.4448645242596093,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004985317762858231,
|
|
"loss": 5.8567,
|
|
"mean_token_accuracy": 0.14025997146964073,
|
|
"num_tokens": 9764219.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 5.871951913833618,
|
|
"epoch": 0.4452846040747742,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000498528358784706,
|
|
"loss": 5.6972,
|
|
"mean_token_accuracy": 0.15001460164785385,
|
|
"num_tokens": 9772234.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 5.811316633224488,
|
|
"epoch": 0.4457046838899391,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000498524937323899,
|
|
"loss": 5.7622,
|
|
"mean_token_accuracy": 0.15125853270292283,
|
|
"num_tokens": 9781417.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 5.981836175918579,
|
|
"epoch": 0.44612476370510395,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004985215119034628,
|
|
"loss": 5.8763,
|
|
"mean_token_accuracy": 0.13692381381988525,
|
|
"num_tokens": 9791286.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 5.866169118881226,
|
|
"epoch": 0.44654484352026885,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985180825234582,
|
|
"loss": 5.8755,
|
|
"mean_token_accuracy": 0.13873762115836144,
|
|
"num_tokens": 9802157.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 5.981353807449341,
|
|
"epoch": 0.44696492333543375,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004985146491839459,
|
|
"loss": 5.8547,
|
|
"mean_token_accuracy": 0.1320488214492798,
|
|
"num_tokens": 9812646.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 5.9978625774383545,
|
|
"epoch": 0.4473850031505986,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004985112118849865,
|
|
"loss": 5.8664,
|
|
"mean_token_accuracy": 0.13918881937861444,
|
|
"num_tokens": 9822274.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 5.781670093536377,
|
|
"epoch": 0.4478050829657635,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004985077706266412,
|
|
"loss": 5.6507,
|
|
"mean_token_accuracy": 0.14431787207722663,
|
|
"num_tokens": 9831337.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 5.797645950317383,
|
|
"epoch": 0.4482251627809284,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004985043254089708,
|
|
"loss": 5.8111,
|
|
"mean_token_accuracy": 0.13542471826076508,
|
|
"num_tokens": 9840798.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 5.871469783782959,
|
|
"epoch": 0.44864524259609323,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004985008762320364,
|
|
"loss": 5.7666,
|
|
"mean_token_accuracy": 0.14363950192928315,
|
|
"num_tokens": 9850117.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 5.885560655593872,
|
|
"epoch": 0.4490653224112581,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000498497423095899,
|
|
"loss": 5.7176,
|
|
"mean_token_accuracy": 0.15319354236125945,
|
|
"num_tokens": 9858227.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 5.810570764541626,
|
|
"epoch": 0.449485402226423,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004984939660006199,
|
|
"loss": 5.8079,
|
|
"mean_token_accuracy": 0.14338937029242516,
|
|
"num_tokens": 9867157.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 5.811974906921387,
|
|
"epoch": 0.4499054820415879,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004984905049462602,
|
|
"loss": 5.7349,
|
|
"mean_token_accuracy": 0.144259013235569,
|
|
"num_tokens": 9877045.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 5.959705638885498,
|
|
"epoch": 0.45032556185675277,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004984870399328814,
|
|
"loss": 5.8617,
|
|
"mean_token_accuracy": 0.14245471283793448,
|
|
"num_tokens": 9886637.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 5.816979646682739,
|
|
"epoch": 0.45074564167191766,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004984835709605446,
|
|
"loss": 5.7271,
|
|
"mean_token_accuracy": 0.15511318892240525,
|
|
"num_tokens": 9895601.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 5.86139702796936,
|
|
"epoch": 0.45116572148708256,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004984800980293116,
|
|
"loss": 5.8807,
|
|
"mean_token_accuracy": 0.14196527227759362,
|
|
"num_tokens": 9904775.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.883301210403443,
|
|
"epoch": 0.4515858013022474,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004984766211392435,
|
|
"loss": 5.8184,
|
|
"mean_token_accuracy": 0.13878512308001517,
|
|
"num_tokens": 9913795.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 5.856382942199707,
|
|
"epoch": 0.4520058811174123,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004984731402904024,
|
|
"loss": 5.6546,
|
|
"mean_token_accuracy": 0.15193988084793092,
|
|
"num_tokens": 9922576.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 5.768913459777832,
|
|
"epoch": 0.4524259609325772,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004984696554828496,
|
|
"loss": 5.6446,
|
|
"mean_token_accuracy": 0.15225213021039963,
|
|
"num_tokens": 9930971.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 5.856381464004516,
|
|
"epoch": 0.4528460407477421,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004984661667166468,
|
|
"loss": 5.7606,
|
|
"mean_token_accuracy": 0.1514030024409294,
|
|
"num_tokens": 9939628.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 5.887900066375733,
|
|
"epoch": 0.45326612056290694,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004984626739918561,
|
|
"loss": 5.7294,
|
|
"mean_token_accuracy": 0.15370103269815444,
|
|
"num_tokens": 9948397.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 5.8639452934265135,
|
|
"epoch": 0.45368620037807184,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004984591773085391,
|
|
"loss": 5.8108,
|
|
"mean_token_accuracy": 0.14718640744686126,
|
|
"num_tokens": 9957683.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 5.911360502243042,
|
|
"epoch": 0.45410628019323673,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984556766667578,
|
|
"loss": 5.7938,
|
|
"mean_token_accuracy": 0.14773029685020447,
|
|
"num_tokens": 9966756.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 5.876928043365479,
|
|
"epoch": 0.4545263600084016,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004984521720665743,
|
|
"loss": 5.7996,
|
|
"mean_token_accuracy": 0.1499388188123703,
|
|
"num_tokens": 9976000.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 5.9389279842376705,
|
|
"epoch": 0.4549464398235665,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984486635080507,
|
|
"loss": 5.7922,
|
|
"mean_token_accuracy": 0.146384534239769,
|
|
"num_tokens": 9985509.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 5.7951904296875,
|
|
"epoch": 0.45536651963873137,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004984451509912489,
|
|
"loss": 5.744,
|
|
"mean_token_accuracy": 0.1474005714058876,
|
|
"num_tokens": 9994342.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 5.838972473144532,
|
|
"epoch": 0.4557865994538962,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004984416345162315,
|
|
"loss": 5.7889,
|
|
"mean_token_accuracy": 0.14537926837801934,
|
|
"num_tokens": 10004249.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.8457417488098145,
|
|
"epoch": 0.4562066792690611,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984381140830605,
|
|
"loss": 5.7485,
|
|
"mean_token_accuracy": 0.14723600521683694,
|
|
"num_tokens": 10012430.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 5.878772354125976,
|
|
"epoch": 0.456626759084226,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004984345896917984,
|
|
"loss": 5.7605,
|
|
"mean_token_accuracy": 0.14340553283691407,
|
|
"num_tokens": 10021434.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 5.859716320037842,
|
|
"epoch": 0.4570468388993909,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984310613425076,
|
|
"loss": 5.7662,
|
|
"mean_token_accuracy": 0.1505170688033104,
|
|
"num_tokens": 10030473.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.890053796768188,
|
|
"epoch": 0.45746691871455575,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004984275290352506,
|
|
"loss": 5.7347,
|
|
"mean_token_accuracy": 0.1503530338406563,
|
|
"num_tokens": 10039057.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 5.906252813339234,
|
|
"epoch": 0.45788699852972065,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004984239927700899,
|
|
"loss": 5.8309,
|
|
"mean_token_accuracy": 0.14800925105810164,
|
|
"num_tokens": 10047998.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 5.96235499382019,
|
|
"epoch": 0.45830707834488554,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004984204525470883,
|
|
"loss": 5.7626,
|
|
"mean_token_accuracy": 0.14305243864655495,
|
|
"num_tokens": 10057479.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 5.773991537094116,
|
|
"epoch": 0.4587271581600504,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004984169083663084,
|
|
"loss": 5.7318,
|
|
"mean_token_accuracy": 0.14002140685915948,
|
|
"num_tokens": 10067754.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.805001163482666,
|
|
"epoch": 0.4591472379752153,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004984133602278129,
|
|
"loss": 5.8253,
|
|
"mean_token_accuracy": 0.1421283006668091,
|
|
"num_tokens": 10076815.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 6.033328580856323,
|
|
"epoch": 0.4595673177903802,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000498409808131665,
|
|
"loss": 5.8269,
|
|
"mean_token_accuracy": 0.14671371206641198,
|
|
"num_tokens": 10086300.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 5.823101377487182,
|
|
"epoch": 0.4599873976055451,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984062520779272,
|
|
"loss": 5.7259,
|
|
"mean_token_accuracy": 0.1552243560552597,
|
|
"num_tokens": 10095383.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 5.773621034622193,
|
|
"epoch": 0.4604074774207099,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984026920666628,
|
|
"loss": 5.7019,
|
|
"mean_token_accuracy": 0.1514463573694229,
|
|
"num_tokens": 10103971.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 5.798014068603516,
|
|
"epoch": 0.4608275572358748,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004983991280979347,
|
|
"loss": 5.6971,
|
|
"mean_token_accuracy": 0.1502104952931404,
|
|
"num_tokens": 10113028.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.823189973831177,
|
|
"epoch": 0.4612476370510397,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004983955601718061,
|
|
"loss": 5.6819,
|
|
"mean_token_accuracy": 0.14814986884593964,
|
|
"num_tokens": 10121890.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 5.896232748031617,
|
|
"epoch": 0.46166771686620456,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983919882883401,
|
|
"loss": 5.8089,
|
|
"mean_token_accuracy": 0.1452305495738983,
|
|
"num_tokens": 10131655.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 5.876237583160401,
|
|
"epoch": 0.46208779668136946,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004983884124476,
|
|
"loss": 5.8051,
|
|
"mean_token_accuracy": 0.14433109760284424,
|
|
"num_tokens": 10140778.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 5.897982120513916,
|
|
"epoch": 0.46250787649653435,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983848326496494,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.1391661711037159,
|
|
"num_tokens": 10150229.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 5.943829345703125,
|
|
"epoch": 0.4629279563116992,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983812488945513,
|
|
"loss": 5.7502,
|
|
"mean_token_accuracy": 0.14314467534422876,
|
|
"num_tokens": 10158939.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 5.819750833511352,
|
|
"epoch": 0.4633480361268641,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004983776611823696,
|
|
"loss": 5.7489,
|
|
"mean_token_accuracy": 0.14325918182730674,
|
|
"num_tokens": 10168383.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 5.7525170803070065,
|
|
"epoch": 0.463768115942029,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004983740695131676,
|
|
"loss": 5.7483,
|
|
"mean_token_accuracy": 0.1506567046046257,
|
|
"num_tokens": 10178678.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 5.8393933296203615,
|
|
"epoch": 0.4641881957571939,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000498370473887009,
|
|
"loss": 5.7404,
|
|
"mean_token_accuracy": 0.1451387256383896,
|
|
"num_tokens": 10188964.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 5.9242652416229244,
|
|
"epoch": 0.46460827557235873,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004983668743039573,
|
|
"loss": 5.7722,
|
|
"mean_token_accuracy": 0.15323825627565385,
|
|
"num_tokens": 10198333.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 5.789677238464355,
|
|
"epoch": 0.46502835538752363,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983632707640766,
|
|
"loss": 5.7876,
|
|
"mean_token_accuracy": 0.14813560321927072,
|
|
"num_tokens": 10207876.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 5.812788200378418,
|
|
"epoch": 0.4654484352026885,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004983596632674306,
|
|
"loss": 5.7229,
|
|
"mean_token_accuracy": 0.14903474599123,
|
|
"num_tokens": 10216822.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 5.883552932739258,
|
|
"epoch": 0.46586851501785337,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983560518140831,
|
|
"loss": 5.8344,
|
|
"mean_token_accuracy": 0.139993616938591,
|
|
"num_tokens": 10226887.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 5.850424337387085,
|
|
"epoch": 0.46628859483301827,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004983524364040982,
|
|
"loss": 5.7004,
|
|
"mean_token_accuracy": 0.1548854097723961,
|
|
"num_tokens": 10235935.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 5.844246101379395,
|
|
"epoch": 0.46670867464818316,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983488170375399,
|
|
"loss": 5.6405,
|
|
"mean_token_accuracy": 0.1503463476896286,
|
|
"num_tokens": 10245590.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 5.735381555557251,
|
|
"epoch": 0.46712875446334806,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983451937144723,
|
|
"loss": 5.7345,
|
|
"mean_token_accuracy": 0.1456381857395172,
|
|
"num_tokens": 10255104.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 5.7118124008178714,
|
|
"epoch": 0.4675488342785129,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004983415664349595,
|
|
"loss": 5.6004,
|
|
"mean_token_accuracy": 0.16290194243192674,
|
|
"num_tokens": 10264236.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 5.817228507995606,
|
|
"epoch": 0.4679689140936778,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004983379351990659,
|
|
"loss": 5.7056,
|
|
"mean_token_accuracy": 0.1503439575433731,
|
|
"num_tokens": 10273335.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 5.7475629329681395,
|
|
"epoch": 0.4683889939088427,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004983343000068559,
|
|
"loss": 5.6682,
|
|
"mean_token_accuracy": 0.1495598793029785,
|
|
"num_tokens": 10282206.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 5.688462829589843,
|
|
"epoch": 0.46880907372400754,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004983306608583937,
|
|
"loss": 5.6189,
|
|
"mean_token_accuracy": 0.16340474039316177,
|
|
"num_tokens": 10290056.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 5.7730052947998045,
|
|
"epoch": 0.46922915353917244,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004983270177537438,
|
|
"loss": 5.7028,
|
|
"mean_token_accuracy": 0.14809525161981582,
|
|
"num_tokens": 10299726.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 5.84525089263916,
|
|
"epoch": 0.46964923335433734,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983233706929708,
|
|
"loss": 5.7725,
|
|
"mean_token_accuracy": 0.1471342384815216,
|
|
"num_tokens": 10308696.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 5.880400562286377,
|
|
"epoch": 0.4700693131695022,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004983197196761392,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.14054280817508696,
|
|
"num_tokens": 10317845.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 5.84756875038147,
|
|
"epoch": 0.4704893929846671,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004983160647033139,
|
|
"loss": 5.737,
|
|
"mean_token_accuracy": 0.150573068857193,
|
|
"num_tokens": 10326563.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 5.826395320892334,
|
|
"epoch": 0.470909472799832,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004983124057745595,
|
|
"loss": 5.7235,
|
|
"mean_token_accuracy": 0.14374103918671607,
|
|
"num_tokens": 10335931.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 5.76983675956726,
|
|
"epoch": 0.47132955261499687,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004983087428899408,
|
|
"loss": 5.7216,
|
|
"mean_token_accuracy": 0.1377339854836464,
|
|
"num_tokens": 10344984.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 5.842723369598389,
|
|
"epoch": 0.4717496324301617,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004983050760495227,
|
|
"loss": 5.7638,
|
|
"mean_token_accuracy": 0.14885966181755067,
|
|
"num_tokens": 10353522.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 5.915482044219971,
|
|
"epoch": 0.4721697122453266,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983014052533702,
|
|
"loss": 5.7678,
|
|
"mean_token_accuracy": 0.14949656873941422,
|
|
"num_tokens": 10363527.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.765365362167358,
|
|
"epoch": 0.4725897920604915,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004982977305015481,
|
|
"loss": 5.6942,
|
|
"mean_token_accuracy": 0.1467475950717926,
|
|
"num_tokens": 10372040.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 5.808851623535157,
|
|
"epoch": 0.47300987187565635,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004982940517941219,
|
|
"loss": 5.6732,
|
|
"mean_token_accuracy": 0.14801965281367302,
|
|
"num_tokens": 10381279.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 5.891337108612061,
|
|
"epoch": 0.47342995169082125,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004982903691311564,
|
|
"loss": 5.8457,
|
|
"mean_token_accuracy": 0.1401650868356228,
|
|
"num_tokens": 10390608.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 5.811560487747192,
|
|
"epoch": 0.47385003150598615,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004982866825127172,
|
|
"loss": 5.6437,
|
|
"mean_token_accuracy": 0.1533919870853424,
|
|
"num_tokens": 10399851.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 5.952455997467041,
|
|
"epoch": 0.47427011132115104,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004982829919388692,
|
|
"loss": 5.9303,
|
|
"mean_token_accuracy": 0.1413193352520466,
|
|
"num_tokens": 10410425.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.829264545440674,
|
|
"epoch": 0.4746901911363159,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004982792974096781,
|
|
"loss": 5.6844,
|
|
"mean_token_accuracy": 0.15058013647794724,
|
|
"num_tokens": 10418783.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 5.883219861984253,
|
|
"epoch": 0.4751102709514808,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000498275598925209,
|
|
"loss": 5.8575,
|
|
"mean_token_accuracy": 0.14019499495625495,
|
|
"num_tokens": 10427360.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 5.982011365890503,
|
|
"epoch": 0.4755303507666457,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004982718964855277,
|
|
"loss": 5.8116,
|
|
"mean_token_accuracy": 0.14399669840931892,
|
|
"num_tokens": 10436613.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 5.872733783721924,
|
|
"epoch": 0.4759504305818105,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982681900907,
|
|
"loss": 5.8526,
|
|
"mean_token_accuracy": 0.1458025962114334,
|
|
"num_tokens": 10445055.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.826623582839966,
|
|
"epoch": 0.4763705103969754,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000498264479740791,
|
|
"loss": 5.6666,
|
|
"mean_token_accuracy": 0.15394981056451798,
|
|
"num_tokens": 10454516.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 5.948064708709717,
|
|
"epoch": 0.4767905902121403,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004982607654358668,
|
|
"loss": 5.8096,
|
|
"mean_token_accuracy": 0.147859063744545,
|
|
"num_tokens": 10463771.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 5.835044527053833,
|
|
"epoch": 0.47721067002730516,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000498257047175993,
|
|
"loss": 5.7488,
|
|
"mean_token_accuracy": 0.142615008354187,
|
|
"num_tokens": 10473783.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 5.83440375328064,
|
|
"epoch": 0.47763074984247006,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004982533249612357,
|
|
"loss": 5.6997,
|
|
"mean_token_accuracy": 0.14993957430124283,
|
|
"num_tokens": 10483424.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 5.763900947570801,
|
|
"epoch": 0.47805082965763496,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004982495987916607,
|
|
"loss": 5.6455,
|
|
"mean_token_accuracy": 0.15347654670476912,
|
|
"num_tokens": 10492536.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 5.8370520114898685,
|
|
"epoch": 0.47847090947279985,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004982458686673339,
|
|
"loss": 5.7578,
|
|
"mean_token_accuracy": 0.14936625212430954,
|
|
"num_tokens": 10501616.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 5.956824541091919,
|
|
"epoch": 0.4788909892879647,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004982421345883217,
|
|
"loss": 5.8031,
|
|
"mean_token_accuracy": 0.14071496576070786,
|
|
"num_tokens": 10511190.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 5.793789196014404,
|
|
"epoch": 0.4793110691031296,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004982383965546898,
|
|
"loss": 5.7381,
|
|
"mean_token_accuracy": 0.144473847001791,
|
|
"num_tokens": 10520310.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 5.833015632629395,
|
|
"epoch": 0.4797311489182945,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004982346545665048,
|
|
"loss": 5.6941,
|
|
"mean_token_accuracy": 0.1467716298997402,
|
|
"num_tokens": 10528711.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 5.8455291271209715,
|
|
"epoch": 0.48015122873345933,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004982309086238328,
|
|
"loss": 5.8016,
|
|
"mean_token_accuracy": 0.14259516224265098,
|
|
"num_tokens": 10538484.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 5.898940181732177,
|
|
"epoch": 0.48057130854862423,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004982271587267403,
|
|
"loss": 5.747,
|
|
"mean_token_accuracy": 0.14794613867998124,
|
|
"num_tokens": 10547623.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 5.868904733657837,
|
|
"epoch": 0.48099138836378913,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004982234048752935,
|
|
"loss": 5.6997,
|
|
"mean_token_accuracy": 0.14849727526307105,
|
|
"num_tokens": 10556234.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 5.9389198303222654,
|
|
"epoch": 0.481411468178954,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000498219647069559,
|
|
"loss": 5.9273,
|
|
"mean_token_accuracy": 0.13982586190104485,
|
|
"num_tokens": 10566308.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 5.836957883834839,
|
|
"epoch": 0.48183154799411887,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004982158853096035,
|
|
"loss": 5.8519,
|
|
"mean_token_accuracy": 0.1417085811495781,
|
|
"num_tokens": 10575212.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.8836267471313475,
|
|
"epoch": 0.48225162780928377,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982121195954935,
|
|
"loss": 5.6287,
|
|
"mean_token_accuracy": 0.15638786405324936,
|
|
"num_tokens": 10584590.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 5.817459297180176,
|
|
"epoch": 0.48267170762444866,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004982083499272957,
|
|
"loss": 5.7007,
|
|
"mean_token_accuracy": 0.14900539070367813,
|
|
"num_tokens": 10593997.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 5.799760389328003,
|
|
"epoch": 0.4830917874396135,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004982045763050768,
|
|
"loss": 5.8291,
|
|
"mean_token_accuracy": 0.1467505380511284,
|
|
"num_tokens": 10603299.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 5.825570392608642,
|
|
"epoch": 0.4835118672547784,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004982007987289041,
|
|
"loss": 5.7641,
|
|
"mean_token_accuracy": 0.14574431553483008,
|
|
"num_tokens": 10613546.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 5.833213567733765,
|
|
"epoch": 0.4839319470699433,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004981970171988439,
|
|
"loss": 5.7267,
|
|
"mean_token_accuracy": 0.15680563673377038,
|
|
"num_tokens": 10622966.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 5.918120956420898,
|
|
"epoch": 0.48435202688510814,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004981932317149636,
|
|
"loss": 5.8074,
|
|
"mean_token_accuracy": 0.14230270087718963,
|
|
"num_tokens": 10633441.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 5.926499748229981,
|
|
"epoch": 0.48477210670027304,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00049818944227733,
|
|
"loss": 5.7829,
|
|
"mean_token_accuracy": 0.145944182574749,
|
|
"num_tokens": 10643124.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 5.8368360042572025,
|
|
"epoch": 0.48519218651543794,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004981856488860105,
|
|
"loss": 5.75,
|
|
"mean_token_accuracy": 0.14405592083930968,
|
|
"num_tokens": 10652517.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 5.827040672302246,
|
|
"epoch": 0.48561226633060284,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004981818515410721,
|
|
"loss": 5.8018,
|
|
"mean_token_accuracy": 0.14195797815918923,
|
|
"num_tokens": 10663352.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 5.911312675476074,
|
|
"epoch": 0.4860323461457677,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004981780502425821,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.14514586478471755,
|
|
"num_tokens": 10672430.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 5.858085298538208,
|
|
"epoch": 0.4864524259609326,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004981742449906079,
|
|
"loss": 5.7778,
|
|
"mean_token_accuracy": 0.15105650201439857,
|
|
"num_tokens": 10681908.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 5.876479959487915,
|
|
"epoch": 0.4868725057760975,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004981704357852168,
|
|
"loss": 5.7501,
|
|
"mean_token_accuracy": 0.1459008663892746,
|
|
"num_tokens": 10691259.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 5.803030967712402,
|
|
"epoch": 0.4872925855912623,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004981666226264764,
|
|
"loss": 5.6514,
|
|
"mean_token_accuracy": 0.14785986095666886,
|
|
"num_tokens": 10699668.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 5.827937030792237,
|
|
"epoch": 0.4877126654064272,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004981628055144542,
|
|
"loss": 5.7065,
|
|
"mean_token_accuracy": 0.15127545595169067,
|
|
"num_tokens": 10709146.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 5.876874828338623,
|
|
"epoch": 0.4881327452215921,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004981589844492177,
|
|
"loss": 5.8008,
|
|
"mean_token_accuracy": 0.13951031863689423,
|
|
"num_tokens": 10718724.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 5.814950895309448,
|
|
"epoch": 0.488552825036757,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004981551594308349,
|
|
"loss": 5.7424,
|
|
"mean_token_accuracy": 0.14747670367360116,
|
|
"num_tokens": 10728101.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 5.938137483596802,
|
|
"epoch": 0.48897290485192185,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004981513304593733,
|
|
"loss": 5.7721,
|
|
"mean_token_accuracy": 0.15057093650102615,
|
|
"num_tokens": 10736750.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 5.9004603862762455,
|
|
"epoch": 0.48939298466708675,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004981474975349006,
|
|
"loss": 5.9573,
|
|
"mean_token_accuracy": 0.143083293735981,
|
|
"num_tokens": 10746914.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 5.944899702072144,
|
|
"epoch": 0.48981306448225165,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000498143660657485,
|
|
"loss": 5.7841,
|
|
"mean_token_accuracy": 0.14469311460852624,
|
|
"num_tokens": 10755786.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 5.719291877746582,
|
|
"epoch": 0.4902331442974165,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004981398198271944,
|
|
"loss": 5.6544,
|
|
"mean_token_accuracy": 0.15054057389497758,
|
|
"num_tokens": 10764821.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 5.821346855163574,
|
|
"epoch": 0.4906532241125814,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004981359750440968,
|
|
"loss": 5.7381,
|
|
"mean_token_accuracy": 0.14619418531656264,
|
|
"num_tokens": 10773569.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 5.812557601928711,
|
|
"epoch": 0.4910733039277463,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004981321263082603,
|
|
"loss": 5.7233,
|
|
"mean_token_accuracy": 0.14379709362983703,
|
|
"num_tokens": 10782298.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 5.7633030891418455,
|
|
"epoch": 0.4914933837429111,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000498128273619753,
|
|
"loss": 5.6964,
|
|
"mean_token_accuracy": 0.15067172646522523,
|
|
"num_tokens": 10792087.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.826433086395264,
|
|
"epoch": 0.491913463558076,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004981244169786433,
|
|
"loss": 5.7863,
|
|
"mean_token_accuracy": 0.14527801647782326,
|
|
"num_tokens": 10801641.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 5.962628364562988,
|
|
"epoch": 0.4923335433732409,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004981205563849994,
|
|
"loss": 5.8636,
|
|
"mean_token_accuracy": 0.1445979543030262,
|
|
"num_tokens": 10811612.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 5.84666166305542,
|
|
"epoch": 0.4927536231884058,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004981166918388897,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.1496157467365265,
|
|
"num_tokens": 10821608.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.758074522018433,
|
|
"epoch": 0.49317370300357066,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004981128233403828,
|
|
"loss": 5.6341,
|
|
"mean_token_accuracy": 0.15541895031929015,
|
|
"num_tokens": 10830679.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.810383653640747,
|
|
"epoch": 0.49359378281873556,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000498108950889547,
|
|
"loss": 5.7028,
|
|
"mean_token_accuracy": 0.15059976279735565,
|
|
"num_tokens": 10839669.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 5.813056564331054,
|
|
"epoch": 0.49401386263390046,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004981050744864512,
|
|
"loss": 5.6876,
|
|
"mean_token_accuracy": 0.14685238003730774,
|
|
"num_tokens": 10849666.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 5.78202338218689,
|
|
"epoch": 0.4944339424490653,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004981011941311638,
|
|
"loss": 5.6093,
|
|
"mean_token_accuracy": 0.1536119759082794,
|
|
"num_tokens": 10858225.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 5.7550591945648195,
|
|
"epoch": 0.4948540222642302,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004980973098237535,
|
|
"loss": 5.7246,
|
|
"mean_token_accuracy": 0.14252085834741593,
|
|
"num_tokens": 10867466.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.849875020980835,
|
|
"epoch": 0.4952741020793951,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004980934215642894,
|
|
"loss": 5.7463,
|
|
"mean_token_accuracy": 0.151506906747818,
|
|
"num_tokens": 10875850.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.780202579498291,
|
|
"epoch": 0.49569418189456,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00049808952935284,
|
|
"loss": 5.6809,
|
|
"mean_token_accuracy": 0.15422153174877168,
|
|
"num_tokens": 10885154.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 5.7728334903717045,
|
|
"epoch": 0.49611426170972484,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004980856331894747,
|
|
"loss": 5.7714,
|
|
"mean_token_accuracy": 0.14351727366447448,
|
|
"num_tokens": 10894080.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 5.794958066940308,
|
|
"epoch": 0.49653434152488973,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004980817330742621,
|
|
"loss": 5.7728,
|
|
"mean_token_accuracy": 0.1406318761408329,
|
|
"num_tokens": 10903248.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 5.890414190292359,
|
|
"epoch": 0.49695442134005463,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004980778290072716,
|
|
"loss": 5.7344,
|
|
"mean_token_accuracy": 0.1520361930131912,
|
|
"num_tokens": 10912939.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 5.844255971908569,
|
|
"epoch": 0.4973745011552195,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004980739209885722,
|
|
"loss": 5.7519,
|
|
"mean_token_accuracy": 0.14798953309655188,
|
|
"num_tokens": 10921505.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 5.894140291213989,
|
|
"epoch": 0.49779458097038437,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004980700090182331,
|
|
"loss": 5.8334,
|
|
"mean_token_accuracy": 0.14881108254194259,
|
|
"num_tokens": 10931861.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 5.870219659805298,
|
|
"epoch": 0.49821466078554927,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004980660930963238,
|
|
"loss": 5.7625,
|
|
"mean_token_accuracy": 0.14495279788970947,
|
|
"num_tokens": 10940810.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 5.808070087432862,
|
|
"epoch": 0.4986347406007141,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004980621732229133,
|
|
"loss": 5.6263,
|
|
"mean_token_accuracy": 0.15171189308166505,
|
|
"num_tokens": 10949514.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 5.853536224365234,
|
|
"epoch": 0.499054820415879,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004980582493980714,
|
|
"loss": 5.8402,
|
|
"mean_token_accuracy": 0.13668815642595292,
|
|
"num_tokens": 10959161.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 5.811306715011597,
|
|
"epoch": 0.4994749002310439,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004980543216218674,
|
|
"loss": 5.7084,
|
|
"mean_token_accuracy": 0.1605042815208435,
|
|
"num_tokens": 10968983.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.838724660873413,
|
|
"epoch": 0.4998949800462088,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004980503898943711,
|
|
"loss": 5.8486,
|
|
"mean_token_accuracy": 0.14541933685541153,
|
|
"num_tokens": 10978044.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 5.919149684906006,
|
|
"epoch": 0.5003150598613737,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004980464542156519,
|
|
"loss": 5.7474,
|
|
"mean_token_accuracy": 0.15162651985883713,
|
|
"num_tokens": 10986980.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 5.8385172367095945,
|
|
"epoch": 0.5007351396765385,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004980425145857796,
|
|
"loss": 5.6939,
|
|
"mean_token_accuracy": 0.15786231756210328,
|
|
"num_tokens": 10995163.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 5.755066156387329,
|
|
"epoch": 0.5011552194917034,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000498038571004824,
|
|
"loss": 5.6211,
|
|
"mean_token_accuracy": 0.159263913333416,
|
|
"num_tokens": 11003722.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 5.732334613800049,
|
|
"epoch": 0.5015752993068683,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004980346234728549,
|
|
"loss": 5.6829,
|
|
"mean_token_accuracy": 0.15636452287435532,
|
|
"num_tokens": 11013176.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 5.856866264343262,
|
|
"epoch": 0.5019953791220332,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004980306719899424,
|
|
"loss": 5.7417,
|
|
"mean_token_accuracy": 0.1482336312532425,
|
|
"num_tokens": 11022636.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 5.81472544670105,
|
|
"epoch": 0.5024154589371981,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004980267165561564,
|
|
"loss": 5.6994,
|
|
"mean_token_accuracy": 0.15061589032411576,
|
|
"num_tokens": 11031896.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 5.8317889213562015,
|
|
"epoch": 0.502835538752363,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004980227571715669,
|
|
"loss": 5.7442,
|
|
"mean_token_accuracy": 0.14868111461400985,
|
|
"num_tokens": 11040802.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 5.817817497253418,
|
|
"epoch": 0.5032556185675279,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004980187938362441,
|
|
"loss": 5.6616,
|
|
"mean_token_accuracy": 0.14449788331985475,
|
|
"num_tokens": 11049701.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 5.8403524398803714,
|
|
"epoch": 0.5036756983826927,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004980148265502581,
|
|
"loss": 5.8553,
|
|
"mean_token_accuracy": 0.1392398163676262,
|
|
"num_tokens": 11059555.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 5.883025121688843,
|
|
"epoch": 0.5040957781978576,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004980108553136795,
|
|
"loss": 5.7762,
|
|
"mean_token_accuracy": 0.14863402545452117,
|
|
"num_tokens": 11068940.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5040957781978576,
|
|
"eval_entropy": 5.732787127158954,
|
|
"eval_loss": 5.7686614990234375,
|
|
"eval_mean_token_accuracy": 0.15331337192289018,
|
|
"eval_num_tokens": 11068940.0,
|
|
"eval_runtime": 27.3892,
|
|
"eval_samples_per_second": 1364.261,
|
|
"eval_steps_per_second": 170.542,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 5.908424186706543,
|
|
"epoch": 0.5045158580130225,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004980068801265783,
|
|
"loss": 5.7414,
|
|
"mean_token_accuracy": 0.14692858532071112,
|
|
"num_tokens": 11079014.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"entropy": 5.866373205184937,
|
|
"epoch": 0.5049359378281874,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004980029009890251,
|
|
"loss": 5.8378,
|
|
"mean_token_accuracy": 0.1466228261590004,
|
|
"num_tokens": 11089526.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 5.839123296737671,
|
|
"epoch": 0.5053560176433523,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004979989179010904,
|
|
"loss": 5.7197,
|
|
"mean_token_accuracy": 0.15178524404764177,
|
|
"num_tokens": 11099156.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"entropy": 5.760820007324218,
|
|
"epoch": 0.5057760974585171,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004979949308628445,
|
|
"loss": 5.7078,
|
|
"mean_token_accuracy": 0.15017148554325105,
|
|
"num_tokens": 11108242.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 5.7764500141143795,
|
|
"epoch": 0.506196177273682,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004979909398743584,
|
|
"loss": 5.7066,
|
|
"mean_token_accuracy": 0.15099107772111892,
|
|
"num_tokens": 11118076.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"entropy": 5.893146562576294,
|
|
"epoch": 0.5066162570888468,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004979869449357026,
|
|
"loss": 5.7766,
|
|
"mean_token_accuracy": 0.15781906694173814,
|
|
"num_tokens": 11127265.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 5.810907363891602,
|
|
"epoch": 0.5070363369040117,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004979829460469478,
|
|
"loss": 5.6965,
|
|
"mean_token_accuracy": 0.1483650103211403,
|
|
"num_tokens": 11136429.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"entropy": 5.813454437255859,
|
|
"epoch": 0.5074564167191766,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979789432081649,
|
|
"loss": 5.7139,
|
|
"mean_token_accuracy": 0.1487409368157387,
|
|
"num_tokens": 11146201.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 5.864733123779297,
|
|
"epoch": 0.5078764965343415,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000497974936419425,
|
|
"loss": 5.7222,
|
|
"mean_token_accuracy": 0.15236361622810363,
|
|
"num_tokens": 11154867.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"entropy": 5.746392869949341,
|
|
"epoch": 0.5082965763495064,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004979709256807989,
|
|
"loss": 5.758,
|
|
"mean_token_accuracy": 0.1480425164103508,
|
|
"num_tokens": 11164092.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 5.840289688110351,
|
|
"epoch": 0.5087166561646713,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004979669109923575,
|
|
"loss": 5.7754,
|
|
"mean_token_accuracy": 0.14666769057512283,
|
|
"num_tokens": 11173176.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"entropy": 5.953520202636719,
|
|
"epoch": 0.5091367359798362,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004979628923541721,
|
|
"loss": 5.7491,
|
|
"mean_token_accuracy": 0.1458544984459877,
|
|
"num_tokens": 11182397.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 5.871777105331421,
|
|
"epoch": 0.509556815795001,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497958869766314,
|
|
"loss": 5.7938,
|
|
"mean_token_accuracy": 0.14472762495279312,
|
|
"num_tokens": 11191790.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"entropy": 5.785938310623169,
|
|
"epoch": 0.5099768956101659,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979548432288543,
|
|
"loss": 5.7104,
|
|
"mean_token_accuracy": 0.1533594697713852,
|
|
"num_tokens": 11201104.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 5.850540256500244,
|
|
"epoch": 0.5103969754253308,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004979508127418643,
|
|
"loss": 5.7179,
|
|
"mean_token_accuracy": 0.1509293831884861,
|
|
"num_tokens": 11209578.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"entropy": 5.824426078796387,
|
|
"epoch": 0.5108170552404957,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004979467783054155,
|
|
"loss": 5.6559,
|
|
"mean_token_accuracy": 0.15454075038433074,
|
|
"num_tokens": 11218380.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 5.734690237045288,
|
|
"epoch": 0.5112371350556606,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979427399195793,
|
|
"loss": 5.6795,
|
|
"mean_token_accuracy": 0.1466882646083832,
|
|
"num_tokens": 11227810.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"entropy": 5.784052991867066,
|
|
"epoch": 0.5116572148708255,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004979386975844274,
|
|
"loss": 5.6925,
|
|
"mean_token_accuracy": 0.1516873687505722,
|
|
"num_tokens": 11236631.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 5.811602210998535,
|
|
"epoch": 0.5120772946859904,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004979346513000311,
|
|
"loss": 5.7643,
|
|
"mean_token_accuracy": 0.14228157997131347,
|
|
"num_tokens": 11247418.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"entropy": 5.801711654663086,
|
|
"epoch": 0.5124973745011552,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004979306010664623,
|
|
"loss": 5.6482,
|
|
"mean_token_accuracy": 0.15656405985355376,
|
|
"num_tokens": 11256246.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 5.709601259231567,
|
|
"epoch": 0.5129174543163201,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004979265468837927,
|
|
"loss": 5.6377,
|
|
"mean_token_accuracy": 0.15466838777065278,
|
|
"num_tokens": 11265980.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"entropy": 5.778408575057983,
|
|
"epoch": 0.513337534131485,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000497922488752094,
|
|
"loss": 5.6873,
|
|
"mean_token_accuracy": 0.1463077425956726,
|
|
"num_tokens": 11276158.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 5.757645797729492,
|
|
"epoch": 0.5137576139466499,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004979184266714383,
|
|
"loss": 5.6121,
|
|
"mean_token_accuracy": 0.1554221287369728,
|
|
"num_tokens": 11284957.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"entropy": 5.694925689697266,
|
|
"epoch": 0.5141776937618148,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004979143606418974,
|
|
"loss": 5.6283,
|
|
"mean_token_accuracy": 0.1562877871096134,
|
|
"num_tokens": 11294340.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 5.903133296966553,
|
|
"epoch": 0.5145977735769797,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004979102906635435,
|
|
"loss": 5.8808,
|
|
"mean_token_accuracy": 0.14421921372413635,
|
|
"num_tokens": 11303344.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"entropy": 5.9017737865447994,
|
|
"epoch": 0.5150178533921445,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004979062167364486,
|
|
"loss": 5.7468,
|
|
"mean_token_accuracy": 0.15465227216482164,
|
|
"num_tokens": 11311338.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 5.760764503479004,
|
|
"epoch": 0.5154379332073094,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004979021388606847,
|
|
"loss": 5.5793,
|
|
"mean_token_accuracy": 0.16053801253437996,
|
|
"num_tokens": 11320194.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"entropy": 5.783118629455567,
|
|
"epoch": 0.5158580130224742,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004978980570363243,
|
|
"loss": 5.7606,
|
|
"mean_token_accuracy": 0.15072498917579652,
|
|
"num_tokens": 11329952.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 5.807923793792725,
|
|
"epoch": 0.5162780928376391,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004978939712634396,
|
|
"loss": 5.7097,
|
|
"mean_token_accuracy": 0.1485825777053833,
|
|
"num_tokens": 11339384.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"entropy": 5.927007532119751,
|
|
"epoch": 0.516698172652804,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004978898815421029,
|
|
"loss": 5.882,
|
|
"mean_token_accuracy": 0.14463590383529662,
|
|
"num_tokens": 11348409.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 5.948485612869263,
|
|
"epoch": 0.5171182524679689,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004978857878723867,
|
|
"loss": 5.7826,
|
|
"mean_token_accuracy": 0.1465214103460312,
|
|
"num_tokens": 11357478.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"entropy": 5.871764278411865,
|
|
"epoch": 0.5175383322831338,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004978816902543636,
|
|
"loss": 5.7924,
|
|
"mean_token_accuracy": 0.14824822992086412,
|
|
"num_tokens": 11366379.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 5.857372522354126,
|
|
"epoch": 0.5179584120982986,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978775886881062,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.144633187353611,
|
|
"num_tokens": 11376357.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"entropy": 5.790678644180298,
|
|
"epoch": 0.5183784919134635,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000497873483173687,
|
|
"loss": 5.682,
|
|
"mean_token_accuracy": 0.1550826385617256,
|
|
"num_tokens": 11384995.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 5.803675746917724,
|
|
"epoch": 0.5187985717286284,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978693737111787,
|
|
"loss": 5.691,
|
|
"mean_token_accuracy": 0.14901078641414642,
|
|
"num_tokens": 11395363.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"entropy": 5.773939752578736,
|
|
"epoch": 0.5192186515437933,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978652603006543,
|
|
"loss": 5.6785,
|
|
"mean_token_accuracy": 0.14922358542680741,
|
|
"num_tokens": 11404511.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 5.83831205368042,
|
|
"epoch": 0.5196387313589582,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004978611429421866,
|
|
"loss": 5.7376,
|
|
"mean_token_accuracy": 0.14898759126663208,
|
|
"num_tokens": 11413400.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"entropy": 5.867534255981445,
|
|
"epoch": 0.5200588111741231,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004978570216358485,
|
|
"loss": 5.7719,
|
|
"mean_token_accuracy": 0.14096312299370767,
|
|
"num_tokens": 11423693.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 5.85771164894104,
|
|
"epoch": 0.520478890989288,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000497852896381713,
|
|
"loss": 5.7317,
|
|
"mean_token_accuracy": 0.14528233110904692,
|
|
"num_tokens": 11433195.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"entropy": 5.8870384216308596,
|
|
"epoch": 0.5208989708044528,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004978487671798531,
|
|
"loss": 5.8604,
|
|
"mean_token_accuracy": 0.13629197254776954,
|
|
"num_tokens": 11443416.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 5.938678550720215,
|
|
"epoch": 0.5213190506196177,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004978446340303422,
|
|
"loss": 5.7271,
|
|
"mean_token_accuracy": 0.15116187259554864,
|
|
"num_tokens": 11452487.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"entropy": 5.809211301803589,
|
|
"epoch": 0.5217391304347826,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004978404969332533,
|
|
"loss": 5.7517,
|
|
"mean_token_accuracy": 0.15704237520694733,
|
|
"num_tokens": 11461893.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 5.73575005531311,
|
|
"epoch": 0.5221592102499475,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004978363558886597,
|
|
"loss": 5.6754,
|
|
"mean_token_accuracy": 0.14295373037457465,
|
|
"num_tokens": 11471238.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"entropy": 5.850252771377564,
|
|
"epoch": 0.5225792900651124,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004978322108966348,
|
|
"loss": 5.7739,
|
|
"mean_token_accuracy": 0.14141838401556014,
|
|
"num_tokens": 11480571.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 5.817096996307373,
|
|
"epoch": 0.5229993698802773,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004978280619572521,
|
|
"loss": 5.7567,
|
|
"mean_token_accuracy": 0.14793166518211365,
|
|
"num_tokens": 11489552.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"entropy": 5.864131927490234,
|
|
"epoch": 0.5234194496954422,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497823909070585,
|
|
"loss": 5.8087,
|
|
"mean_token_accuracy": 0.1432569444179535,
|
|
"num_tokens": 11498715.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 5.847290849685669,
|
|
"epoch": 0.523839529510607,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978197522367071,
|
|
"loss": 5.7472,
|
|
"mean_token_accuracy": 0.14424416646361352,
|
|
"num_tokens": 11508472.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"entropy": 5.939693546295166,
|
|
"epoch": 0.5242596093257719,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004978155914556919,
|
|
"loss": 5.6864,
|
|
"mean_token_accuracy": 0.15637651830911636,
|
|
"num_tokens": 11517620.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 5.744783592224121,
|
|
"epoch": 0.5246796891409368,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004978114267276134,
|
|
"loss": 5.7336,
|
|
"mean_token_accuracy": 0.14782111793756486,
|
|
"num_tokens": 11526106.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"entropy": 5.853097581863404,
|
|
"epoch": 0.5250997689561017,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978072580525451,
|
|
"loss": 5.7751,
|
|
"mean_token_accuracy": 0.14963556379079818,
|
|
"num_tokens": 11535840.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 5.883814191818237,
|
|
"epoch": 0.5255198487712666,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000497803085430561,
|
|
"loss": 5.7622,
|
|
"mean_token_accuracy": 0.15003612414002418,
|
|
"num_tokens": 11545110.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"entropy": 5.879300594329834,
|
|
"epoch": 0.5259399285864315,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004977989088617349,
|
|
"loss": 5.7805,
|
|
"mean_token_accuracy": 0.1432628057897091,
|
|
"num_tokens": 11554382.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 5.77400393486023,
|
|
"epoch": 0.5263600084015964,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000497794728346141,
|
|
"loss": 5.632,
|
|
"mean_token_accuracy": 0.1552414707839489,
|
|
"num_tokens": 11562821.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"entropy": 5.952142190933228,
|
|
"epoch": 0.5267800882167611,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004977905438838531,
|
|
"loss": 5.8474,
|
|
"mean_token_accuracy": 0.14172168597579002,
|
|
"num_tokens": 11571705.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 5.71492829322815,
|
|
"epoch": 0.527200168031926,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004977863554749453,
|
|
"loss": 5.6778,
|
|
"mean_token_accuracy": 0.14525432735681534,
|
|
"num_tokens": 11580692.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"entropy": 5.727636861801147,
|
|
"epoch": 0.5276202478470909,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004977821631194922,
|
|
"loss": 5.686,
|
|
"mean_token_accuracy": 0.14509947448968888,
|
|
"num_tokens": 11589966.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 5.8679040431976315,
|
|
"epoch": 0.5280403276622558,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004977779668175677,
|
|
"loss": 5.7627,
|
|
"mean_token_accuracy": 0.1469483494758606,
|
|
"num_tokens": 11599627.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"entropy": 5.856904077529907,
|
|
"epoch": 0.5284604074774207,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004977737665692461,
|
|
"loss": 5.7366,
|
|
"mean_token_accuracy": 0.15558115839958192,
|
|
"num_tokens": 11608431.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 5.841502332687378,
|
|
"epoch": 0.5288804872925856,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004977695623746021,
|
|
"loss": 5.6142,
|
|
"mean_token_accuracy": 0.14905260503292084,
|
|
"num_tokens": 11617552.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"entropy": 5.712338972091675,
|
|
"epoch": 0.5293005671077504,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004977653542337099,
|
|
"loss": 5.6645,
|
|
"mean_token_accuracy": 0.15581920593976975,
|
|
"num_tokens": 11626828.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 5.804640913009644,
|
|
"epoch": 0.5297206469229153,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004977611421466443,
|
|
"loss": 5.746,
|
|
"mean_token_accuracy": 0.14610961824655533,
|
|
"num_tokens": 11635867.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"entropy": 5.886562156677246,
|
|
"epoch": 0.5301407267380802,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004977569261134797,
|
|
"loss": 5.6601,
|
|
"mean_token_accuracy": 0.15055324360728264,
|
|
"num_tokens": 11644711.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 5.830437183380127,
|
|
"epoch": 0.5305608065532451,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004977527061342908,
|
|
"loss": 5.7385,
|
|
"mean_token_accuracy": 0.15071533769369125,
|
|
"num_tokens": 11653320.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"entropy": 5.832324886322022,
|
|
"epoch": 0.53098088636841,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004977484822091524,
|
|
"loss": 5.703,
|
|
"mean_token_accuracy": 0.15310411900281906,
|
|
"num_tokens": 11662753.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 5.879701805114746,
|
|
"epoch": 0.5314009661835749,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004977442543381394,
|
|
"loss": 5.7395,
|
|
"mean_token_accuracy": 0.1498982183635235,
|
|
"num_tokens": 11671622.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"entropy": 5.854084539413452,
|
|
"epoch": 0.5318210459987398,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004977400225213266,
|
|
"loss": 5.7196,
|
|
"mean_token_accuracy": 0.14721598774194716,
|
|
"num_tokens": 11679964.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 5.763905620574951,
|
|
"epoch": 0.5322411258139046,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497735786758789,
|
|
"loss": 5.6842,
|
|
"mean_token_accuracy": 0.1521085247397423,
|
|
"num_tokens": 11688700.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"entropy": 5.846723842620849,
|
|
"epoch": 0.5326612056290695,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004977315470506016,
|
|
"loss": 5.8056,
|
|
"mean_token_accuracy": 0.14883239492774009,
|
|
"num_tokens": 11698425.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 5.966537141799927,
|
|
"epoch": 0.5330812854442344,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004977273033968397,
|
|
"loss": 5.791,
|
|
"mean_token_accuracy": 0.13928466588258742,
|
|
"num_tokens": 11707705.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"entropy": 5.8435125827789305,
|
|
"epoch": 0.5335013652593993,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004977230557975782,
|
|
"loss": 5.6783,
|
|
"mean_token_accuracy": 0.1494770586490631,
|
|
"num_tokens": 11717079.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 5.791642379760742,
|
|
"epoch": 0.5339214450745642,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004977188042528923,
|
|
"loss": 5.6678,
|
|
"mean_token_accuracy": 0.14970564991235732,
|
|
"num_tokens": 11725504.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"entropy": 5.847938060760498,
|
|
"epoch": 0.5343415248897291,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004977145487628576,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14778463244438172,
|
|
"num_tokens": 11735282.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 5.854086971282959,
|
|
"epoch": 0.534761604704894,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004977102893275494,
|
|
"loss": 5.7377,
|
|
"mean_token_accuracy": 0.14616001397371292,
|
|
"num_tokens": 11744827.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"entropy": 5.835380983352661,
|
|
"epoch": 0.5351816845200588,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000497706025947043,
|
|
"loss": 5.7012,
|
|
"mean_token_accuracy": 0.14849554300308226,
|
|
"num_tokens": 11753066.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 5.829690742492676,
|
|
"epoch": 0.5356017643352237,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004977017586214142,
|
|
"loss": 5.7175,
|
|
"mean_token_accuracy": 0.14658187404274942,
|
|
"num_tokens": 11761190.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"entropy": 5.845994329452514,
|
|
"epoch": 0.5360218441503886,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004976974873507382,
|
|
"loss": 5.6947,
|
|
"mean_token_accuracy": 0.15390099734067916,
|
|
"num_tokens": 11770321.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 5.7918110370635985,
|
|
"epoch": 0.5364419239655535,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000497693212135091,
|
|
"loss": 5.7547,
|
|
"mean_token_accuracy": 0.14563888013362886,
|
|
"num_tokens": 11778388.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"entropy": 5.857013368606568,
|
|
"epoch": 0.5368620037807184,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004976889329745482,
|
|
"loss": 5.6164,
|
|
"mean_token_accuracy": 0.15133741348981858,
|
|
"num_tokens": 11786250.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 5.720251989364624,
|
|
"epoch": 0.5372820835958833,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004976846498691857,
|
|
"loss": 5.579,
|
|
"mean_token_accuracy": 0.15662760883569718,
|
|
"num_tokens": 11794831.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"entropy": 5.777666759490967,
|
|
"epoch": 0.5377021634110482,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004976803628190792,
|
|
"loss": 5.6537,
|
|
"mean_token_accuracy": 0.15591528862714768,
|
|
"num_tokens": 11803550.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 5.767534923553467,
|
|
"epoch": 0.5381222432262129,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004976760718243047,
|
|
"loss": 5.7165,
|
|
"mean_token_accuracy": 0.14894714206457138,
|
|
"num_tokens": 11812478.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"entropy": 5.8361043453216555,
|
|
"epoch": 0.5385423230413778,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004976717768849383,
|
|
"loss": 5.6892,
|
|
"mean_token_accuracy": 0.14339745715260505,
|
|
"num_tokens": 11822463.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 5.79760046005249,
|
|
"epoch": 0.5389624028565427,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.0004976674780010561,
|
|
"loss": 5.7244,
|
|
"mean_token_accuracy": 0.13902894631028176,
|
|
"num_tokens": 11831853.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"entropy": 5.824806070327758,
|
|
"epoch": 0.5393824826717076,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000497663175172734,
|
|
"loss": 5.7457,
|
|
"mean_token_accuracy": 0.1442998580634594,
|
|
"num_tokens": 11841574.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 5.9099555015563965,
|
|
"epoch": 0.5398025624868725,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004976588684000486,
|
|
"loss": 5.8432,
|
|
"mean_token_accuracy": 0.13176233023405076,
|
|
"num_tokens": 11852489.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"entropy": 5.846707534790039,
|
|
"epoch": 0.5402226423020374,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004976545576830759,
|
|
"loss": 5.6999,
|
|
"mean_token_accuracy": 0.1471443608403206,
|
|
"num_tokens": 11861499.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 5.810786867141724,
|
|
"epoch": 0.5406427221172023,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004976502430218924,
|
|
"loss": 5.776,
|
|
"mean_token_accuracy": 0.14316292852163315,
|
|
"num_tokens": 11871685.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"entropy": 5.8063677787780765,
|
|
"epoch": 0.5410628019323671,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004976459244165744,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.14863400161266327,
|
|
"num_tokens": 11881340.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 5.772097444534301,
|
|
"epoch": 0.541482881747532,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004976416018671986,
|
|
"loss": 5.7131,
|
|
"mean_token_accuracy": 0.14742937684059143,
|
|
"num_tokens": 11890700.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"entropy": 5.814801359176636,
|
|
"epoch": 0.5419029615626969,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004976372753738415,
|
|
"loss": 5.7129,
|
|
"mean_token_accuracy": 0.14111651703715325,
|
|
"num_tokens": 11900329.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 5.9360603332519535,
|
|
"epoch": 0.5423230413778618,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004976329449365795,
|
|
"loss": 5.754,
|
|
"mean_token_accuracy": 0.1429471679031849,
|
|
"num_tokens": 11909915.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"entropy": 5.787397623062134,
|
|
"epoch": 0.5427431211930267,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004976286105554897,
|
|
"loss": 5.7645,
|
|
"mean_token_accuracy": 0.14958669245243073,
|
|
"num_tokens": 11918302.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 5.77375168800354,
|
|
"epoch": 0.5431632010081916,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004976242722306487,
|
|
"loss": 5.7198,
|
|
"mean_token_accuracy": 0.14630756974220277,
|
|
"num_tokens": 11927794.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"entropy": 5.919241952896118,
|
|
"epoch": 0.5435832808233564,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004976199299621333,
|
|
"loss": 5.747,
|
|
"mean_token_accuracy": 0.14924167543649675,
|
|
"num_tokens": 11937701.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 5.725202035903931,
|
|
"epoch": 0.5440033606385213,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004976155837500205,
|
|
"loss": 5.6509,
|
|
"mean_token_accuracy": 0.15285194665193558,
|
|
"num_tokens": 11946106.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"entropy": 5.793752574920655,
|
|
"epoch": 0.5444234404536862,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004976112335943872,
|
|
"loss": 5.5899,
|
|
"mean_token_accuracy": 0.15264788568019866,
|
|
"num_tokens": 11954604.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 5.727561092376709,
|
|
"epoch": 0.5448435202688511,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004976068794953106,
|
|
"loss": 5.655,
|
|
"mean_token_accuracy": 0.15496142357587814,
|
|
"num_tokens": 11963664.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"entropy": 5.800908708572388,
|
|
"epoch": 0.545263600084016,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004976025214528677,
|
|
"loss": 5.6569,
|
|
"mean_token_accuracy": 0.15130768865346908,
|
|
"num_tokens": 11973426.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 5.773944950103759,
|
|
"epoch": 0.5456836798991809,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004975981594671359,
|
|
"loss": 5.6981,
|
|
"mean_token_accuracy": 0.14681158736348152,
|
|
"num_tokens": 11982339.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"entropy": 5.846315574645996,
|
|
"epoch": 0.5461037597143458,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004975937935381921,
|
|
"loss": 5.7408,
|
|
"mean_token_accuracy": 0.15329586565494538,
|
|
"num_tokens": 11992016.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 5.7528393268585205,
|
|
"epoch": 0.5465238395295106,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000497589423666114,
|
|
"loss": 5.7341,
|
|
"mean_token_accuracy": 0.1440807357430458,
|
|
"num_tokens": 12000616.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"entropy": 5.6946946144104,
|
|
"epoch": 0.5469439193446755,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004975850498509789,
|
|
"loss": 5.6253,
|
|
"mean_token_accuracy": 0.15553901046514512,
|
|
"num_tokens": 12009717.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 5.767681360244751,
|
|
"epoch": 0.5473639991598404,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004975806720928642,
|
|
"loss": 5.713,
|
|
"mean_token_accuracy": 0.1479937508702278,
|
|
"num_tokens": 12018020.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"entropy": 5.797775173187256,
|
|
"epoch": 0.5477840789750053,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004975762903918475,
|
|
"loss": 5.7163,
|
|
"mean_token_accuracy": 0.14613735526800156,
|
|
"num_tokens": 12027119.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 5.875396728515625,
|
|
"epoch": 0.5482041587901701,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975719047480064,
|
|
"loss": 5.6829,
|
|
"mean_token_accuracy": 0.15304642170667648,
|
|
"num_tokens": 12035566.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"entropy": 5.761675643920898,
|
|
"epoch": 0.548624238605335,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004975675151614187,
|
|
"loss": 5.6105,
|
|
"mean_token_accuracy": 0.15602717846632003,
|
|
"num_tokens": 12044505.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 5.709016609191894,
|
|
"epoch": 0.5490443184204999,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000497563121632162,
|
|
"loss": 5.6827,
|
|
"mean_token_accuracy": 0.15345038324594498,
|
|
"num_tokens": 12053338.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"entropy": 5.784457445144653,
|
|
"epoch": 0.5494643982356647,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004975587241603142,
|
|
"loss": 5.676,
|
|
"mean_token_accuracy": 0.14854272603988647,
|
|
"num_tokens": 12063235.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 5.909809684753418,
|
|
"epoch": 0.5498844780508296,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004975543227459533,
|
|
"loss": 5.7491,
|
|
"mean_token_accuracy": 0.1429952785372734,
|
|
"num_tokens": 12072490.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"entropy": 5.8736042976379395,
|
|
"epoch": 0.5503045578659945,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004975499173891571,
|
|
"loss": 5.818,
|
|
"mean_token_accuracy": 0.14217820167541503,
|
|
"num_tokens": 12081474.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 5.804098796844483,
|
|
"epoch": 0.5507246376811594,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975455080900037,
|
|
"loss": 5.6739,
|
|
"mean_token_accuracy": 0.15498915761709214,
|
|
"num_tokens": 12090963.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"entropy": 5.811689233779907,
|
|
"epoch": 0.5511447174963243,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004975410948485713,
|
|
"loss": 5.6853,
|
|
"mean_token_accuracy": 0.1526065543293953,
|
|
"num_tokens": 12099786.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 5.74642539024353,
|
|
"epoch": 0.5515647973114892,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004975366776649379,
|
|
"loss": 5.695,
|
|
"mean_token_accuracy": 0.14672838300466537,
|
|
"num_tokens": 12108469.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"entropy": 5.774152183532715,
|
|
"epoch": 0.5519848771266541,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004975322565391818,
|
|
"loss": 5.6804,
|
|
"mean_token_accuracy": 0.1517785020172596,
|
|
"num_tokens": 12118287.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 5.879052972793579,
|
|
"epoch": 0.5524049569418189,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004975278314713814,
|
|
"loss": 5.8381,
|
|
"mean_token_accuracy": 0.14230698868632316,
|
|
"num_tokens": 12127122.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"entropy": 5.914984178543091,
|
|
"epoch": 0.5528250367569838,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004975234024616152,
|
|
"loss": 5.731,
|
|
"mean_token_accuracy": 0.15133389160037042,
|
|
"num_tokens": 12136395.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 5.734422016143799,
|
|
"epoch": 0.5532451165721487,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004975189695099613,
|
|
"loss": 5.6943,
|
|
"mean_token_accuracy": 0.15051371306180955,
|
|
"num_tokens": 12145025.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"entropy": 5.800812196731568,
|
|
"epoch": 0.5536651963873136,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004975145326164985,
|
|
"loss": 5.7429,
|
|
"mean_token_accuracy": 0.1447499178349972,
|
|
"num_tokens": 12154352.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 5.8064220428466795,
|
|
"epoch": 0.5540852762024785,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004975100917813055,
|
|
"loss": 5.6588,
|
|
"mean_token_accuracy": 0.15041681826114656,
|
|
"num_tokens": 12163802.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"entropy": 5.750297594070434,
|
|
"epoch": 0.5545053560176434,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004975056470044606,
|
|
"loss": 5.682,
|
|
"mean_token_accuracy": 0.14631521701812744,
|
|
"num_tokens": 12173111.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 5.8171515464782715,
|
|
"epoch": 0.5549254358328082,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004975011982860428,
|
|
"loss": 5.7383,
|
|
"mean_token_accuracy": 0.14391349628567696,
|
|
"num_tokens": 12182048.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"entropy": 5.812657642364502,
|
|
"epoch": 0.5553455156479731,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004974967456261309,
|
|
"loss": 5.7159,
|
|
"mean_token_accuracy": 0.15039578825235367,
|
|
"num_tokens": 12191501.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 5.857609844207763,
|
|
"epoch": 0.555765595463138,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004974922890248036,
|
|
"loss": 5.7249,
|
|
"mean_token_accuracy": 0.15451397448778154,
|
|
"num_tokens": 12201132.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"entropy": 5.899567031860352,
|
|
"epoch": 0.5561856752783029,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00049748782848214,
|
|
"loss": 5.8549,
|
|
"mean_token_accuracy": 0.14553611800074578,
|
|
"num_tokens": 12211082.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 5.807045125961304,
|
|
"epoch": 0.5566057550934678,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004974833639982192,
|
|
"loss": 5.6909,
|
|
"mean_token_accuracy": 0.15329068303108215,
|
|
"num_tokens": 12219946.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"entropy": 5.925949478149414,
|
|
"epoch": 0.5570258349086327,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00049747889557312,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.14512094482779503,
|
|
"num_tokens": 12229668.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 5.886264276504517,
|
|
"epoch": 0.5574459147237976,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004974744232069219,
|
|
"loss": 5.7574,
|
|
"mean_token_accuracy": 0.14679303765296936,
|
|
"num_tokens": 12238750.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"entropy": 5.809984493255615,
|
|
"epoch": 0.5578659945389624,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004974699468997038,
|
|
"loss": 5.7017,
|
|
"mean_token_accuracy": 0.14905162900686264,
|
|
"num_tokens": 12246825.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 5.811229848861695,
|
|
"epoch": 0.5582860743541272,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004974654666515452,
|
|
"loss": 5.6602,
|
|
"mean_token_accuracy": 0.14834603071212768,
|
|
"num_tokens": 12256413.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"entropy": 5.882418012619018,
|
|
"epoch": 0.5587061541692921,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004974609824625254,
|
|
"loss": 5.6729,
|
|
"mean_token_accuracy": 0.1607891857624054,
|
|
"num_tokens": 12265458.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 5.649556875228882,
|
|
"epoch": 0.559126233984457,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004974564943327239,
|
|
"loss": 5.6227,
|
|
"mean_token_accuracy": 0.15252939462661744,
|
|
"num_tokens": 12274124.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"entropy": 5.668555736541748,
|
|
"epoch": 0.5595463137996219,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00049745200226222,
|
|
"loss": 5.5888,
|
|
"mean_token_accuracy": 0.16476203203201295,
|
|
"num_tokens": 12283513.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 5.861951494216919,
|
|
"epoch": 0.5599663936147868,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004974475062510936,
|
|
"loss": 5.7171,
|
|
"mean_token_accuracy": 0.15322822630405425,
|
|
"num_tokens": 12292396.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"entropy": 5.834360265731812,
|
|
"epoch": 0.5603864734299517,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004974430062994242,
|
|
"loss": 5.754,
|
|
"mean_token_accuracy": 0.1490551695227623,
|
|
"num_tokens": 12301604.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 5.901991987228394,
|
|
"epoch": 0.5608065532451165,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004974385024072912,
|
|
"loss": 5.7881,
|
|
"mean_token_accuracy": 0.14175782203674317,
|
|
"num_tokens": 12310458.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"entropy": 5.967726707458496,
|
|
"epoch": 0.5612266330602814,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000497433994574775,
|
|
"loss": 5.7835,
|
|
"mean_token_accuracy": 0.1453966811299324,
|
|
"num_tokens": 12319620.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 5.85808310508728,
|
|
"epoch": 0.5616467128754463,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000497429482801955,
|
|
"loss": 5.8356,
|
|
"mean_token_accuracy": 0.1476121611893177,
|
|
"num_tokens": 12329518.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"entropy": 5.773319292068481,
|
|
"epoch": 0.5620667926906112,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004974249670889111,
|
|
"loss": 5.6512,
|
|
"mean_token_accuracy": 0.15055545866489412,
|
|
"num_tokens": 12338244.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 5.965986871719361,
|
|
"epoch": 0.5624868725057761,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004974204474357237,
|
|
"loss": 5.8233,
|
|
"mean_token_accuracy": 0.14185196608304979,
|
|
"num_tokens": 12347962.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"entropy": 5.896701097488403,
|
|
"epoch": 0.562906952320941,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004974159238424723,
|
|
"loss": 5.7434,
|
|
"mean_token_accuracy": 0.14349103569984437,
|
|
"num_tokens": 12357020.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 5.812654113769531,
|
|
"epoch": 0.5633270321361059,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004974113963092376,
|
|
"loss": 5.7151,
|
|
"mean_token_accuracy": 0.1478872776031494,
|
|
"num_tokens": 12366108.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"entropy": 5.879363203048706,
|
|
"epoch": 0.5637471119512707,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004974068648360995,
|
|
"loss": 5.646,
|
|
"mean_token_accuracy": 0.15770871341228485,
|
|
"num_tokens": 12374508.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 5.793216609954834,
|
|
"epoch": 0.5641671917664356,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004974023294231383,
|
|
"loss": 5.652,
|
|
"mean_token_accuracy": 0.15676265954971313,
|
|
"num_tokens": 12383555.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"entropy": 5.762006092071533,
|
|
"epoch": 0.5645872715816005,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004973977900704342,
|
|
"loss": 5.7612,
|
|
"mean_token_accuracy": 0.1457872360944748,
|
|
"num_tokens": 12392680.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 5.872710561752319,
|
|
"epoch": 0.5650073513967654,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004973932467780679,
|
|
"loss": 5.7963,
|
|
"mean_token_accuracy": 0.14350106567144394,
|
|
"num_tokens": 12401881.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"entropy": 5.897738790512085,
|
|
"epoch": 0.5654274312119303,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004973886995461197,
|
|
"loss": 5.7755,
|
|
"mean_token_accuracy": 0.14316605031490326,
|
|
"num_tokens": 12411487.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 5.799207353591919,
|
|
"epoch": 0.5658475110270952,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004973841483746703,
|
|
"loss": 5.594,
|
|
"mean_token_accuracy": 0.16017859652638436,
|
|
"num_tokens": 12420376.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"entropy": 5.6296477794647215,
|
|
"epoch": 0.5662675908422601,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004973795932638001,
|
|
"loss": 5.639,
|
|
"mean_token_accuracy": 0.15424187034368514,
|
|
"num_tokens": 12429518.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 5.768233728408814,
|
|
"epoch": 0.5666876706574249,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00049737503421359,
|
|
"loss": 5.6208,
|
|
"mean_token_accuracy": 0.15618278905749322,
|
|
"num_tokens": 12438952.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"entropy": 5.762353801727295,
|
|
"epoch": 0.5671077504725898,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004973704712241206,
|
|
"loss": 5.6399,
|
|
"mean_token_accuracy": 0.14973016381263732,
|
|
"num_tokens": 12448576.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 5.758606004714966,
|
|
"epoch": 0.5675278302877547,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004973659042954729,
|
|
"loss": 5.666,
|
|
"mean_token_accuracy": 0.15317632332444192,
|
|
"num_tokens": 12458166.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"entropy": 5.703948211669922,
|
|
"epoch": 0.5679479101029196,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004973613334277277,
|
|
"loss": 5.5962,
|
|
"mean_token_accuracy": 0.15764016062021255,
|
|
"num_tokens": 12467271.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 5.815484666824341,
|
|
"epoch": 0.5683679899180845,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004973567586209658,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.1427201583981514,
|
|
"num_tokens": 12476255.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"entropy": 5.838050889968872,
|
|
"epoch": 0.5687880697332494,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004973521798752686,
|
|
"loss": 5.7306,
|
|
"mean_token_accuracy": 0.1476944074034691,
|
|
"num_tokens": 12485096.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 5.906451845169068,
|
|
"epoch": 0.5692081495484141,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000497347597190717,
|
|
"loss": 5.7558,
|
|
"mean_token_accuracy": 0.1506843164563179,
|
|
"num_tokens": 12494405.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"entropy": 5.792209434509277,
|
|
"epoch": 0.569628229363579,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004973430105673921,
|
|
"loss": 5.6821,
|
|
"mean_token_accuracy": 0.14848777875304223,
|
|
"num_tokens": 12503349.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 5.828717470169067,
|
|
"epoch": 0.5700483091787439,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004973384200053754,
|
|
"loss": 5.7518,
|
|
"mean_token_accuracy": 0.15347943902015687,
|
|
"num_tokens": 12513122.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"entropy": 5.784585285186767,
|
|
"epoch": 0.5704683889939088,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000497333825504748,
|
|
"loss": 5.695,
|
|
"mean_token_accuracy": 0.14986882135272026,
|
|
"num_tokens": 12523614.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 5.838396644592285,
|
|
"epoch": 0.5708884688090737,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004973292270655914,
|
|
"loss": 5.7434,
|
|
"mean_token_accuracy": 0.143761482834816,
|
|
"num_tokens": 12532031.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"entropy": 5.926707601547241,
|
|
"epoch": 0.5713085486242386,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000497324624687987,
|
|
"loss": 5.8378,
|
|
"mean_token_accuracy": 0.1392517074942589,
|
|
"num_tokens": 12542239.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 5.917767190933228,
|
|
"epoch": 0.5717286284394035,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004973200183720164,
|
|
"loss": 5.7483,
|
|
"mean_token_accuracy": 0.14240999147295952,
|
|
"num_tokens": 12552608.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"entropy": 5.775180721282959,
|
|
"epoch": 0.5721487082545683,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004973154081177611,
|
|
"loss": 5.593,
|
|
"mean_token_accuracy": 0.15000374913215636,
|
|
"num_tokens": 12562020.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 5.760695695877075,
|
|
"epoch": 0.5725687880697332,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004973107939253027,
|
|
"loss": 5.6762,
|
|
"mean_token_accuracy": 0.1592295289039612,
|
|
"num_tokens": 12570519.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"entropy": 5.706324434280395,
|
|
"epoch": 0.5729888678848981,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004973061757947233,
|
|
"loss": 5.6616,
|
|
"mean_token_accuracy": 0.15384514778852462,
|
|
"num_tokens": 12579324.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 5.790519523620605,
|
|
"epoch": 0.573408947700063,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004973015537261043,
|
|
"loss": 5.7372,
|
|
"mean_token_accuracy": 0.1493046186864376,
|
|
"num_tokens": 12588014.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"entropy": 5.890619230270386,
|
|
"epoch": 0.5738290275152279,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004972969277195279,
|
|
"loss": 5.7305,
|
|
"mean_token_accuracy": 0.15202558934688568,
|
|
"num_tokens": 12596882.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 5.819242668151856,
|
|
"epoch": 0.5742491073303928,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004972922977750757,
|
|
"loss": 5.6515,
|
|
"mean_token_accuracy": 0.1478489086031914,
|
|
"num_tokens": 12606069.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"entropy": 5.829999208450317,
|
|
"epoch": 0.5746691871455577,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00049728766389283,
|
|
"loss": 5.6783,
|
|
"mean_token_accuracy": 0.1460999220609665,
|
|
"num_tokens": 12615167.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 5.775484275817871,
|
|
"epoch": 0.5750892669607225,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004972830260728729,
|
|
"loss": 5.7111,
|
|
"mean_token_accuracy": 0.15089115351438523,
|
|
"num_tokens": 12624230.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"entropy": 5.81471266746521,
|
|
"epoch": 0.5755093467758874,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004972783843152863,
|
|
"loss": 5.6964,
|
|
"mean_token_accuracy": 0.15319516360759736,
|
|
"num_tokens": 12633158.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 5.742516231536865,
|
|
"epoch": 0.5759294265910523,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004972737386201527,
|
|
"loss": 5.6358,
|
|
"mean_token_accuracy": 0.1493402510881424,
|
|
"num_tokens": 12641465.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"entropy": 5.772433757781982,
|
|
"epoch": 0.5763495064062172,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004972690889875541,
|
|
"loss": 5.6115,
|
|
"mean_token_accuracy": 0.15269945561885834,
|
|
"num_tokens": 12650437.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 5.9466852188110355,
|
|
"epoch": 0.5767695862213821,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004972644354175732,
|
|
"loss": 5.8321,
|
|
"mean_token_accuracy": 0.14773827642202378,
|
|
"num_tokens": 12660072.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"entropy": 5.8965418338775635,
|
|
"epoch": 0.577189666036547,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004972597779102922,
|
|
"loss": 5.844,
|
|
"mean_token_accuracy": 0.14816712588071823,
|
|
"num_tokens": 12670405.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 5.826220703125,
|
|
"epoch": 0.5776097458517119,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004972551164657937,
|
|
"loss": 5.7126,
|
|
"mean_token_accuracy": 0.15028751343488694,
|
|
"num_tokens": 12679992.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"entropy": 5.9022228717803955,
|
|
"epoch": 0.5780298256668767,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004972504510841602,
|
|
"loss": 5.7796,
|
|
"mean_token_accuracy": 0.14697190523147582,
|
|
"num_tokens": 12690289.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 5.883794593811035,
|
|
"epoch": 0.5784499054820416,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004972457817654745,
|
|
"loss": 5.7709,
|
|
"mean_token_accuracy": 0.14337689578533172,
|
|
"num_tokens": 12700518.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"entropy": 5.896582746505738,
|
|
"epoch": 0.5788699852972065,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004972411085098191,
|
|
"loss": 5.8202,
|
|
"mean_token_accuracy": 0.138790999352932,
|
|
"num_tokens": 12710603.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 5.896594381332397,
|
|
"epoch": 0.5792900651123714,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497236431317277,
|
|
"loss": 5.7086,
|
|
"mean_token_accuracy": 0.14955383241176606,
|
|
"num_tokens": 12719298.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"entropy": 5.828510808944702,
|
|
"epoch": 0.5797101449275363,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000497231750187931,
|
|
"loss": 5.7051,
|
|
"mean_token_accuracy": 0.1494380295276642,
|
|
"num_tokens": 12728368.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 5.847594785690307,
|
|
"epoch": 0.5801302247427012,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004972270651218638,
|
|
"loss": 5.769,
|
|
"mean_token_accuracy": 0.15052054449915886,
|
|
"num_tokens": 12737898.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"entropy": 5.896743059158325,
|
|
"epoch": 0.580550304557866,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004972223761191587,
|
|
"loss": 5.7024,
|
|
"mean_token_accuracy": 0.1484552301466465,
|
|
"num_tokens": 12746761.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 5.748441457748413,
|
|
"epoch": 0.5809703843730308,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004972176831798986,
|
|
"loss": 5.6317,
|
|
"mean_token_accuracy": 0.1558982439339161,
|
|
"num_tokens": 12755128.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"entropy": 5.8237542629241945,
|
|
"epoch": 0.5813904641881957,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004972129863041667,
|
|
"loss": 5.8145,
|
|
"mean_token_accuracy": 0.1419169031083584,
|
|
"num_tokens": 12764727.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 5.825289487838745,
|
|
"epoch": 0.5818105440033606,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004972082854920462,
|
|
"loss": 5.6682,
|
|
"mean_token_accuracy": 0.15212180316448212,
|
|
"num_tokens": 12773557.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"entropy": 5.780522108078003,
|
|
"epoch": 0.5822306238185255,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004972035807436203,
|
|
"loss": 5.6741,
|
|
"mean_token_accuracy": 0.15388695299625396,
|
|
"num_tokens": 12782525.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 5.874711608886718,
|
|
"epoch": 0.5826507036336904,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004971988720589723,
|
|
"loss": 5.7714,
|
|
"mean_token_accuracy": 0.14911144897341727,
|
|
"num_tokens": 12791534.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"entropy": 5.865447235107422,
|
|
"epoch": 0.5830707834488553,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004971941594381858,
|
|
"loss": 5.6622,
|
|
"mean_token_accuracy": 0.1520915597677231,
|
|
"num_tokens": 12800662.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 5.833262968063354,
|
|
"epoch": 0.5834908632640201,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004971894428813441,
|
|
"loss": 5.7134,
|
|
"mean_token_accuracy": 0.15022262334823608,
|
|
"num_tokens": 12809440.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"entropy": 5.89053783416748,
|
|
"epoch": 0.583910943079185,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000497184722388531,
|
|
"loss": 5.7974,
|
|
"mean_token_accuracy": 0.14950450211763383,
|
|
"num_tokens": 12818560.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 5.910626697540283,
|
|
"epoch": 0.5843310228943499,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004971799979598297,
|
|
"loss": 5.7158,
|
|
"mean_token_accuracy": 0.15047362595796585,
|
|
"num_tokens": 12827898.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"entropy": 5.736415719985962,
|
|
"epoch": 0.5847511027095148,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004971752695953243,
|
|
"loss": 5.6673,
|
|
"mean_token_accuracy": 0.15286629199981688,
|
|
"num_tokens": 12837199.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 5.841268587112427,
|
|
"epoch": 0.5851711825246797,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004971705372950984,
|
|
"loss": 5.6889,
|
|
"mean_token_accuracy": 0.14883269965648652,
|
|
"num_tokens": 12846493.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"entropy": 5.862727975845337,
|
|
"epoch": 0.5855912623398446,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004971658010592358,
|
|
"loss": 5.7059,
|
|
"mean_token_accuracy": 0.14308914840221404,
|
|
"num_tokens": 12855026.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 5.807987260818481,
|
|
"epoch": 0.5860113421550095,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004971610608878205,
|
|
"loss": 5.7711,
|
|
"mean_token_accuracy": 0.14490452259778977,
|
|
"num_tokens": 12864563.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"entropy": 5.884010982513428,
|
|
"epoch": 0.5864314219701743,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004971563167809363,
|
|
"loss": 5.7237,
|
|
"mean_token_accuracy": 0.15075904428958892,
|
|
"num_tokens": 12874358.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 5.7711278915405275,
|
|
"epoch": 0.5868515017853392,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004971515687386674,
|
|
"loss": 5.7117,
|
|
"mean_token_accuracy": 0.1473625972867012,
|
|
"num_tokens": 12883110.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"entropy": 5.803575611114502,
|
|
"epoch": 0.5872715816005041,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004971468167610978,
|
|
"loss": 5.7851,
|
|
"mean_token_accuracy": 0.15010628029704093,
|
|
"num_tokens": 12892977.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 5.790566396713257,
|
|
"epoch": 0.587691661415669,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004971420608483117,
|
|
"loss": 5.6004,
|
|
"mean_token_accuracy": 0.1545809641480446,
|
|
"num_tokens": 12902327.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"entropy": 5.741348314285278,
|
|
"epoch": 0.5881117412308339,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004971373010003936,
|
|
"loss": 5.6022,
|
|
"mean_token_accuracy": 0.16168920323252678,
|
|
"num_tokens": 12911957.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 5.8003096103668215,
|
|
"epoch": 0.5885318210459988,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004971325372174274,
|
|
"loss": 5.6907,
|
|
"mean_token_accuracy": 0.14657490849494934,
|
|
"num_tokens": 12920380.0,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"entropy": 5.811933612823486,
|
|
"epoch": 0.5889519008611637,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004971277694994976,
|
|
"loss": 5.7533,
|
|
"mean_token_accuracy": 0.15078987032175065,
|
|
"num_tokens": 12929670.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 5.819301414489746,
|
|
"epoch": 0.5893719806763285,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000497122997846689,
|
|
"loss": 5.6612,
|
|
"mean_token_accuracy": 0.1566910207271576,
|
|
"num_tokens": 12938185.0,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"entropy": 5.85056962966919,
|
|
"epoch": 0.5897920604914934,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004971182222590857,
|
|
"loss": 5.6984,
|
|
"mean_token_accuracy": 0.15590957552194595,
|
|
"num_tokens": 12947706.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 5.766946744918823,
|
|
"epoch": 0.5902121403066583,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004971134427367725,
|
|
"loss": 5.6836,
|
|
"mean_token_accuracy": 0.14876563102006912,
|
|
"num_tokens": 12957393.0,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"entropy": 5.863473749160766,
|
|
"epoch": 0.5906322201218231,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497108659279834,
|
|
"loss": 5.5813,
|
|
"mean_token_accuracy": 0.1580106034874916,
|
|
"num_tokens": 12967165.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 5.893796777725219,
|
|
"epoch": 0.591052299936988,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004971038718883551,
|
|
"loss": 5.7311,
|
|
"mean_token_accuracy": 0.14258148968219758,
|
|
"num_tokens": 12976490.0,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"entropy": 5.8169300079345705,
|
|
"epoch": 0.5914723797521529,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004970990805624203,
|
|
"loss": 5.7245,
|
|
"mean_token_accuracy": 0.1458576127886772,
|
|
"num_tokens": 12985423.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 5.806120443344116,
|
|
"epoch": 0.5918924595673178,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004970942853021147,
|
|
"loss": 5.6187,
|
|
"mean_token_accuracy": 0.15678810328245163,
|
|
"num_tokens": 12994510.0,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"entropy": 5.8349559783935545,
|
|
"epoch": 0.5923125393824826,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004970894861075232,
|
|
"loss": 5.734,
|
|
"mean_token_accuracy": 0.1486038699746132,
|
|
"num_tokens": 13003383.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 5.833832693099976,
|
|
"epoch": 0.5927326191976475,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004970846829787309,
|
|
"loss": 5.6695,
|
|
"mean_token_accuracy": 0.15129955112934113,
|
|
"num_tokens": 13012550.0,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"entropy": 5.845009517669678,
|
|
"epoch": 0.5931526990128124,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004970798759158227,
|
|
"loss": 5.7421,
|
|
"mean_token_accuracy": 0.14426639974117278,
|
|
"num_tokens": 13022066.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 5.804647397994995,
|
|
"epoch": 0.5935727788279773,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004970750649188839,
|
|
"loss": 5.711,
|
|
"mean_token_accuracy": 0.15260717198252677,
|
|
"num_tokens": 13031008.0,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"entropy": 5.774487495422363,
|
|
"epoch": 0.5939928586431422,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004970702499879998,
|
|
"loss": 5.6978,
|
|
"mean_token_accuracy": 0.14794613867998124,
|
|
"num_tokens": 13040366.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 5.774663066864013,
|
|
"epoch": 0.5944129384583071,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004970654311232554,
|
|
"loss": 5.7282,
|
|
"mean_token_accuracy": 0.14623787105083466,
|
|
"num_tokens": 13051140.0,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"entropy": 5.849271965026856,
|
|
"epoch": 0.594833018273472,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004970606083247362,
|
|
"loss": 5.6443,
|
|
"mean_token_accuracy": 0.15294349193572998,
|
|
"num_tokens": 13059835.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 5.7127063274383545,
|
|
"epoch": 0.5952530980886368,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004970557815925278,
|
|
"loss": 5.5898,
|
|
"mean_token_accuracy": 0.14923029839992524,
|
|
"num_tokens": 13068909.0,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"entropy": 5.729467248916626,
|
|
"epoch": 0.5956731779038017,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 0.0004970509509267155,
|
|
"loss": 5.6618,
|
|
"mean_token_accuracy": 0.14696715027093887,
|
|
"num_tokens": 13078380.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 5.90779447555542,
|
|
"epoch": 0.5960932577189666,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004970461163273849,
|
|
"loss": 5.7102,
|
|
"mean_token_accuracy": 0.15209844410419465,
|
|
"num_tokens": 13087774.0,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"entropy": 5.781322765350342,
|
|
"epoch": 0.5965133375341315,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004970412777946219,
|
|
"loss": 5.5491,
|
|
"mean_token_accuracy": 0.1548515573143959,
|
|
"num_tokens": 13095938.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 5.7372105598449705,
|
|
"epoch": 0.5969334173492964,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004970364353285117,
|
|
"loss": 5.6888,
|
|
"mean_token_accuracy": 0.15444473102688788,
|
|
"num_tokens": 13104661.0,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"entropy": 5.844806241989136,
|
|
"epoch": 0.5973534971644613,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004970315889291405,
|
|
"loss": 5.6731,
|
|
"mean_token_accuracy": 0.1474146157503128,
|
|
"num_tokens": 13114505.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 5.694882488250732,
|
|
"epoch": 0.5977735769796261,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004970267385965941,
|
|
"loss": 5.6245,
|
|
"mean_token_accuracy": 0.15627836883068086,
|
|
"num_tokens": 13124590.0,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"entropy": 5.715419483184815,
|
|
"epoch": 0.598193656794791,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004970218843309583,
|
|
"loss": 5.6087,
|
|
"mean_token_accuracy": 0.1559140369296074,
|
|
"num_tokens": 13134026.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 5.890923166275025,
|
|
"epoch": 0.5986137366099559,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004970170261323192,
|
|
"loss": 5.7662,
|
|
"mean_token_accuracy": 0.15187639147043228,
|
|
"num_tokens": 13142654.0,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"entropy": 5.7584481716156,
|
|
"epoch": 0.5990338164251208,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004970121640007627,
|
|
"loss": 5.6728,
|
|
"mean_token_accuracy": 0.1504793107509613,
|
|
"num_tokens": 13151177.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 5.807246541976928,
|
|
"epoch": 0.5994538962402857,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004970072979363751,
|
|
"loss": 5.6657,
|
|
"mean_token_accuracy": 0.1458762139081955,
|
|
"num_tokens": 13159689.0,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"entropy": 5.796993541717529,
|
|
"epoch": 0.5998739760554506,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004970024279392425,
|
|
"loss": 5.7087,
|
|
"mean_token_accuracy": 0.1491813488304615,
|
|
"num_tokens": 13168601.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 5.799499607086181,
|
|
"epoch": 0.6002940558706155,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004969975540094513,
|
|
"loss": 5.6911,
|
|
"mean_token_accuracy": 0.1491454616189003,
|
|
"num_tokens": 13177035.0,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"entropy": 5.840288925170898,
|
|
"epoch": 0.6007141356857802,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004969926761470876,
|
|
"loss": 5.6471,
|
|
"mean_token_accuracy": 0.15894681811332703,
|
|
"num_tokens": 13185444.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 5.787335777282715,
|
|
"epoch": 0.6011342155009451,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000496987794352238,
|
|
"loss": 5.6543,
|
|
"mean_token_accuracy": 0.15718057453632356,
|
|
"num_tokens": 13194987.0,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"entropy": 5.711384534835815,
|
|
"epoch": 0.60155429531611,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969829086249889,
|
|
"loss": 5.6887,
|
|
"mean_token_accuracy": 0.14929505437612534,
|
|
"num_tokens": 13203807.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 5.874243068695068,
|
|
"epoch": 0.6019743751312749,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000496978018965427,
|
|
"loss": 5.7803,
|
|
"mean_token_accuracy": 0.14797215312719345,
|
|
"num_tokens": 13214362.0,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"entropy": 5.938519763946533,
|
|
"epoch": 0.6023944549464398,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004969731253736387,
|
|
"loss": 5.7816,
|
|
"mean_token_accuracy": 0.14409856349229813,
|
|
"num_tokens": 13224192.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 5.824232769012451,
|
|
"epoch": 0.6028145347616047,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004969682278497109,
|
|
"loss": 5.7438,
|
|
"mean_token_accuracy": 0.149906075745821,
|
|
"num_tokens": 13234430.0,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"entropy": 5.766725778579712,
|
|
"epoch": 0.6032346145767696,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004969633263937301,
|
|
"loss": 5.6477,
|
|
"mean_token_accuracy": 0.15190263986587524,
|
|
"num_tokens": 13243681.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 5.959778547286987,
|
|
"epoch": 0.6036546943919344,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004969584210057832,
|
|
"loss": 5.9315,
|
|
"mean_token_accuracy": 0.13914565443992616,
|
|
"num_tokens": 13254334.0,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"entropy": 5.908876419067383,
|
|
"epoch": 0.6040747742070993,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004969535116859573,
|
|
"loss": 5.7233,
|
|
"mean_token_accuracy": 0.15498362332582474,
|
|
"num_tokens": 13263781.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 5.757447004318237,
|
|
"epoch": 0.6044948540222642,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004969485984343392,
|
|
"loss": 5.633,
|
|
"mean_token_accuracy": 0.15214563608169557,
|
|
"num_tokens": 13272831.0,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"entropy": 5.840635204315186,
|
|
"epoch": 0.6049149338374291,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000496943681251016,
|
|
"loss": 5.6943,
|
|
"mean_token_accuracy": 0.15125853568315506,
|
|
"num_tokens": 13281621.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 5.772113513946533,
|
|
"epoch": 0.605335013652594,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004969387601360747,
|
|
"loss": 5.6754,
|
|
"mean_token_accuracy": 0.1471445269882679,
|
|
"num_tokens": 13291021.0,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"entropy": 5.837057733535767,
|
|
"epoch": 0.6057550934677589,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969338350896026,
|
|
"loss": 5.6877,
|
|
"mean_token_accuracy": 0.15487841069698333,
|
|
"num_tokens": 13299752.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 5.855220079421997,
|
|
"epoch": 0.6061751732829238,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004969289061116869,
|
|
"loss": 5.7219,
|
|
"mean_token_accuracy": 0.14336248189210893,
|
|
"num_tokens": 13309112.0,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"entropy": 5.829800653457641,
|
|
"epoch": 0.6065952530980886,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004969239732024148,
|
|
"loss": 5.7305,
|
|
"mean_token_accuracy": 0.15485918670892715,
|
|
"num_tokens": 13318328.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 5.693413162231446,
|
|
"epoch": 0.6070153329132535,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969190363618739,
|
|
"loss": 5.6063,
|
|
"mean_token_accuracy": 0.149900983273983,
|
|
"num_tokens": 13328940.0,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"entropy": 5.717437219619751,
|
|
"epoch": 0.6074354127284184,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004969140955901516,
|
|
"loss": 5.6137,
|
|
"mean_token_accuracy": 0.15410374999046325,
|
|
"num_tokens": 13337829.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 5.903831624984742,
|
|
"epoch": 0.6078554925435833,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004969091508873352,
|
|
"loss": 5.804,
|
|
"mean_token_accuracy": 0.14683766812086105,
|
|
"num_tokens": 13348289.0,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"entropy": 5.835478973388672,
|
|
"epoch": 0.6082755723587482,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004969042022535126,
|
|
"loss": 5.708,
|
|
"mean_token_accuracy": 0.15235030949115752,
|
|
"num_tokens": 13357292.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 5.843629169464111,
|
|
"epoch": 0.6086956521739131,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004968992496887713,
|
|
"loss": 5.7554,
|
|
"mean_token_accuracy": 0.14912576526403426,
|
|
"num_tokens": 13366640.0,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"entropy": 5.844546985626221,
|
|
"epoch": 0.609115731989078,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004968942931931989,
|
|
"loss": 5.6594,
|
|
"mean_token_accuracy": 0.1629155233502388,
|
|
"num_tokens": 13377509.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 5.813440895080566,
|
|
"epoch": 0.6095358118042428,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004968893327668835,
|
|
"loss": 5.749,
|
|
"mean_token_accuracy": 0.14384781569242477,
|
|
"num_tokens": 13386573.0,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"entropy": 5.739164876937866,
|
|
"epoch": 0.6099558916194077,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004968843684099128,
|
|
"loss": 5.607,
|
|
"mean_token_accuracy": 0.1540288582444191,
|
|
"num_tokens": 13395790.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 5.730731964111328,
|
|
"epoch": 0.6103759714345726,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004968794001223747,
|
|
"loss": 5.658,
|
|
"mean_token_accuracy": 0.1504225805401802,
|
|
"num_tokens": 13405265.0,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"entropy": 5.77107720375061,
|
|
"epoch": 0.6107960512497375,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004968744279043574,
|
|
"loss": 5.6733,
|
|
"mean_token_accuracy": 0.15312831848859787,
|
|
"num_tokens": 13413796.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 5.86907844543457,
|
|
"epoch": 0.6112161310649024,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004968694517559488,
|
|
"loss": 5.7213,
|
|
"mean_token_accuracy": 0.15257197394967079,
|
|
"num_tokens": 13423299.0,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"entropy": 5.709070634841919,
|
|
"epoch": 0.6116362108800673,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004968644716772371,
|
|
"loss": 5.6292,
|
|
"mean_token_accuracy": 0.15693681687116623,
|
|
"num_tokens": 13432267.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 5.737072992324829,
|
|
"epoch": 0.612056290695232,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004968594876683105,
|
|
"loss": 5.733,
|
|
"mean_token_accuracy": 0.14609354361891747,
|
|
"num_tokens": 13442332.0,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"entropy": 5.8117883682250975,
|
|
"epoch": 0.6124763705103969,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004968544997292572,
|
|
"loss": 5.6747,
|
|
"mean_token_accuracy": 0.15259024500846863,
|
|
"num_tokens": 13451700.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 5.812619876861572,
|
|
"epoch": 0.6128964503255618,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004968495078601659,
|
|
"loss": 5.7774,
|
|
"mean_token_accuracy": 0.14332814291119575,
|
|
"num_tokens": 13461009.0,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"entropy": 5.858203887939453,
|
|
"epoch": 0.6133165301407267,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004968445120611247,
|
|
"loss": 5.7707,
|
|
"mean_token_accuracy": 0.15080213099718093,
|
|
"num_tokens": 13470341.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 5.905436229705811,
|
|
"epoch": 0.6137366099558916,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004968395123322223,
|
|
"loss": 5.7003,
|
|
"mean_token_accuracy": 0.1523931697010994,
|
|
"num_tokens": 13479898.0,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"entropy": 5.742975854873658,
|
|
"epoch": 0.6141566897710565,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000496834508673547,
|
|
"loss": 5.6046,
|
|
"mean_token_accuracy": 0.15081604719161987,
|
|
"num_tokens": 13488116.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 5.723895263671875,
|
|
"epoch": 0.6145767695862214,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004968295010851877,
|
|
"loss": 5.6474,
|
|
"mean_token_accuracy": 0.15416487902402878,
|
|
"num_tokens": 13497814.0,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"entropy": 5.786228084564209,
|
|
"epoch": 0.6149968494013862,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004968244895672331,
|
|
"loss": 5.6679,
|
|
"mean_token_accuracy": 0.14462938904762268,
|
|
"num_tokens": 13506617.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 5.833630132675171,
|
|
"epoch": 0.6154169292165511,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004968194741197718,
|
|
"loss": 5.8051,
|
|
"mean_token_accuracy": 0.1436678983271122,
|
|
"num_tokens": 13516632.0,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"entropy": 5.897484588623047,
|
|
"epoch": 0.615837009031716,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004968144547428927,
|
|
"loss": 5.7291,
|
|
"mean_token_accuracy": 0.15222294852137566,
|
|
"num_tokens": 13526452.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 5.792807674407959,
|
|
"epoch": 0.6162570888468809,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004968094314366848,
|
|
"loss": 5.6406,
|
|
"mean_token_accuracy": 0.150718155503273,
|
|
"num_tokens": 13535663.0,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"entropy": 5.687614870071411,
|
|
"epoch": 0.6166771686620458,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000496804404201237,
|
|
"loss": 5.558,
|
|
"mean_token_accuracy": 0.16134363710880278,
|
|
"num_tokens": 13544574.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 5.88130555152893,
|
|
"epoch": 0.6170972484772107,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004967993730366385,
|
|
"loss": 5.7309,
|
|
"mean_token_accuracy": 0.15020160600543023,
|
|
"num_tokens": 13553041.0,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"entropy": 5.799270153045654,
|
|
"epoch": 0.6175173282923756,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004967943379429781,
|
|
"loss": 5.7106,
|
|
"mean_token_accuracy": 0.14654484167695045,
|
|
"num_tokens": 13562108.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 5.930500316619873,
|
|
"epoch": 0.6179374081075404,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004967892989203454,
|
|
"loss": 5.8659,
|
|
"mean_token_accuracy": 0.14354829862713814,
|
|
"num_tokens": 13571500.0,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"entropy": 5.872519779205322,
|
|
"epoch": 0.6183574879227053,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004967842559688295,
|
|
"loss": 5.7577,
|
|
"mean_token_accuracy": 0.14510439038276673,
|
|
"num_tokens": 13581304.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 5.81227593421936,
|
|
"epoch": 0.6187775677378702,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004967792090885195,
|
|
"loss": 5.6444,
|
|
"mean_token_accuracy": 0.15179503858089446,
|
|
"num_tokens": 13590734.0,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"entropy": 5.746864557266235,
|
|
"epoch": 0.6191976475530351,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004967741582795052,
|
|
"loss": 5.6924,
|
|
"mean_token_accuracy": 0.14929923564195632,
|
|
"num_tokens": 13600486.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 5.881101942062378,
|
|
"epoch": 0.6196177273682,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004967691035418758,
|
|
"loss": 5.7268,
|
|
"mean_token_accuracy": 0.14389215558767318,
|
|
"num_tokens": 13610542.0,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"entropy": 5.792819786071777,
|
|
"epoch": 0.6200378071833649,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.000496764044875721,
|
|
"loss": 5.6759,
|
|
"mean_token_accuracy": 0.15460289865732194,
|
|
"num_tokens": 13619431.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 5.761080598831176,
|
|
"epoch": 0.6204578869985298,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004967589822811303,
|
|
"loss": 5.6957,
|
|
"mean_token_accuracy": 0.14801864922046662,
|
|
"num_tokens": 13629930.0,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"entropy": 5.956879663467407,
|
|
"epoch": 0.6208779668136946,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004967539157581934,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14267176687717437,
|
|
"num_tokens": 13639439.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 5.9114847660064695,
|
|
"epoch": 0.6212980466288595,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000496748845307,
|
|
"loss": 5.7476,
|
|
"mean_token_accuracy": 0.15258604139089585,
|
|
"num_tokens": 13648548.0,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"entropy": 5.858182144165039,
|
|
"epoch": 0.6217181264440244,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004967437709276401,
|
|
"loss": 5.7985,
|
|
"mean_token_accuracy": 0.15154744163155556,
|
|
"num_tokens": 13657658.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 5.721544599533081,
|
|
"epoch": 0.6221382062591893,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004967386926202034,
|
|
"loss": 5.5518,
|
|
"mean_token_accuracy": 0.15903828144073487,
|
|
"num_tokens": 13666763.0,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"entropy": 5.837467288970947,
|
|
"epoch": 0.6225582860743542,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00049673361038478,
|
|
"loss": 5.8103,
|
|
"mean_token_accuracy": 0.14174049571156502,
|
|
"num_tokens": 13676527.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 5.855217123031617,
|
|
"epoch": 0.622978365889519,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004967285242214599,
|
|
"loss": 5.7674,
|
|
"mean_token_accuracy": 0.149812014400959,
|
|
"num_tokens": 13685404.0,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"entropy": 5.782896041870117,
|
|
"epoch": 0.6233984457046838,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000496723434130333,
|
|
"loss": 5.5821,
|
|
"mean_token_accuracy": 0.15357585549354552,
|
|
"num_tokens": 13693118.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 5.7227521419525145,
|
|
"epoch": 0.6238185255198487,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004967183401114898,
|
|
"loss": 5.6601,
|
|
"mean_token_accuracy": 0.15249475762248038,
|
|
"num_tokens": 13702015.0,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"entropy": 5.806180191040039,
|
|
"epoch": 0.6242386053350136,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004967132421650203,
|
|
"loss": 5.6877,
|
|
"mean_token_accuracy": 0.14611244574189186,
|
|
"num_tokens": 13711658.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 5.766854763031006,
|
|
"epoch": 0.6246586851501785,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004967081402910149,
|
|
"loss": 5.6979,
|
|
"mean_token_accuracy": 0.14979787766933442,
|
|
"num_tokens": 13720718.0,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"entropy": 5.728975391387939,
|
|
"epoch": 0.6250787649653434,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496703034489564,
|
|
"loss": 5.5606,
|
|
"mean_token_accuracy": 0.1568959876894951,
|
|
"num_tokens": 13729364.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 5.909390020370483,
|
|
"epoch": 0.6254988447805083,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004966979247607579,
|
|
"loss": 5.8725,
|
|
"mean_token_accuracy": 0.14035747721791267,
|
|
"num_tokens": 13739436.0,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"entropy": 5.9296684741973875,
|
|
"epoch": 0.6259189245956732,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004966928111046873,
|
|
"loss": 5.7708,
|
|
"mean_token_accuracy": 0.15743647813796996,
|
|
"num_tokens": 13749196.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 5.783377313613892,
|
|
"epoch": 0.626339004410838,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004966876935214426,
|
|
"loss": 5.6254,
|
|
"mean_token_accuracy": 0.15206747651100158,
|
|
"num_tokens": 13758414.0,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"entropy": 5.766037368774414,
|
|
"epoch": 0.6267590842260029,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0004966825720111147,
|
|
"loss": 5.6562,
|
|
"mean_token_accuracy": 0.14928966909646987,
|
|
"num_tokens": 13767496.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 5.811860084533691,
|
|
"epoch": 0.6271791640411678,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004966774465737942,
|
|
"loss": 5.8047,
|
|
"mean_token_accuracy": 0.15070491954684256,
|
|
"num_tokens": 13777033.0,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"entropy": 5.844302463531494,
|
|
"epoch": 0.6275992438563327,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004966723172095717,
|
|
"loss": 5.7583,
|
|
"mean_token_accuracy": 0.14748911708593368,
|
|
"num_tokens": 13786313.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 5.826303386688233,
|
|
"epoch": 0.6280193236714976,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004966671839185384,
|
|
"loss": 5.691,
|
|
"mean_token_accuracy": 0.1544649474322796,
|
|
"num_tokens": 13795257.0,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"entropy": 5.733129787445068,
|
|
"epoch": 0.6284394034866625,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004966620467007851,
|
|
"loss": 5.6151,
|
|
"mean_token_accuracy": 0.15482667088508606,
|
|
"num_tokens": 13804582.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 5.708710527420044,
|
|
"epoch": 0.6288594833018274,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004966569055564027,
|
|
"loss": 5.5858,
|
|
"mean_token_accuracy": 0.1517590843141079,
|
|
"num_tokens": 13813248.0,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"entropy": 5.892451477050781,
|
|
"epoch": 0.6292795631169922,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004966517604854823,
|
|
"loss": 5.8557,
|
|
"mean_token_accuracy": 0.13463475033640862,
|
|
"num_tokens": 13823301.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 5.816387891769409,
|
|
"epoch": 0.6296996429321571,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004966466114881152,
|
|
"loss": 5.5904,
|
|
"mean_token_accuracy": 0.15593330711126327,
|
|
"num_tokens": 13832040.0,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"entropy": 5.830536413192749,
|
|
"epoch": 0.630119722747322,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004966414585643925,
|
|
"loss": 5.7743,
|
|
"mean_token_accuracy": 0.14742243885993958,
|
|
"num_tokens": 13841874.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 5.7584226608276365,
|
|
"epoch": 0.6305398025624869,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004966363017144055,
|
|
"loss": 5.6126,
|
|
"mean_token_accuracy": 0.15902097374200821,
|
|
"num_tokens": 13850755.0,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"entropy": 5.788242483139038,
|
|
"epoch": 0.6309598823776518,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004966311409382455,
|
|
"loss": 5.6797,
|
|
"mean_token_accuracy": 0.14931050986051558,
|
|
"num_tokens": 13860009.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 5.736308908462524,
|
|
"epoch": 0.6313799621928167,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004966259762360039,
|
|
"loss": 5.5946,
|
|
"mean_token_accuracy": 0.15429836511611938,
|
|
"num_tokens": 13868476.0,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"entropy": 5.711131143569946,
|
|
"epoch": 0.6318000420079816,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004966208076077723,
|
|
"loss": 5.6093,
|
|
"mean_token_accuracy": 0.15463593304157258,
|
|
"num_tokens": 13877367.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 5.750036096572876,
|
|
"epoch": 0.6322201218231464,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004966156350536422,
|
|
"loss": 5.6935,
|
|
"mean_token_accuracy": 0.14963461458683014,
|
|
"num_tokens": 13885985.0,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"entropy": 5.755751752853394,
|
|
"epoch": 0.6326402016383113,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004966104585737054,
|
|
"loss": 5.61,
|
|
"mean_token_accuracy": 0.15479331612586975,
|
|
"num_tokens": 13895059.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 5.780548143386841,
|
|
"epoch": 0.6330602814534761,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004966052781680534,
|
|
"loss": 5.6767,
|
|
"mean_token_accuracy": 0.14704100489616395,
|
|
"num_tokens": 13903789.0,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"entropy": 5.845569133758545,
|
|
"epoch": 0.633480361268641,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004966000938367778,
|
|
"loss": 5.6591,
|
|
"mean_token_accuracy": 0.15396612286567687,
|
|
"num_tokens": 13913377.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 5.6942973136901855,
|
|
"epoch": 0.6339004410838059,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004965949055799708,
|
|
"loss": 5.6186,
|
|
"mean_token_accuracy": 0.1588241770863533,
|
|
"num_tokens": 13922141.0,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"entropy": 5.787711143493652,
|
|
"epoch": 0.6343205208989708,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004965897133977241,
|
|
"loss": 5.6597,
|
|
"mean_token_accuracy": 0.1402692511677742,
|
|
"num_tokens": 13930717.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 5.825317001342773,
|
|
"epoch": 0.6347406007141357,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004965845172901298,
|
|
"loss": 5.7464,
|
|
"mean_token_accuracy": 0.14808339700102807,
|
|
"num_tokens": 13940344.0,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"entropy": 5.7218469142913815,
|
|
"epoch": 0.6351606805293005,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004965793172572798,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.15380775630474092,
|
|
"num_tokens": 13948400.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 5.710135746002197,
|
|
"epoch": 0.6355807603444654,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004965741132992663,
|
|
"loss": 5.6947,
|
|
"mean_token_accuracy": 0.14487617537379266,
|
|
"num_tokens": 13957939.0,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"entropy": 5.832439231872558,
|
|
"epoch": 0.6360008401596303,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004965689054161814,
|
|
"loss": 5.6573,
|
|
"mean_token_accuracy": 0.1547864407300949,
|
|
"num_tokens": 13966943.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 5.738895320892334,
|
|
"epoch": 0.6364209199747952,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004965636936081176,
|
|
"loss": 5.5722,
|
|
"mean_token_accuracy": 0.1546689599752426,
|
|
"num_tokens": 13975850.0,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"entropy": 5.806326103210449,
|
|
"epoch": 0.6368409997899601,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000496558477875167,
|
|
"loss": 5.6725,
|
|
"mean_token_accuracy": 0.15719727128744126,
|
|
"num_tokens": 13985059.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 5.77093358039856,
|
|
"epoch": 0.637261079605125,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496553258217422,
|
|
"loss": 5.7215,
|
|
"mean_token_accuracy": 0.1449730947613716,
|
|
"num_tokens": 13993571.0,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"entropy": 5.842133378982544,
|
|
"epoch": 0.6376811594202898,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004965480346349751,
|
|
"loss": 5.7185,
|
|
"mean_token_accuracy": 0.15069702565670012,
|
|
"num_tokens": 14002326.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 5.9778131484985355,
|
|
"epoch": 0.6381012392354547,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.000496542807127919,
|
|
"loss": 5.8686,
|
|
"mean_token_accuracy": 0.14351749792695045,
|
|
"num_tokens": 14012002.0,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"entropy": 5.788293838500977,
|
|
"epoch": 0.6385213190506196,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496537575696346,
|
|
"loss": 5.7363,
|
|
"mean_token_accuracy": 0.14434802830219268,
|
|
"num_tokens": 14022085.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 5.704484844207764,
|
|
"epoch": 0.6389413988657845,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004965323403403488,
|
|
"loss": 5.6045,
|
|
"mean_token_accuracy": 0.15442810356616973,
|
|
"num_tokens": 14030706.0,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"entropy": 5.77836651802063,
|
|
"epoch": 0.6393614786809494,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004965271010600205,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.15519261509180068,
|
|
"num_tokens": 14039520.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 5.822714900970459,
|
|
"epoch": 0.6397815584961143,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004965218578554535,
|
|
"loss": 5.7178,
|
|
"mean_token_accuracy": 0.15360228195786477,
|
|
"num_tokens": 14048407.0,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"entropy": 5.711956024169922,
|
|
"epoch": 0.6402016383112792,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000496516610726741,
|
|
"loss": 5.6573,
|
|
"mean_token_accuracy": 0.158063705265522,
|
|
"num_tokens": 14057534.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 5.765710496902466,
|
|
"epoch": 0.640621718126444,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004965113596739759,
|
|
"loss": 5.6129,
|
|
"mean_token_accuracy": 0.1602526545524597,
|
|
"num_tokens": 14065992.0,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"entropy": 5.712855339050293,
|
|
"epoch": 0.6410417979416089,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004965061046972508,
|
|
"loss": 5.6062,
|
|
"mean_token_accuracy": 0.15307263806462287,
|
|
"num_tokens": 14074806.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 5.752716493606568,
|
|
"epoch": 0.6414618777567738,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004965008457966594,
|
|
"loss": 5.6501,
|
|
"mean_token_accuracy": 0.15263762921094895,
|
|
"num_tokens": 14083813.0,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"entropy": 5.762417888641357,
|
|
"epoch": 0.6418819575719387,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004964955829722945,
|
|
"loss": 5.5858,
|
|
"mean_token_accuracy": 0.1599087104201317,
|
|
"num_tokens": 14092193.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 5.84725341796875,
|
|
"epoch": 0.6423020373871036,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004964903162242493,
|
|
"loss": 5.7916,
|
|
"mean_token_accuracy": 0.14413690567016602,
|
|
"num_tokens": 14102797.0,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"entropy": 5.76859679222107,
|
|
"epoch": 0.6427221172022685,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004964850455526173,
|
|
"loss": 5.6637,
|
|
"mean_token_accuracy": 0.15364854410290718,
|
|
"num_tokens": 14112226.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 5.661821556091309,
|
|
"epoch": 0.6431421970174334,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004964797709574917,
|
|
"loss": 5.5939,
|
|
"mean_token_accuracy": 0.15402402132749557,
|
|
"num_tokens": 14121775.0,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"entropy": 5.719243478775025,
|
|
"epoch": 0.6435622768325981,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000496474492438966,
|
|
"loss": 5.5856,
|
|
"mean_token_accuracy": 0.15579498410224915,
|
|
"num_tokens": 14130415.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 5.75182991027832,
|
|
"epoch": 0.643982356647763,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964692099971338,
|
|
"loss": 5.6058,
|
|
"mean_token_accuracy": 0.1568465366959572,
|
|
"num_tokens": 14140204.0,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"entropy": 5.736771440505981,
|
|
"epoch": 0.6444024364629279,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964639236320885,
|
|
"loss": 5.567,
|
|
"mean_token_accuracy": 0.15371138900518416,
|
|
"num_tokens": 14149595.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 5.714345407485962,
|
|
"epoch": 0.6448225162780928,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004964586333439239,
|
|
"loss": 5.6346,
|
|
"mean_token_accuracy": 0.15398874282836914,
|
|
"num_tokens": 14158865.0,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"entropy": 5.78523097038269,
|
|
"epoch": 0.6452425960932577,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004964533391327335,
|
|
"loss": 5.5938,
|
|
"mean_token_accuracy": 0.158450847864151,
|
|
"num_tokens": 14167962.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 5.816212701797485,
|
|
"epoch": 0.6456626759084226,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004964480409986113,
|
|
"loss": 5.6465,
|
|
"mean_token_accuracy": 0.1606015980243683,
|
|
"num_tokens": 14176479.0,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"entropy": 5.829603910446167,
|
|
"epoch": 0.6460827557235875,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004964427389416512,
|
|
"loss": 5.6739,
|
|
"mean_token_accuracy": 0.14969076216220856,
|
|
"num_tokens": 14185408.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 5.702767419815063,
|
|
"epoch": 0.6465028355387523,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000496437432961947,
|
|
"loss": 5.6745,
|
|
"mean_token_accuracy": 0.15580256432294845,
|
|
"num_tokens": 14194155.0,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"entropy": 5.729840040206909,
|
|
"epoch": 0.6469229153539172,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964321230595925,
|
|
"loss": 5.6916,
|
|
"mean_token_accuracy": 0.1505993440747261,
|
|
"num_tokens": 14202779.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 5.923639154434204,
|
|
"epoch": 0.6473429951690821,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004964268092346821,
|
|
"loss": 5.868,
|
|
"mean_token_accuracy": 0.14160000756382943,
|
|
"num_tokens": 14212552.0,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"entropy": 5.925770807266235,
|
|
"epoch": 0.647763074984247,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004964214914873098,
|
|
"loss": 5.6684,
|
|
"mean_token_accuracy": 0.14924321398139,
|
|
"num_tokens": 14222783.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 5.70919623374939,
|
|
"epoch": 0.6481831547994119,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004964161698175697,
|
|
"loss": 5.5477,
|
|
"mean_token_accuracy": 0.15285850167274476,
|
|
"num_tokens": 14232085.0,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"entropy": 5.768083095550537,
|
|
"epoch": 0.6486032346145768,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004964108442255562,
|
|
"loss": 5.7039,
|
|
"mean_token_accuracy": 0.14666701555252076,
|
|
"num_tokens": 14241969.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 5.75738754272461,
|
|
"epoch": 0.6490233144297417,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004964055147113637,
|
|
"loss": 5.616,
|
|
"mean_token_accuracy": 0.1562434285879135,
|
|
"num_tokens": 14251012.0,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"entropy": 5.841613340377807,
|
|
"epoch": 0.6494433942449065,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004964001812750864,
|
|
"loss": 5.7414,
|
|
"mean_token_accuracy": 0.15030983835458755,
|
|
"num_tokens": 14261110.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 5.793753337860108,
|
|
"epoch": 0.6498634740600714,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000496394843916819,
|
|
"loss": 5.7123,
|
|
"mean_token_accuracy": 0.15001400411128998,
|
|
"num_tokens": 14270869.0,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"entropy": 5.8021101474761965,
|
|
"epoch": 0.6502835538752363,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004963895026366558,
|
|
"loss": 5.6624,
|
|
"mean_token_accuracy": 0.14703597128391266,
|
|
"num_tokens": 14279607.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 5.798326921463013,
|
|
"epoch": 0.6507036336904012,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004963841574346917,
|
|
"loss": 5.6664,
|
|
"mean_token_accuracy": 0.15177475959062575,
|
|
"num_tokens": 14289282.0,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"entropy": 5.785371494293213,
|
|
"epoch": 0.6511237135055661,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004963788083110212,
|
|
"loss": 5.5947,
|
|
"mean_token_accuracy": 0.15618948638439178,
|
|
"num_tokens": 14298658.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 5.867933845520019,
|
|
"epoch": 0.651543793320731,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000496373455265739,
|
|
"loss": 5.6715,
|
|
"mean_token_accuracy": 0.15167464911937714,
|
|
"num_tokens": 14307832.0,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"entropy": 5.737640428543091,
|
|
"epoch": 0.6519638731358958,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004963680982989402,
|
|
"loss": 5.5745,
|
|
"mean_token_accuracy": 0.15618224889039994,
|
|
"num_tokens": 14317122.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 5.728768348693848,
|
|
"epoch": 0.6523839529510607,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004963627374107195,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.15685338973999025,
|
|
"num_tokens": 14326069.0,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"entropy": 5.735061359405518,
|
|
"epoch": 0.6528040327662256,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004963573726011717,
|
|
"loss": 5.6154,
|
|
"mean_token_accuracy": 0.152651646733284,
|
|
"num_tokens": 14335260.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 5.89712963104248,
|
|
"epoch": 0.6532241125813905,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004963520038703922,
|
|
"loss": 5.7147,
|
|
"mean_token_accuracy": 0.14169859886169434,
|
|
"num_tokens": 14345823.0,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"entropy": 5.8055966854095455,
|
|
"epoch": 0.6536441923965554,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000496346631218476,
|
|
"loss": 5.5901,
|
|
"mean_token_accuracy": 0.151746928691864,
|
|
"num_tokens": 14354316.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 5.731487655639649,
|
|
"epoch": 0.6540642722117203,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.000496341254645518,
|
|
"loss": 5.637,
|
|
"mean_token_accuracy": 0.15558102428913118,
|
|
"num_tokens": 14364539.0,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"entropy": 5.791000318527222,
|
|
"epoch": 0.6544843520268852,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004963358741516138,
|
|
"loss": 5.7568,
|
|
"mean_token_accuracy": 0.14070456251502036,
|
|
"num_tokens": 14374081.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 5.791856861114502,
|
|
"epoch": 0.6549044318420499,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004963304897368585,
|
|
"loss": 5.6421,
|
|
"mean_token_accuracy": 0.14869485646486283,
|
|
"num_tokens": 14383255.0,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"entropy": 5.887608623504638,
|
|
"epoch": 0.6553245116572148,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004963251014013475,
|
|
"loss": 5.7709,
|
|
"mean_token_accuracy": 0.14988299310207367,
|
|
"num_tokens": 14392417.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 5.925739812850952,
|
|
"epoch": 0.6557445914723797,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004963197091451763,
|
|
"loss": 5.8171,
|
|
"mean_token_accuracy": 0.14091493040323258,
|
|
"num_tokens": 14401899.0,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"entropy": 5.8610601902008055,
|
|
"epoch": 0.6561646712875446,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004963143129684405,
|
|
"loss": 5.7865,
|
|
"mean_token_accuracy": 0.14567770585417747,
|
|
"num_tokens": 14411245.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 5.733341979980469,
|
|
"epoch": 0.6565847511027095,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004963089128712355,
|
|
"loss": 5.6357,
|
|
"mean_token_accuracy": 0.15616341382265092,
|
|
"num_tokens": 14419710.0,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"entropy": 5.761330413818359,
|
|
"epoch": 0.6570048309178744,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004963035088536571,
|
|
"loss": 5.6196,
|
|
"mean_token_accuracy": 0.16149473637342454,
|
|
"num_tokens": 14430266.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 5.832095336914063,
|
|
"epoch": 0.6574249107330393,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004962981009158012,
|
|
"loss": 5.5946,
|
|
"mean_token_accuracy": 0.14890647828578948,
|
|
"num_tokens": 14439515.0,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"entropy": 5.783193588256836,
|
|
"epoch": 0.6578449905482041,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004962926890577632,
|
|
"loss": 5.6537,
|
|
"mean_token_accuracy": 0.1543855309486389,
|
|
"num_tokens": 14448091.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 5.762275314331054,
|
|
"epoch": 0.658265070363369,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000496287273279639,
|
|
"loss": 5.6831,
|
|
"mean_token_accuracy": 0.14809218272566796,
|
|
"num_tokens": 14457744.0,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"entropy": 5.830176925659179,
|
|
"epoch": 0.6586851501785339,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000496281853581525,
|
|
"loss": 5.6747,
|
|
"mean_token_accuracy": 0.15542599856853484,
|
|
"num_tokens": 14467597.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 5.816223096847534,
|
|
"epoch": 0.6591052299936988,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004962764299635168,
|
|
"loss": 5.6557,
|
|
"mean_token_accuracy": 0.15143783688545226,
|
|
"num_tokens": 14476662.0,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"entropy": 5.868206977844238,
|
|
"epoch": 0.6595253098088637,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004962710024257105,
|
|
"loss": 5.7365,
|
|
"mean_token_accuracy": 0.15013337954878808,
|
|
"num_tokens": 14486583.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 5.866771793365478,
|
|
"epoch": 0.6599453896240286,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004962655709682025,
|
|
"loss": 5.7422,
|
|
"mean_token_accuracy": 0.14670923799276353,
|
|
"num_tokens": 14496528.0,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"entropy": 5.847543859481812,
|
|
"epoch": 0.6603654694391935,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004962601355910887,
|
|
"loss": 5.7216,
|
|
"mean_token_accuracy": 0.14750941842794418,
|
|
"num_tokens": 14507026.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 5.714229869842529,
|
|
"epoch": 0.6607855492543583,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004962546962944656,
|
|
"loss": 5.5896,
|
|
"mean_token_accuracy": 0.1554133415222168,
|
|
"num_tokens": 14516480.0,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"entropy": 5.7652284622192385,
|
|
"epoch": 0.6612056290695232,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004962492530784295,
|
|
"loss": 5.5384,
|
|
"mean_token_accuracy": 0.16685622930526733,
|
|
"num_tokens": 14525068.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 5.764181613922119,
|
|
"epoch": 0.6616257088846881,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004962438059430768,
|
|
"loss": 5.6811,
|
|
"mean_token_accuracy": 0.15448692589998245,
|
|
"num_tokens": 14534441.0,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"entropy": 5.791794538497925,
|
|
"epoch": 0.662045788699853,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004962383548885039,
|
|
"loss": 5.7416,
|
|
"mean_token_accuracy": 0.15312327668070794,
|
|
"num_tokens": 14543026.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 5.810564088821411,
|
|
"epoch": 0.6624658685150179,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004962328999148075,
|
|
"loss": 5.6235,
|
|
"mean_token_accuracy": 0.15815748721361161,
|
|
"num_tokens": 14552068.0,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"entropy": 5.795226907730102,
|
|
"epoch": 0.6628859483301828,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004962274410220842,
|
|
"loss": 5.748,
|
|
"mean_token_accuracy": 0.14739178717136384,
|
|
"num_tokens": 14561587.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 5.840717220306397,
|
|
"epoch": 0.6633060281453477,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004962219782104308,
|
|
"loss": 5.7455,
|
|
"mean_token_accuracy": 0.15566187649965285,
|
|
"num_tokens": 14571020.0,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"entropy": 5.857281494140625,
|
|
"epoch": 0.6637261079605125,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004962165114799439,
|
|
"loss": 5.7013,
|
|
"mean_token_accuracy": 0.14193924963474275,
|
|
"num_tokens": 14580638.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 5.753746509552002,
|
|
"epoch": 0.6641461877756774,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004962110408307204,
|
|
"loss": 5.6411,
|
|
"mean_token_accuracy": 0.1508389577269554,
|
|
"num_tokens": 14590173.0,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"entropy": 5.771540355682373,
|
|
"epoch": 0.6645662675908423,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004962055662628571,
|
|
"loss": 5.6088,
|
|
"mean_token_accuracy": 0.1546558991074562,
|
|
"num_tokens": 14598635.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 5.824790573120117,
|
|
"epoch": 0.6649863474060071,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004962000877764513,
|
|
"loss": 5.6465,
|
|
"mean_token_accuracy": 0.15380171239376067,
|
|
"num_tokens": 14607233.0,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"entropy": 5.900277614593506,
|
|
"epoch": 0.665406427221172,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004961946053715998,
|
|
"loss": 5.811,
|
|
"mean_token_accuracy": 0.14116770774126053,
|
|
"num_tokens": 14617483.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 5.774311876296997,
|
|
"epoch": 0.665826507036337,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004961891190483997,
|
|
"loss": 5.6337,
|
|
"mean_token_accuracy": 0.15262163281440735,
|
|
"num_tokens": 14625805.0,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"entropy": 5.750567626953125,
|
|
"epoch": 0.6662465868515017,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004961836288069483,
|
|
"loss": 5.56,
|
|
"mean_token_accuracy": 0.15181114226579667,
|
|
"num_tokens": 14634605.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 5.866780996322632,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004961781346473428,
|
|
"loss": 5.754,
|
|
"mean_token_accuracy": 0.1443464897572994,
|
|
"num_tokens": 14644970.0,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"entropy": 5.8288147926330565,
|
|
"epoch": 0.6670867464818315,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004961726365696805,
|
|
"loss": 5.6444,
|
|
"mean_token_accuracy": 0.1512111656367779,
|
|
"num_tokens": 14655043.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 5.81706018447876,
|
|
"epoch": 0.6675068262969964,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004961671345740589,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.1498358130455017,
|
|
"num_tokens": 14663994.0,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"entropy": 5.73077392578125,
|
|
"epoch": 0.6679269061121613,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004961616286605753,
|
|
"loss": 5.6285,
|
|
"mean_token_accuracy": 0.14595297276973723,
|
|
"num_tokens": 14674101.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 5.793763732910156,
|
|
"epoch": 0.6683469859273262,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004961561188293273,
|
|
"loss": 5.7245,
|
|
"mean_token_accuracy": 0.14435067921876907,
|
|
"num_tokens": 14684156.0,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"entropy": 5.726213026046753,
|
|
"epoch": 0.6687670657424911,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004961506050804126,
|
|
"loss": 5.6178,
|
|
"mean_token_accuracy": 0.15918601751327516,
|
|
"num_tokens": 14693223.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 5.852010822296142,
|
|
"epoch": 0.6691871455576559,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000496145087413929,
|
|
"loss": 5.6258,
|
|
"mean_token_accuracy": 0.14910822063684465,
|
|
"num_tokens": 14702959.0,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"entropy": 5.876345634460449,
|
|
"epoch": 0.6696072253728208,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004961395658299737,
|
|
"loss": 5.737,
|
|
"mean_token_accuracy": 0.1483006753027439,
|
|
"num_tokens": 14712146.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 5.710770320892334,
|
|
"epoch": 0.6700273051879857,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004961340403286451,
|
|
"loss": 5.6515,
|
|
"mean_token_accuracy": 0.14912314414978028,
|
|
"num_tokens": 14721932.0,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"entropy": 5.775924396514893,
|
|
"epoch": 0.6704473850031506,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961285109100408,
|
|
"loss": 5.5857,
|
|
"mean_token_accuracy": 0.15742873400449753,
|
|
"num_tokens": 14731080.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 5.719264698028565,
|
|
"epoch": 0.6708674648183155,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004961229775742587,
|
|
"loss": 5.5991,
|
|
"mean_token_accuracy": 0.16006802767515182,
|
|
"num_tokens": 14740057.0,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"entropy": 5.813319492340088,
|
|
"epoch": 0.6712875446334804,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000496117440321397,
|
|
"loss": 5.6828,
|
|
"mean_token_accuracy": 0.15654956847429274,
|
|
"num_tokens": 14748399.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 5.8324696063995365,
|
|
"epoch": 0.6717076244486453,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961118991515537,
|
|
"loss": 5.6881,
|
|
"mean_token_accuracy": 0.14406146556138993,
|
|
"num_tokens": 14757215.0,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"entropy": 5.786386203765869,
|
|
"epoch": 0.6721277042638101,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000496106354064827,
|
|
"loss": 5.6868,
|
|
"mean_token_accuracy": 0.15685203224420546,
|
|
"num_tokens": 14766191.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 5.8651642322540285,
|
|
"epoch": 0.672547784078975,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004961008050613149,
|
|
"loss": 5.7521,
|
|
"mean_token_accuracy": 0.14210513085126877,
|
|
"num_tokens": 14775220.0,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"entropy": 5.838468170166015,
|
|
"epoch": 0.6729678638941399,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004960952521411161,
|
|
"loss": 5.7078,
|
|
"mean_token_accuracy": 0.14716721177101136,
|
|
"num_tokens": 14784287.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 5.932072496414184,
|
|
"epoch": 0.6733879437093048,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004960896953043287,
|
|
"loss": 5.7759,
|
|
"mean_token_accuracy": 0.14442920163273812,
|
|
"num_tokens": 14794219.0,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"entropy": 5.824687051773071,
|
|
"epoch": 0.6738080235244697,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004960841345510511,
|
|
"loss": 5.6703,
|
|
"mean_token_accuracy": 0.1518692597746849,
|
|
"num_tokens": 14803324.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 5.7951741218566895,
|
|
"epoch": 0.6742281033396346,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000496078569881382,
|
|
"loss": 5.6876,
|
|
"mean_token_accuracy": 0.1539413034915924,
|
|
"num_tokens": 14811963.0,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"entropy": 5.747313785552978,
|
|
"epoch": 0.6746481831547995,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004960730012954198,
|
|
"loss": 5.6526,
|
|
"mean_token_accuracy": 0.14589986428618432,
|
|
"num_tokens": 14821903.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 5.716427040100098,
|
|
"epoch": 0.6750682629699643,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004960674287932634,
|
|
"loss": 5.6271,
|
|
"mean_token_accuracy": 0.14554727971553802,
|
|
"num_tokens": 14831215.0,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"entropy": 5.827300643920898,
|
|
"epoch": 0.6754883427851291,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004960618523750111,
|
|
"loss": 5.5552,
|
|
"mean_token_accuracy": 0.1551190733909607,
|
|
"num_tokens": 14840354.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 5.817133188247681,
|
|
"epoch": 0.675908422600294,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000496056272040762,
|
|
"loss": 5.7402,
|
|
"mean_token_accuracy": 0.14943507611751555,
|
|
"num_tokens": 14849660.0,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"entropy": 5.807599830627441,
|
|
"epoch": 0.6763285024154589,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004960506877906149,
|
|
"loss": 5.6648,
|
|
"mean_token_accuracy": 0.14764449894428253,
|
|
"num_tokens": 14859819.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 5.801334857940674,
|
|
"epoch": 0.6767485822306238,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004960450996246686,
|
|
"loss": 5.6585,
|
|
"mean_token_accuracy": 0.15806604847311972,
|
|
"num_tokens": 14869260.0,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"entropy": 5.7306236743927,
|
|
"epoch": 0.6771686620457887,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004960395075430222,
|
|
"loss": 5.6336,
|
|
"mean_token_accuracy": 0.15279667675495148,
|
|
"num_tokens": 14878685.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 5.749643182754516,
|
|
"epoch": 0.6775887418609536,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004960339115457748,
|
|
"loss": 5.6372,
|
|
"mean_token_accuracy": 0.1503060542047024,
|
|
"num_tokens": 14888456.0,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"entropy": 5.7973710060119625,
|
|
"epoch": 0.6780088216761184,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004960283116330255,
|
|
"loss": 5.731,
|
|
"mean_token_accuracy": 0.14978916943073273,
|
|
"num_tokens": 14897401.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 5.807585668563843,
|
|
"epoch": 0.6784289014912833,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004960227078048735,
|
|
"loss": 5.6567,
|
|
"mean_token_accuracy": 0.15412394553422928,
|
|
"num_tokens": 14906741.0,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"entropy": 5.760078573226929,
|
|
"epoch": 0.6788489813064482,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004960171000614179,
|
|
"loss": 5.5427,
|
|
"mean_token_accuracy": 0.16074198186397554,
|
|
"num_tokens": 14916002.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 5.638378715515136,
|
|
"epoch": 0.6792690611216131,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004960114884027583,
|
|
"loss": 5.4776,
|
|
"mean_token_accuracy": 0.16621290147304535,
|
|
"num_tokens": 14925247.0,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"entropy": 5.708978319168091,
|
|
"epoch": 0.679689140936778,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004960058728289939,
|
|
"loss": 5.608,
|
|
"mean_token_accuracy": 0.15026133954524995,
|
|
"num_tokens": 14933925.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 5.904026126861572,
|
|
"epoch": 0.6801092207519429,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004960002533402243,
|
|
"loss": 5.6881,
|
|
"mean_token_accuracy": 0.15241528823971748,
|
|
"num_tokens": 14943368.0,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"entropy": 5.790306043624878,
|
|
"epoch": 0.6805293005671077,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004959946299365491,
|
|
"loss": 5.6953,
|
|
"mean_token_accuracy": 0.14710961580276488,
|
|
"num_tokens": 14953710.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 5.816765403747558,
|
|
"epoch": 0.6809493803822726,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004959890026180677,
|
|
"loss": 5.7182,
|
|
"mean_token_accuracy": 0.14748610258102418,
|
|
"num_tokens": 14962814.0,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"entropy": 5.688648128509522,
|
|
"epoch": 0.6813694601974375,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00049598337138488,
|
|
"loss": 5.5964,
|
|
"mean_token_accuracy": 0.16184311360120773,
|
|
"num_tokens": 14971631.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 5.8211281299591064,
|
|
"epoch": 0.6817895400126024,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004959777362370855,
|
|
"loss": 5.5884,
|
|
"mean_token_accuracy": 0.15286847501993178,
|
|
"num_tokens": 14980528.0,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"entropy": 5.87521915435791,
|
|
"epoch": 0.6822096198277673,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004959720971747843,
|
|
"loss": 5.6149,
|
|
"mean_token_accuracy": 0.15216847509145737,
|
|
"num_tokens": 14989331.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 5.713017272949219,
|
|
"epoch": 0.6826296996429322,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004959664541980762,
|
|
"loss": 5.598,
|
|
"mean_token_accuracy": 0.15774561017751693,
|
|
"num_tokens": 14999403.0,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"entropy": 5.737113285064697,
|
|
"epoch": 0.6830497794580971,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004959608073070612,
|
|
"loss": 5.6958,
|
|
"mean_token_accuracy": 0.14559513479471206,
|
|
"num_tokens": 15009388.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 5.837254619598388,
|
|
"epoch": 0.6834698592732619,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004959551565018392,
|
|
"loss": 5.6286,
|
|
"mean_token_accuracy": 0.15535787492990494,
|
|
"num_tokens": 15018586.0,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"entropy": 5.778875064849854,
|
|
"epoch": 0.6838899390884268,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004959495017825104,
|
|
"loss": 5.6407,
|
|
"mean_token_accuracy": 0.15465399324893953,
|
|
"num_tokens": 15027982.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 5.739845132827758,
|
|
"epoch": 0.6843100189035917,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004959438431491749,
|
|
"loss": 5.6278,
|
|
"mean_token_accuracy": 0.15651622265577317,
|
|
"num_tokens": 15037103.0,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"entropy": 5.728132820129394,
|
|
"epoch": 0.6847300987187566,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000495938180601933,
|
|
"loss": 5.7184,
|
|
"mean_token_accuracy": 0.14796946495771407,
|
|
"num_tokens": 15046739.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 5.822361660003662,
|
|
"epoch": 0.6851501785339215,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004959325141408851,
|
|
"loss": 5.666,
|
|
"mean_token_accuracy": 0.15593857914209366,
|
|
"num_tokens": 15056586.0,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"entropy": 5.768631410598755,
|
|
"epoch": 0.6855702583490864,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004959268437661313,
|
|
"loss": 5.641,
|
|
"mean_token_accuracy": 0.15448189303278922,
|
|
"num_tokens": 15066622.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 5.767803955078125,
|
|
"epoch": 0.6859903381642513,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004959211694777724,
|
|
"loss": 5.6293,
|
|
"mean_token_accuracy": 0.15781602412462234,
|
|
"num_tokens": 15075415.0,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"entropy": 5.731510210037231,
|
|
"epoch": 0.686410417979416,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004959154912759086,
|
|
"loss": 5.6134,
|
|
"mean_token_accuracy": 0.15285183787345885,
|
|
"num_tokens": 15085087.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 5.772061681747436,
|
|
"epoch": 0.6868304977945809,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004959098091606406,
|
|
"loss": 5.6231,
|
|
"mean_token_accuracy": 0.1562209889292717,
|
|
"num_tokens": 15093580.0,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"entropy": 5.681428337097168,
|
|
"epoch": 0.6872505776097458,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004959041231320692,
|
|
"loss": 5.5996,
|
|
"mean_token_accuracy": 0.15760979950428008,
|
|
"num_tokens": 15104033.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 5.769718980789184,
|
|
"epoch": 0.6876706574249107,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004958984331902951,
|
|
"loss": 5.6773,
|
|
"mean_token_accuracy": 0.14753246530890465,
|
|
"num_tokens": 15113164.0,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"entropy": 5.745969009399414,
|
|
"epoch": 0.6880907372400756,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004958927393354188,
|
|
"loss": 5.6297,
|
|
"mean_token_accuracy": 0.15737390518188477,
|
|
"num_tokens": 15122215.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 5.765387773513794,
|
|
"epoch": 0.6885108170552405,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004958870415675415,
|
|
"loss": 5.6091,
|
|
"mean_token_accuracy": 0.15159644484519957,
|
|
"num_tokens": 15130877.0,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"entropy": 5.7833487033844,
|
|
"epoch": 0.6889308968704054,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004958813398867639,
|
|
"loss": 5.5909,
|
|
"mean_token_accuracy": 0.1610761597752571,
|
|
"num_tokens": 15140227.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 5.874035358428955,
|
|
"epoch": 0.6893509766855702,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004958756342931872,
|
|
"loss": 5.7618,
|
|
"mean_token_accuracy": 0.14578953385353088,
|
|
"num_tokens": 15150006.0,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"entropy": 5.7979443073272705,
|
|
"epoch": 0.6897710565007351,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004958699247869122,
|
|
"loss": 5.6734,
|
|
"mean_token_accuracy": 0.15173593461513518,
|
|
"num_tokens": 15160032.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 5.775300407409668,
|
|
"epoch": 0.6901911363159,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004958642113680404,
|
|
"loss": 5.607,
|
|
"mean_token_accuracy": 0.15672277957201003,
|
|
"num_tokens": 15168966.0,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"entropy": 5.886404323577881,
|
|
"epoch": 0.6906112161310649,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004958584940366727,
|
|
"loss": 5.7931,
|
|
"mean_token_accuracy": 0.1462364301085472,
|
|
"num_tokens": 15179337.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 5.845329141616821,
|
|
"epoch": 0.6910312959462298,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004958527727929106,
|
|
"loss": 5.6901,
|
|
"mean_token_accuracy": 0.15126113295555116,
|
|
"num_tokens": 15188395.0,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"entropy": 5.777632856369019,
|
|
"epoch": 0.6914513757613947,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004958470476368552,
|
|
"loss": 5.6175,
|
|
"mean_token_accuracy": 0.1590783603489399,
|
|
"num_tokens": 15198669.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 5.717659664154053,
|
|
"epoch": 0.6918714555765595,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004958413185686082,
|
|
"loss": 5.637,
|
|
"mean_token_accuracy": 0.15654054433107376,
|
|
"num_tokens": 15207371.0,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"entropy": 5.771133661270142,
|
|
"epoch": 0.6922915353917244,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004958355855882709,
|
|
"loss": 5.6623,
|
|
"mean_token_accuracy": 0.15609176307916642,
|
|
"num_tokens": 15215694.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 5.838139247894287,
|
|
"epoch": 0.6927116152068893,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000495829848695945,
|
|
"loss": 5.6462,
|
|
"mean_token_accuracy": 0.15314621180295945,
|
|
"num_tokens": 15224963.0,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"entropy": 5.6792638301849365,
|
|
"epoch": 0.6931316950220542,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000495824107891732,
|
|
"loss": 5.4601,
|
|
"mean_token_accuracy": 0.16161370724439622,
|
|
"num_tokens": 15233569.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 5.702935647964478,
|
|
"epoch": 0.6935517748372191,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004958183631757336,
|
|
"loss": 5.6456,
|
|
"mean_token_accuracy": 0.15384626239538193,
|
|
"num_tokens": 15242671.0,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"entropy": 5.757969760894776,
|
|
"epoch": 0.693971854652384,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004958126145480517,
|
|
"loss": 5.6062,
|
|
"mean_token_accuracy": 0.15589472502470017,
|
|
"num_tokens": 15251698.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 5.881031131744384,
|
|
"epoch": 0.6943919344675489,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004958068620087879,
|
|
"loss": 5.7131,
|
|
"mean_token_accuracy": 0.15278587341308594,
|
|
"num_tokens": 15260608.0,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"entropy": 5.7654228687286375,
|
|
"epoch": 0.6948120142827137,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004958011055580443,
|
|
"loss": 5.5824,
|
|
"mean_token_accuracy": 0.1566091775894165,
|
|
"num_tokens": 15268866.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 5.691988468170166,
|
|
"epoch": 0.6952320940978786,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004957953451959229,
|
|
"loss": 5.5428,
|
|
"mean_token_accuracy": 0.1687786027789116,
|
|
"num_tokens": 15277600.0,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"entropy": 5.712690019607544,
|
|
"epoch": 0.6956521739130435,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004957895809225254,
|
|
"loss": 5.577,
|
|
"mean_token_accuracy": 0.15904618948698043,
|
|
"num_tokens": 15286016.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 5.791261529922485,
|
|
"epoch": 0.6960722537282084,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004957838127379544,
|
|
"loss": 5.6203,
|
|
"mean_token_accuracy": 0.15775981694459915,
|
|
"num_tokens": 15294676.0,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"entropy": 5.787760162353516,
|
|
"epoch": 0.6964923335433733,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004957780406423118,
|
|
"loss": 5.6093,
|
|
"mean_token_accuracy": 0.1520596593618393,
|
|
"num_tokens": 15304084.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 5.732133674621582,
|
|
"epoch": 0.6969124133585382,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004957722646356999,
|
|
"loss": 5.6145,
|
|
"mean_token_accuracy": 0.15437885522842407,
|
|
"num_tokens": 15314182.0,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"entropy": 5.82383394241333,
|
|
"epoch": 0.697332493173703,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004957664847182209,
|
|
"loss": 5.7321,
|
|
"mean_token_accuracy": 0.14916351363062857,
|
|
"num_tokens": 15324213.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 5.901606464385987,
|
|
"epoch": 0.6977525729888678,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004957607008899774,
|
|
"loss": 5.6654,
|
|
"mean_token_accuracy": 0.14808408319950103,
|
|
"num_tokens": 15333122.0,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"entropy": 5.821764516830444,
|
|
"epoch": 0.6981726528040327,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004957549131510717,
|
|
"loss": 5.7587,
|
|
"mean_token_accuracy": 0.14488900303840638,
|
|
"num_tokens": 15342199.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 5.85214409828186,
|
|
"epoch": 0.6985927326191976,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004957491215016065,
|
|
"loss": 5.7068,
|
|
"mean_token_accuracy": 0.14899201691150665,
|
|
"num_tokens": 15352463.0,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"entropy": 5.7340789318084715,
|
|
"epoch": 0.6990128124343625,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004957433259416841,
|
|
"loss": 5.5519,
|
|
"mean_token_accuracy": 0.15695535391569138,
|
|
"num_tokens": 15361815.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 5.829116296768189,
|
|
"epoch": 0.6994328922495274,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004957375264714075,
|
|
"loss": 5.6665,
|
|
"mean_token_accuracy": 0.14441719949245452,
|
|
"num_tokens": 15371773.0,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"entropy": 5.731393432617187,
|
|
"epoch": 0.6998529720646923,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004957317230908792,
|
|
"loss": 5.6078,
|
|
"mean_token_accuracy": 0.153985595703125,
|
|
"num_tokens": 15380881.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 5.69814658164978,
|
|
"epoch": 0.7002730518798572,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004957259158002022,
|
|
"loss": 5.4853,
|
|
"mean_token_accuracy": 0.16338536590337754,
|
|
"num_tokens": 15389310.0,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"entropy": 5.65314564704895,
|
|
"epoch": 0.700693131695022,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004957201045994791,
|
|
"loss": 5.585,
|
|
"mean_token_accuracy": 0.15192776024341584,
|
|
"num_tokens": 15398584.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 5.752124881744384,
|
|
"epoch": 0.7011132115101869,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004957142894888131,
|
|
"loss": 5.6244,
|
|
"mean_token_accuracy": 0.1605387285351753,
|
|
"num_tokens": 15407208.0,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"entropy": 5.781596279144287,
|
|
"epoch": 0.7015332913253518,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004957084704683071,
|
|
"loss": 5.6552,
|
|
"mean_token_accuracy": 0.15119443833827972,
|
|
"num_tokens": 15416474.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 5.796496915817261,
|
|
"epoch": 0.7019533711405167,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004957026475380642,
|
|
"loss": 5.6589,
|
|
"mean_token_accuracy": 0.1581042394042015,
|
|
"num_tokens": 15426101.0,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"entropy": 5.8482013702392575,
|
|
"epoch": 0.7023734509556816,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004956968206981875,
|
|
"loss": 5.6866,
|
|
"mean_token_accuracy": 0.1528375506401062,
|
|
"num_tokens": 15435910.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 5.838450860977173,
|
|
"epoch": 0.7027935307708465,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004956909899487803,
|
|
"loss": 5.7297,
|
|
"mean_token_accuracy": 0.14721868485212325,
|
|
"num_tokens": 15445494.0,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"entropy": 5.773874664306641,
|
|
"epoch": 0.7032136105860114,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004956851552899459,
|
|
"loss": 5.6133,
|
|
"mean_token_accuracy": 0.15867630988359452,
|
|
"num_tokens": 15455332.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 5.7730677127838135,
|
|
"epoch": 0.7036336904011762,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004956793167217874,
|
|
"loss": 5.6813,
|
|
"mean_token_accuracy": 0.1490170478820801,
|
|
"num_tokens": 15464241.0,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"entropy": 5.8777241706848145,
|
|
"epoch": 0.7040537702163411,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004956734742444087,
|
|
"loss": 5.6821,
|
|
"mean_token_accuracy": 0.15121965557336808,
|
|
"num_tokens": 15473473.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 5.744890403747559,
|
|
"epoch": 0.704473850031506,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004956676278579129,
|
|
"loss": 5.563,
|
|
"mean_token_accuracy": 0.15540574193000795,
|
|
"num_tokens": 15482494.0,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"entropy": 5.676463556289673,
|
|
"epoch": 0.7048939298466709,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004956617775624037,
|
|
"loss": 5.5724,
|
|
"mean_token_accuracy": 0.15146812200546264,
|
|
"num_tokens": 15491180.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 5.786671447753906,
|
|
"epoch": 0.7053140096618358,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004956559233579848,
|
|
"loss": 5.6148,
|
|
"mean_token_accuracy": 0.15258617997169494,
|
|
"num_tokens": 15501035.0,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"entropy": 5.7913405418396,
|
|
"epoch": 0.7057340894770007,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004956500652447598,
|
|
"loss": 5.5994,
|
|
"mean_token_accuracy": 0.15323785319924355,
|
|
"num_tokens": 15510191.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 5.706702041625976,
|
|
"epoch": 0.7061541692921655,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004956442032228324,
|
|
"loss": 5.6875,
|
|
"mean_token_accuracy": 0.15146460086107255,
|
|
"num_tokens": 15519253.0,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"entropy": 5.7468561172485355,
|
|
"epoch": 0.7065742491073304,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004956383372923067,
|
|
"loss": 5.6573,
|
|
"mean_token_accuracy": 0.15219423472881316,
|
|
"num_tokens": 15528348.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 5.909702920913697,
|
|
"epoch": 0.7069943289224953,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004956324674532864,
|
|
"loss": 5.7312,
|
|
"mean_token_accuracy": 0.14496915340423583,
|
|
"num_tokens": 15537557.0,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"entropy": 5.853457021713257,
|
|
"epoch": 0.7074144087376601,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004956265937058757,
|
|
"loss": 5.6662,
|
|
"mean_token_accuracy": 0.14985378384590148,
|
|
"num_tokens": 15546745.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 5.753704071044922,
|
|
"epoch": 0.707834488552825,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004956207160501784,
|
|
"loss": 5.5646,
|
|
"mean_token_accuracy": 0.15850543081760407,
|
|
"num_tokens": 15555532.0,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"entropy": 5.728769159317016,
|
|
"epoch": 0.70825456836799,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004956148344862987,
|
|
"loss": 5.6209,
|
|
"mean_token_accuracy": 0.1560587242245674,
|
|
"num_tokens": 15564189.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 5.664771509170532,
|
|
"epoch": 0.7086746481831548,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004956089490143408,
|
|
"loss": 5.6492,
|
|
"mean_token_accuracy": 0.15197667628526687,
|
|
"num_tokens": 15574116.0,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"entropy": 5.824323844909668,
|
|
"epoch": 0.7090947279983196,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004956030596344089,
|
|
"loss": 5.6473,
|
|
"mean_token_accuracy": 0.149012803286314,
|
|
"num_tokens": 15583031.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 5.836510467529297,
|
|
"epoch": 0.7095148078134845,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004955971663466075,
|
|
"loss": 5.7671,
|
|
"mean_token_accuracy": 0.15028237402439118,
|
|
"num_tokens": 15592576.0,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"entropy": 5.823656129837036,
|
|
"epoch": 0.7099348876286494,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004955912691510407,
|
|
"loss": 5.697,
|
|
"mean_token_accuracy": 0.15281013548374175,
|
|
"num_tokens": 15601065.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 5.751941967010498,
|
|
"epoch": 0.7103549674438143,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004955853680478134,
|
|
"loss": 5.633,
|
|
"mean_token_accuracy": 0.14754925668239594,
|
|
"num_tokens": 15610112.0,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"entropy": 5.778195095062256,
|
|
"epoch": 0.7107750472589792,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004955794630370297,
|
|
"loss": 5.6139,
|
|
"mean_token_accuracy": 0.15469905436038972,
|
|
"num_tokens": 15618890.0,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"entropy": 5.750346851348877,
|
|
"epoch": 0.7111951270741441,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004955735541187945,
|
|
"loss": 5.6397,
|
|
"mean_token_accuracy": 0.15139740109443664,
|
|
"num_tokens": 15627678.0,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"entropy": 5.838537120819092,
|
|
"epoch": 0.711615206889309,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004955676412932124,
|
|
"loss": 5.6254,
|
|
"mean_token_accuracy": 0.15495479255914688,
|
|
"num_tokens": 15636833.0,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"entropy": 5.758643341064453,
|
|
"epoch": 0.7120352867044738,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004955617245603881,
|
|
"loss": 5.6441,
|
|
"mean_token_accuracy": 0.1475740984082222,
|
|
"num_tokens": 15646571.0,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"entropy": 5.771809864044189,
|
|
"epoch": 0.7124553665196387,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004955558039204263,
|
|
"loss": 5.6883,
|
|
"mean_token_accuracy": 0.1559377834200859,
|
|
"num_tokens": 15654907.0,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"entropy": 5.87169828414917,
|
|
"epoch": 0.7128754463348036,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004955498793734321,
|
|
"loss": 5.6259,
|
|
"mean_token_accuracy": 0.15253366231918336,
|
|
"num_tokens": 15664336.0,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"entropy": 5.775359678268432,
|
|
"epoch": 0.7132955261499685,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004955439509195103,
|
|
"loss": 5.6818,
|
|
"mean_token_accuracy": 0.15552834868431092,
|
|
"num_tokens": 15674000.0,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"entropy": 5.817126750946045,
|
|
"epoch": 0.7137156059651334,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004955380185587661,
|
|
"loss": 5.6655,
|
|
"mean_token_accuracy": 0.15541905909776688,
|
|
"num_tokens": 15684214.0,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"entropy": 5.823128080368042,
|
|
"epoch": 0.7141356857802983,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0004955320822913043,
|
|
"loss": 5.695,
|
|
"mean_token_accuracy": 0.14909214079380034,
|
|
"num_tokens": 15693546.0,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"entropy": 5.796035861968994,
|
|
"epoch": 0.7145557655954632,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004955261421172302,
|
|
"loss": 5.6006,
|
|
"mean_token_accuracy": 0.15094921365380287,
|
|
"num_tokens": 15702310.0,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"entropy": 5.765657234191894,
|
|
"epoch": 0.714975845410628,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004955201980366493,
|
|
"loss": 5.6549,
|
|
"mean_token_accuracy": 0.1583261877298355,
|
|
"num_tokens": 15711544.0,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"entropy": 5.701775074005127,
|
|
"epoch": 0.7153959252257929,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004955142500496665,
|
|
"loss": 5.5378,
|
|
"mean_token_accuracy": 0.15932040065526962,
|
|
"num_tokens": 15720914.0,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"entropy": 5.806231927871704,
|
|
"epoch": 0.7158160050409578,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004955082981563872,
|
|
"loss": 5.636,
|
|
"mean_token_accuracy": 0.1497705653309822,
|
|
"num_tokens": 15729825.0,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"entropy": 5.731112813949585,
|
|
"epoch": 0.7162360848561227,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.000495502342356917,
|
|
"loss": 5.6407,
|
|
"mean_token_accuracy": 0.15358344316482545,
|
|
"num_tokens": 15739649.0,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"entropy": 5.775957298278809,
|
|
"epoch": 0.7166561646712876,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004954963826513614,
|
|
"loss": 5.5312,
|
|
"mean_token_accuracy": 0.15533651560544967,
|
|
"num_tokens": 15747805.0,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"entropy": 5.848172760009765,
|
|
"epoch": 0.7170762444864525,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000495490419039826,
|
|
"loss": 5.6763,
|
|
"mean_token_accuracy": 0.15182012543082238,
|
|
"num_tokens": 15757267.0,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"entropy": 5.734999704360962,
|
|
"epoch": 0.7174963243016174,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004954844515224162,
|
|
"loss": 5.6442,
|
|
"mean_token_accuracy": 0.15498089045286179,
|
|
"num_tokens": 15767412.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 5.702851438522339,
|
|
"epoch": 0.7179164041167821,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004954784800992379,
|
|
"loss": 5.6434,
|
|
"mean_token_accuracy": 0.1511929914355278,
|
|
"num_tokens": 15776813.0,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"entropy": 5.8534894466400145,
|
|
"epoch": 0.718336483931947,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004954725047703969,
|
|
"loss": 5.6771,
|
|
"mean_token_accuracy": 0.152647565305233,
|
|
"num_tokens": 15786258.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 5.836289310455323,
|
|
"epoch": 0.7187565637471119,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.000495466525535999,
|
|
"loss": 5.6667,
|
|
"mean_token_accuracy": 0.15143323093652725,
|
|
"num_tokens": 15795673.0,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"entropy": 5.811659526824951,
|
|
"epoch": 0.7191766435622768,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004954605423961501,
|
|
"loss": 5.6561,
|
|
"mean_token_accuracy": 0.15157762318849563,
|
|
"num_tokens": 15805050.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 5.681427240371704,
|
|
"epoch": 0.7195967233774417,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004954545553509562,
|
|
"loss": 5.606,
|
|
"mean_token_accuracy": 0.16409880369901658,
|
|
"num_tokens": 15813347.0,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"entropy": 5.839797496795654,
|
|
"epoch": 0.7200168031926066,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004954485644005235,
|
|
"loss": 5.7266,
|
|
"mean_token_accuracy": 0.1489485539495945,
|
|
"num_tokens": 15823528.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 5.8334362506866455,
|
|
"epoch": 0.7204368830077714,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004954425695449578,
|
|
"loss": 5.6173,
|
|
"mean_token_accuracy": 0.15086468532681466,
|
|
"num_tokens": 15832727.0,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"entropy": 5.822533702850341,
|
|
"epoch": 0.7208569628229363,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004954365707843657,
|
|
"loss": 5.6976,
|
|
"mean_token_accuracy": 0.14436446502804756,
|
|
"num_tokens": 15842402.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 5.748192930221558,
|
|
"epoch": 0.7212770426381012,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004954305681188531,
|
|
"loss": 5.5623,
|
|
"mean_token_accuracy": 0.1519525095820427,
|
|
"num_tokens": 15850886.0,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"entropy": 5.9683891296386715,
|
|
"epoch": 0.7216971224532661,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004954245615485265,
|
|
"loss": 5.8576,
|
|
"mean_token_accuracy": 0.14881062209606172,
|
|
"num_tokens": 15860093.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 5.825228261947632,
|
|
"epoch": 0.722117202268431,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004954185510734924,
|
|
"loss": 5.5603,
|
|
"mean_token_accuracy": 0.15691882967948914,
|
|
"num_tokens": 15868681.0,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"entropy": 5.775141906738281,
|
|
"epoch": 0.7225372820835959,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004954125366938571,
|
|
"loss": 5.6425,
|
|
"mean_token_accuracy": 0.15889365077018738,
|
|
"num_tokens": 15878041.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 5.759042358398437,
|
|
"epoch": 0.7229573618987608,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004954065184097271,
|
|
"loss": 5.6357,
|
|
"mean_token_accuracy": 0.15483569353818893,
|
|
"num_tokens": 15887562.0,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"entropy": 5.751525020599365,
|
|
"epoch": 0.7233774417139256,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004954004962212092,
|
|
"loss": 5.5541,
|
|
"mean_token_accuracy": 0.1643654190003872,
|
|
"num_tokens": 15896480.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 5.911052465438843,
|
|
"epoch": 0.7237975215290905,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004953944701284101,
|
|
"loss": 5.7752,
|
|
"mean_token_accuracy": 0.1463731437921524,
|
|
"num_tokens": 15906743.0,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"entropy": 5.830478382110596,
|
|
"epoch": 0.7242176013442554,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004953884401314363,
|
|
"loss": 5.7213,
|
|
"mean_token_accuracy": 0.13995275720953942,
|
|
"num_tokens": 15915981.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 5.8113525867462155,
|
|
"epoch": 0.7246376811594203,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004953824062303949,
|
|
"loss": 5.5765,
|
|
"mean_token_accuracy": 0.1530995100736618,
|
|
"num_tokens": 15924117.0,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"entropy": 5.7734462261199955,
|
|
"epoch": 0.7250577609745852,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004953763684253926,
|
|
"loss": 5.6054,
|
|
"mean_token_accuracy": 0.16219132840633393,
|
|
"num_tokens": 15933124.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 5.7224249839782715,
|
|
"epoch": 0.7254778407897501,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004953703267165364,
|
|
"loss": 5.5024,
|
|
"mean_token_accuracy": 0.1558832585811615,
|
|
"num_tokens": 15942422.0,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"entropy": 5.749732875823975,
|
|
"epoch": 0.725897920604915,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004953642811039332,
|
|
"loss": 5.7128,
|
|
"mean_token_accuracy": 0.14854123890399934,
|
|
"num_tokens": 15950989.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 5.855362319946289,
|
|
"epoch": 0.7263180004200798,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004953582315876904,
|
|
"loss": 5.7185,
|
|
"mean_token_accuracy": 0.15013131573796273,
|
|
"num_tokens": 15959659.0,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"entropy": 5.837911462783813,
|
|
"epoch": 0.7267380802352447,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.000495352178167915,
|
|
"loss": 5.5977,
|
|
"mean_token_accuracy": 0.16410948783159257,
|
|
"num_tokens": 15968102.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 5.854554653167725,
|
|
"epoch": 0.7271581600504096,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004953461208447143,
|
|
"loss": 5.7132,
|
|
"mean_token_accuracy": 0.14808624759316444,
|
|
"num_tokens": 15977705.0,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"entropy": 5.801808023452759,
|
|
"epoch": 0.7275782398655745,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004953400596181953,
|
|
"loss": 5.7244,
|
|
"mean_token_accuracy": 0.1447308510541916,
|
|
"num_tokens": 15986703.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 5.839752292633056,
|
|
"epoch": 0.7279983196807394,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004953339944884657,
|
|
"loss": 5.6309,
|
|
"mean_token_accuracy": 0.15707603991031646,
|
|
"num_tokens": 15995672.0,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"entropy": 5.702234554290771,
|
|
"epoch": 0.7284183994959043,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004953279254556329,
|
|
"loss": 5.5683,
|
|
"mean_token_accuracy": 0.16529579162597657,
|
|
"num_tokens": 16004437.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 5.786400604248047,
|
|
"epoch": 0.7288384793110692,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004953218525198043,
|
|
"loss": 5.6136,
|
|
"mean_token_accuracy": 0.1482889771461487,
|
|
"num_tokens": 16012847.0,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"entropy": 5.820078039169312,
|
|
"epoch": 0.7292585591262339,
|
|
"grad_norm": 9.3125,
|
|
"learning_rate": 0.0004953157756810876,
|
|
"loss": 5.6444,
|
|
"mean_token_accuracy": 0.15196260213851928,
|
|
"num_tokens": 16022213.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 5.784472417831421,
|
|
"epoch": 0.7296786389413988,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004953096949395902,
|
|
"loss": 5.6938,
|
|
"mean_token_accuracy": 0.15605147629976274,
|
|
"num_tokens": 16031411.0,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"entropy": 5.822618913650513,
|
|
"epoch": 0.7300987187565637,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004953036102954202,
|
|
"loss": 5.7282,
|
|
"mean_token_accuracy": 0.14967211931943894,
|
|
"num_tokens": 16041227.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 5.778734588623047,
|
|
"epoch": 0.7305187985717286,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004952975217486852,
|
|
"loss": 5.5479,
|
|
"mean_token_accuracy": 0.1602558448910713,
|
|
"num_tokens": 16049777.0,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"entropy": 5.83000955581665,
|
|
"epoch": 0.7309388783868935,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.0004952914292994928,
|
|
"loss": 5.659,
|
|
"mean_token_accuracy": 0.15439933240413667,
|
|
"num_tokens": 16059093.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 5.840744495391846,
|
|
"epoch": 0.7313589582020584,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004952853329479514,
|
|
"loss": 5.6861,
|
|
"mean_token_accuracy": 0.15537820011377335,
|
|
"num_tokens": 16068550.0,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"entropy": 5.810123777389526,
|
|
"epoch": 0.7317790380172233,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004952792326941686,
|
|
"loss": 5.7191,
|
|
"mean_token_accuracy": 0.14849043488502503,
|
|
"num_tokens": 16078286.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 5.814086198806763,
|
|
"epoch": 0.7321991178323881,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004952731285382527,
|
|
"loss": 5.6667,
|
|
"mean_token_accuracy": 0.15178068578243256,
|
|
"num_tokens": 16087560.0,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"entropy": 5.787434864044189,
|
|
"epoch": 0.732619197647553,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004952670204803118,
|
|
"loss": 5.6204,
|
|
"mean_token_accuracy": 0.1559364140033722,
|
|
"num_tokens": 16097478.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 5.850944232940674,
|
|
"epoch": 0.7330392774627179,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004952609085204539,
|
|
"loss": 5.7189,
|
|
"mean_token_accuracy": 0.15533626079559326,
|
|
"num_tokens": 16106884.0,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"entropy": 5.731724834442138,
|
|
"epoch": 0.7334593572778828,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004952547926587876,
|
|
"loss": 5.6334,
|
|
"mean_token_accuracy": 0.15004593282938003,
|
|
"num_tokens": 16115689.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 5.7415611743927,
|
|
"epoch": 0.7338794370930477,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0004952486728954209,
|
|
"loss": 5.5761,
|
|
"mean_token_accuracy": 0.1599406212568283,
|
|
"num_tokens": 16125237.0,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"entropy": 5.7435039520263675,
|
|
"epoch": 0.7342995169082126,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004952425492304624,
|
|
"loss": 5.5816,
|
|
"mean_token_accuracy": 0.15830608755350112,
|
|
"num_tokens": 16133940.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 5.803058242797851,
|
|
"epoch": 0.7347195967233774,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004952364216640207,
|
|
"loss": 5.6865,
|
|
"mean_token_accuracy": 0.15288463681936265,
|
|
"num_tokens": 16143256.0,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"entropy": 5.834009265899658,
|
|
"epoch": 0.7351396765385423,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000495230290196204,
|
|
"loss": 5.5648,
|
|
"mean_token_accuracy": 0.15222593396902084,
|
|
"num_tokens": 16153259.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 5.86444673538208,
|
|
"epoch": 0.7355597563537072,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004952241548271212,
|
|
"loss": 5.8055,
|
|
"mean_token_accuracy": 0.14142679050564766,
|
|
"num_tokens": 16162125.0,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"entropy": 5.84849967956543,
|
|
"epoch": 0.7359798361688721,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004952180155568809,
|
|
"loss": 5.7224,
|
|
"mean_token_accuracy": 0.14703101068735122,
|
|
"num_tokens": 16171680.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 5.853292989730835,
|
|
"epoch": 0.736399915984037,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004952118723855919,
|
|
"loss": 5.7153,
|
|
"mean_token_accuracy": 0.15350899547338487,
|
|
"num_tokens": 16181559.0,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"entropy": 5.755408191680909,
|
|
"epoch": 0.7368199957992019,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004952057253133628,
|
|
"loss": 5.668,
|
|
"mean_token_accuracy": 0.15180395692586898,
|
|
"num_tokens": 16190611.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 5.833858060836792,
|
|
"epoch": 0.7372400756143668,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004951995743403028,
|
|
"loss": 5.6769,
|
|
"mean_token_accuracy": 0.15253981202840805,
|
|
"num_tokens": 16200156.0,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"entropy": 5.824840307235718,
|
|
"epoch": 0.7376601554295316,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004951934194665208,
|
|
"loss": 5.6458,
|
|
"mean_token_accuracy": 0.14709821194410325,
|
|
"num_tokens": 16209808.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 5.756002902984619,
|
|
"epoch": 0.7380802352446965,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004951872606921257,
|
|
"loss": 5.6136,
|
|
"mean_token_accuracy": 0.15270906686782837,
|
|
"num_tokens": 16219243.0,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"entropy": 5.72284197807312,
|
|
"epoch": 0.7385003150598614,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004951810980172265,
|
|
"loss": 5.627,
|
|
"mean_token_accuracy": 0.1641955330967903,
|
|
"num_tokens": 16228180.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 5.785319805145264,
|
|
"epoch": 0.7389203948750263,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004951749314419327,
|
|
"loss": 5.6417,
|
|
"mean_token_accuracy": 0.15115589275956154,
|
|
"num_tokens": 16237045.0,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"entropy": 5.791619110107422,
|
|
"epoch": 0.7393404746901912,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004951687609663533,
|
|
"loss": 5.5589,
|
|
"mean_token_accuracy": 0.15952047407627107,
|
|
"num_tokens": 16245307.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 5.765593528747559,
|
|
"epoch": 0.739760554505356,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004951625865905977,
|
|
"loss": 5.5974,
|
|
"mean_token_accuracy": 0.14921371787786483,
|
|
"num_tokens": 16255047.0,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"entropy": 5.749333095550537,
|
|
"epoch": 0.740180634320521,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004951564083147753,
|
|
"loss": 5.6447,
|
|
"mean_token_accuracy": 0.1600167080760002,
|
|
"num_tokens": 16264969.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 5.81842737197876,
|
|
"epoch": 0.7406007141356857,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004951502261389953,
|
|
"loss": 5.7327,
|
|
"mean_token_accuracy": 0.14656912833452224,
|
|
"num_tokens": 16274757.0,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"entropy": 5.780880069732666,
|
|
"epoch": 0.7410207939508506,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004951440400633677,
|
|
"loss": 5.6351,
|
|
"mean_token_accuracy": 0.16265199482440948,
|
|
"num_tokens": 16283409.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 5.687593412399292,
|
|
"epoch": 0.7414408737660155,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004951378500880015,
|
|
"loss": 5.5962,
|
|
"mean_token_accuracy": 0.1549723207950592,
|
|
"num_tokens": 16293206.0,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"entropy": 5.82498950958252,
|
|
"epoch": 0.7418609535811804,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004951316562130067,
|
|
"loss": 5.6332,
|
|
"mean_token_accuracy": 0.15318880528211593,
|
|
"num_tokens": 16303121.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 5.778778553009033,
|
|
"epoch": 0.7422810333963453,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000495125458438493,
|
|
"loss": 5.5975,
|
|
"mean_token_accuracy": 0.16230110377073287,
|
|
"num_tokens": 16312710.0,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"entropy": 5.8864704132080075,
|
|
"epoch": 0.7427011132115102,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004951192567645702,
|
|
"loss": 5.7853,
|
|
"mean_token_accuracy": 0.14685365781188012,
|
|
"num_tokens": 16322280.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 5.721866273880005,
|
|
"epoch": 0.7431211930266751,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004951130511913481,
|
|
"loss": 5.635,
|
|
"mean_token_accuracy": 0.15453375428915023,
|
|
"num_tokens": 16331656.0,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"entropy": 5.7635541439056395,
|
|
"epoch": 0.7435412728418399,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004951068417189366,
|
|
"loss": 5.6607,
|
|
"mean_token_accuracy": 0.15400536656379699,
|
|
"num_tokens": 16341074.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 5.806599426269531,
|
|
"epoch": 0.7439613526570048,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004951006283474457,
|
|
"loss": 5.6525,
|
|
"mean_token_accuracy": 0.15177395343780517,
|
|
"num_tokens": 16350097.0,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"entropy": 5.6168114185333256,
|
|
"epoch": 0.7443814324721697,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004950944110769856,
|
|
"loss": 5.5518,
|
|
"mean_token_accuracy": 0.16385273784399032,
|
|
"num_tokens": 16359274.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 5.655103158950806,
|
|
"epoch": 0.7448015122873346,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004950881899076663,
|
|
"loss": 5.5365,
|
|
"mean_token_accuracy": 0.1682687819004059,
|
|
"num_tokens": 16368445.0,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"entropy": 5.878038167953491,
|
|
"epoch": 0.7452215921024995,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004950819648395979,
|
|
"loss": 5.6423,
|
|
"mean_token_accuracy": 0.1565190926194191,
|
|
"num_tokens": 16377689.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 5.772777366638183,
|
|
"epoch": 0.7456416719176644,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000495075735872891,
|
|
"loss": 5.5949,
|
|
"mean_token_accuracy": 0.1571029394865036,
|
|
"num_tokens": 16386713.0,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"entropy": 5.772426748275757,
|
|
"epoch": 0.7460617517328293,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004950695030076557,
|
|
"loss": 5.6116,
|
|
"mean_token_accuracy": 0.152817103266716,
|
|
"num_tokens": 16395390.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 5.862038803100586,
|
|
"epoch": 0.7464818315479941,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004950632662440027,
|
|
"loss": 5.6909,
|
|
"mean_token_accuracy": 0.15143778100609778,
|
|
"num_tokens": 16404531.0,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"entropy": 5.734190988540649,
|
|
"epoch": 0.746901911363159,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004950570255820419,
|
|
"loss": 5.5892,
|
|
"mean_token_accuracy": 0.15557831078767775,
|
|
"num_tokens": 16413649.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 5.679434442520142,
|
|
"epoch": 0.7473219911783239,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004950507810218843,
|
|
"loss": 5.7074,
|
|
"mean_token_accuracy": 0.14878712072968484,
|
|
"num_tokens": 16423247.0,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"entropy": 5.8338196754455565,
|
|
"epoch": 0.7477420709934888,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004950445325636405,
|
|
"loss": 5.649,
|
|
"mean_token_accuracy": 0.14864842891693114,
|
|
"num_tokens": 16432190.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 5.864486503601074,
|
|
"epoch": 0.7481621508086537,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004950382802074211,
|
|
"loss": 5.6038,
|
|
"mean_token_accuracy": 0.15934911370277405,
|
|
"num_tokens": 16443091.0,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"entropy": 5.711412811279297,
|
|
"epoch": 0.7485822306238186,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004950320239533369,
|
|
"loss": 5.6338,
|
|
"mean_token_accuracy": 0.15670278668403625,
|
|
"num_tokens": 16452077.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 5.8399248123168945,
|
|
"epoch": 0.7490023104389834,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004950257638014986,
|
|
"loss": 5.7602,
|
|
"mean_token_accuracy": 0.14474717825651168,
|
|
"num_tokens": 16461893.0,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"entropy": 5.905817985534668,
|
|
"epoch": 0.7494223902541483,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004950194997520172,
|
|
"loss": 5.5814,
|
|
"mean_token_accuracy": 0.1564013957977295,
|
|
"num_tokens": 16470904.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 5.779659080505371,
|
|
"epoch": 0.7498424700693131,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004950132318050037,
|
|
"loss": 5.6502,
|
|
"mean_token_accuracy": 0.14872682839632034,
|
|
"num_tokens": 16480130.0,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"entropy": 5.735926008224487,
|
|
"epoch": 0.750262549884478,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004950069599605691,
|
|
"loss": 5.7004,
|
|
"mean_token_accuracy": 0.1561155989766121,
|
|
"num_tokens": 16489485.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 5.7690812110900875,
|
|
"epoch": 0.750682629699643,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004950006842188245,
|
|
"loss": 5.6526,
|
|
"mean_token_accuracy": 0.15704655051231384,
|
|
"num_tokens": 16498529.0,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"entropy": 5.776333618164062,
|
|
"epoch": 0.7511027095148078,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.000494994404579881,
|
|
"loss": 5.5733,
|
|
"mean_token_accuracy": 0.1540952205657959,
|
|
"num_tokens": 16508094.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 5.810970735549927,
|
|
"epoch": 0.7515227893299727,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00049498812104385,
|
|
"loss": 5.6854,
|
|
"mean_token_accuracy": 0.14840709492564202,
|
|
"num_tokens": 16517620.0,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"entropy": 5.717817068099976,
|
|
"epoch": 0.7519428691451375,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004949818336108425,
|
|
"loss": 5.6743,
|
|
"mean_token_accuracy": 0.1453969433903694,
|
|
"num_tokens": 16526720.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 5.782077789306641,
|
|
"epoch": 0.7523629489603024,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004949755422809703,
|
|
"loss": 5.6349,
|
|
"mean_token_accuracy": 0.15297809839248658,
|
|
"num_tokens": 16535979.0,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"entropy": 5.789309072494507,
|
|
"epoch": 0.7527830287754673,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004949692470543446,
|
|
"loss": 5.518,
|
|
"mean_token_accuracy": 0.16405045241117477,
|
|
"num_tokens": 16544538.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 5.700740957260132,
|
|
"epoch": 0.7532031085906322,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004949629479310769,
|
|
"loss": 5.6021,
|
|
"mean_token_accuracy": 0.15271754264831544,
|
|
"num_tokens": 16553962.0,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"entropy": 5.7723414421081545,
|
|
"epoch": 0.7536231884057971,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004949566449112788,
|
|
"loss": 5.5341,
|
|
"mean_token_accuracy": 0.1600716605782509,
|
|
"num_tokens": 16562652.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 5.816875839233399,
|
|
"epoch": 0.754043268220962,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004949503379950621,
|
|
"loss": 5.6381,
|
|
"mean_token_accuracy": 0.15340977758169175,
|
|
"num_tokens": 16570887.0,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"entropy": 5.825795125961304,
|
|
"epoch": 0.7544633480361269,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004949440271825385,
|
|
"loss": 5.7669,
|
|
"mean_token_accuracy": 0.15065207779407502,
|
|
"num_tokens": 16581469.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 5.783386135101319,
|
|
"epoch": 0.7548834278512917,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004949377124738196,
|
|
"loss": 5.6376,
|
|
"mean_token_accuracy": 0.15028667375445365,
|
|
"num_tokens": 16590213.0,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"entropy": 5.759113931655884,
|
|
"epoch": 0.7553035076664566,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004949313938690174,
|
|
"loss": 5.6301,
|
|
"mean_token_accuracy": 0.1542770192027092,
|
|
"num_tokens": 16598384.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 5.692385244369507,
|
|
"epoch": 0.7557235874816215,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004949250713682438,
|
|
"loss": 5.6114,
|
|
"mean_token_accuracy": 0.15893905013799667,
|
|
"num_tokens": 16607670.0,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"entropy": 5.830786418914795,
|
|
"epoch": 0.7561436672967864,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004949187449716107,
|
|
"loss": 5.6932,
|
|
"mean_token_accuracy": 0.15244348496198654,
|
|
"num_tokens": 16617560.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.7561436672967864,
|
|
"eval_entropy": 5.638838640603793,
|
|
"eval_loss": 5.66161584854126,
|
|
"eval_mean_token_accuracy": 0.1600216546673523,
|
|
"eval_num_tokens": 16617560.0,
|
|
"eval_runtime": 27.3107,
|
|
"eval_samples_per_second": 1368.184,
|
|
"eval_steps_per_second": 171.032,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 5.768628692626953,
|
|
"epoch": 0.7565637471119513,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004949124146792304,
|
|
"loss": 5.6053,
|
|
"mean_token_accuracy": 0.15778433308005332,
|
|
"num_tokens": 16626038.0,
|
|
"step": 9005
|
|
},
|
|
{
|
|
"entropy": 5.731417560577393,
|
|
"epoch": 0.7569838269271162,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004949060804912149,
|
|
"loss": 5.6189,
|
|
"mean_token_accuracy": 0.15456314831972123,
|
|
"num_tokens": 16636490.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 5.799277114868164,
|
|
"epoch": 0.7574039067422811,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004948997424076764,
|
|
"loss": 5.6171,
|
|
"mean_token_accuracy": 0.15356937795877457,
|
|
"num_tokens": 16645369.0,
|
|
"step": 9015
|
|
},
|
|
{
|
|
"entropy": 5.897463607788086,
|
|
"epoch": 0.7578239865574459,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004948934004287272,
|
|
"loss": 5.7045,
|
|
"mean_token_accuracy": 0.15171536356210708,
|
|
"num_tokens": 16654348.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 5.868229866027832,
|
|
"epoch": 0.7582440663726108,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004948870545544796,
|
|
"loss": 5.6922,
|
|
"mean_token_accuracy": 0.14805838614702224,
|
|
"num_tokens": 16664009.0,
|
|
"step": 9025
|
|
},
|
|
{
|
|
"entropy": 5.773172187805176,
|
|
"epoch": 0.7586641461877757,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000494880704785046,
|
|
"loss": 5.7196,
|
|
"mean_token_accuracy": 0.14663708806037903,
|
|
"num_tokens": 16674079.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 5.847867155075074,
|
|
"epoch": 0.7590842260029406,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004948743511205392,
|
|
"loss": 5.6426,
|
|
"mean_token_accuracy": 0.1503751888871193,
|
|
"num_tokens": 16683687.0,
|
|
"step": 9035
|
|
},
|
|
{
|
|
"entropy": 5.763606691360474,
|
|
"epoch": 0.7595043058181055,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004948679935610712,
|
|
"loss": 5.5392,
|
|
"mean_token_accuracy": 0.1664429262280464,
|
|
"num_tokens": 16693311.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 5.767797994613647,
|
|
"epoch": 0.7599243856332704,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000494861632106755,
|
|
"loss": 5.5897,
|
|
"mean_token_accuracy": 0.15403168946504592,
|
|
"num_tokens": 16702121.0,
|
|
"step": 9045
|
|
},
|
|
{
|
|
"entropy": 5.778069067001343,
|
|
"epoch": 0.7603444654484351,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004948552667577033,
|
|
"loss": 5.6211,
|
|
"mean_token_accuracy": 0.1538814291357994,
|
|
"num_tokens": 16711883.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 5.786386919021607,
|
|
"epoch": 0.7607645452636,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004948488975140286,
|
|
"loss": 5.6847,
|
|
"mean_token_accuracy": 0.1501935139298439,
|
|
"num_tokens": 16721449.0,
|
|
"step": 9055
|
|
},
|
|
{
|
|
"entropy": 5.747914838790893,
|
|
"epoch": 0.7611846250787649,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.000494842524375844,
|
|
"loss": 5.6196,
|
|
"mean_token_accuracy": 0.15546474158763884,
|
|
"num_tokens": 16730068.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 5.748115968704224,
|
|
"epoch": 0.7616047048939298,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004948361473432623,
|
|
"loss": 5.6365,
|
|
"mean_token_accuracy": 0.15265990495681764,
|
|
"num_tokens": 16739970.0,
|
|
"step": 9065
|
|
},
|
|
{
|
|
"entropy": 5.84217677116394,
|
|
"epoch": 0.7620247847090947,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004948297664163964,
|
|
"loss": 5.7024,
|
|
"mean_token_accuracy": 0.1518349438905716,
|
|
"num_tokens": 16749461.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 5.866002225875855,
|
|
"epoch": 0.7624448645242596,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004948233815953593,
|
|
"loss": 5.7617,
|
|
"mean_token_accuracy": 0.15022132098674773,
|
|
"num_tokens": 16758747.0,
|
|
"step": 9075
|
|
},
|
|
{
|
|
"entropy": 5.733387041091919,
|
|
"epoch": 0.7628649443394245,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004948169928802643,
|
|
"loss": 5.4962,
|
|
"mean_token_accuracy": 0.1622622489929199,
|
|
"num_tokens": 16767212.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 5.798452520370484,
|
|
"epoch": 0.7632850241545893,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004948106002712245,
|
|
"loss": 5.6462,
|
|
"mean_token_accuracy": 0.15284293740987778,
|
|
"num_tokens": 16776514.0,
|
|
"step": 9085
|
|
},
|
|
{
|
|
"entropy": 5.7934057235717775,
|
|
"epoch": 0.7637051039697542,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004948042037683529,
|
|
"loss": 5.6197,
|
|
"mean_token_accuracy": 0.15112117901444436,
|
|
"num_tokens": 16786310.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 5.78909387588501,
|
|
"epoch": 0.7641251837849191,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004947978033717632,
|
|
"loss": 5.6515,
|
|
"mean_token_accuracy": 0.1501218557357788,
|
|
"num_tokens": 16795551.0,
|
|
"step": 9095
|
|
},
|
|
{
|
|
"entropy": 5.803013610839844,
|
|
"epoch": 0.764545263600084,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004947913990815684,
|
|
"loss": 5.6264,
|
|
"mean_token_accuracy": 0.15242374390363694,
|
|
"num_tokens": 16805099.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 5.776360607147216,
|
|
"epoch": 0.7649653434152489,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004947849908978824,
|
|
"loss": 5.6647,
|
|
"mean_token_accuracy": 0.15367067903280257,
|
|
"num_tokens": 16813963.0,
|
|
"step": 9105
|
|
},
|
|
{
|
|
"entropy": 5.838750600814819,
|
|
"epoch": 0.7653854232304138,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004947785788208182,
|
|
"loss": 5.6921,
|
|
"mean_token_accuracy": 0.1523756965994835,
|
|
"num_tokens": 16822814.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 5.851981353759766,
|
|
"epoch": 0.7658055030455787,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004947721628504898,
|
|
"loss": 5.7322,
|
|
"mean_token_accuracy": 0.15000171959400177,
|
|
"num_tokens": 16831906.0,
|
|
"step": 9115
|
|
},
|
|
{
|
|
"entropy": 5.700230932235717,
|
|
"epoch": 0.7662255828607435,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004947657429870108,
|
|
"loss": 5.5446,
|
|
"mean_token_accuracy": 0.1595570996403694,
|
|
"num_tokens": 16840050.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 5.705719661712647,
|
|
"epoch": 0.7666456626759084,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004947593192304946,
|
|
"loss": 5.5713,
|
|
"mean_token_accuracy": 0.15321452915668488,
|
|
"num_tokens": 16848404.0,
|
|
"step": 9125
|
|
},
|
|
{
|
|
"entropy": 5.759864425659179,
|
|
"epoch": 0.7670657424910733,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004947528915810554,
|
|
"loss": 5.5722,
|
|
"mean_token_accuracy": 0.1579087942838669,
|
|
"num_tokens": 16856568.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 5.756204128265381,
|
|
"epoch": 0.7674858223062382,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004947464600388066,
|
|
"loss": 5.6034,
|
|
"mean_token_accuracy": 0.15562164336442946,
|
|
"num_tokens": 16864936.0,
|
|
"step": 9135
|
|
},
|
|
{
|
|
"entropy": 5.9225013732910154,
|
|
"epoch": 0.7679059021214031,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004947400246038627,
|
|
"loss": 5.7416,
|
|
"mean_token_accuracy": 0.14872185736894608,
|
|
"num_tokens": 16874504.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 5.684078311920166,
|
|
"epoch": 0.768325981936568,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004947335852763374,
|
|
"loss": 5.4846,
|
|
"mean_token_accuracy": 0.15627673268318176,
|
|
"num_tokens": 16883365.0,
|
|
"step": 9145
|
|
},
|
|
{
|
|
"entropy": 5.801791000366211,
|
|
"epoch": 0.7687460617517329,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004947271420563447,
|
|
"loss": 5.7415,
|
|
"mean_token_accuracy": 0.14208680838346482,
|
|
"num_tokens": 16892701.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 5.769952487945557,
|
|
"epoch": 0.7691661415668977,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004947206949439989,
|
|
"loss": 5.553,
|
|
"mean_token_accuracy": 0.15000357180833818,
|
|
"num_tokens": 16901864.0,
|
|
"step": 9155
|
|
},
|
|
{
|
|
"entropy": 5.736040306091309,
|
|
"epoch": 0.7695862213820626,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000494714243939414,
|
|
"loss": 5.608,
|
|
"mean_token_accuracy": 0.15939729958772658,
|
|
"num_tokens": 16910908.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 5.721309995651245,
|
|
"epoch": 0.7700063011972275,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004947077890427045,
|
|
"loss": 5.6325,
|
|
"mean_token_accuracy": 0.15240202248096466,
|
|
"num_tokens": 16920299.0,
|
|
"step": 9165
|
|
},
|
|
{
|
|
"entropy": 5.880091524124145,
|
|
"epoch": 0.7704263810123924,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004947013302539846,
|
|
"loss": 5.7698,
|
|
"mean_token_accuracy": 0.1435159295797348,
|
|
"num_tokens": 16930027.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 5.877429723739624,
|
|
"epoch": 0.7708464608275573,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0004946948675733688,
|
|
"loss": 5.6626,
|
|
"mean_token_accuracy": 0.15365543216466904,
|
|
"num_tokens": 16939387.0,
|
|
"step": 9175
|
|
},
|
|
{
|
|
"entropy": 5.743030834197998,
|
|
"epoch": 0.7712665406427222,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004946884010009714,
|
|
"loss": 5.6303,
|
|
"mean_token_accuracy": 0.15409868359565734,
|
|
"num_tokens": 16950024.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 5.677621221542358,
|
|
"epoch": 0.771686620457887,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004946819305369073,
|
|
"loss": 5.525,
|
|
"mean_token_accuracy": 0.16152018159627915,
|
|
"num_tokens": 16958219.0,
|
|
"step": 9185
|
|
},
|
|
{
|
|
"entropy": 5.728807067871093,
|
|
"epoch": 0.7721067002730518,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004946754561812909,
|
|
"loss": 5.5102,
|
|
"mean_token_accuracy": 0.16226852238178252,
|
|
"num_tokens": 16966829.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 5.733729887008667,
|
|
"epoch": 0.7725267800882167,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004946689779342367,
|
|
"loss": 5.6145,
|
|
"mean_token_accuracy": 0.15137282758951187,
|
|
"num_tokens": 16975585.0,
|
|
"step": 9195
|
|
},
|
|
{
|
|
"entropy": 5.743972539901733,
|
|
"epoch": 0.7729468599033816,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004946624957958599,
|
|
"loss": 5.5951,
|
|
"mean_token_accuracy": 0.15704918652772903,
|
|
"num_tokens": 16984848.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 5.737395524978638,
|
|
"epoch": 0.7733669397185465,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000494656009766275,
|
|
"loss": 5.5839,
|
|
"mean_token_accuracy": 0.16066163033246994,
|
|
"num_tokens": 16993179.0,
|
|
"step": 9205
|
|
},
|
|
{
|
|
"entropy": 5.7287391185760494,
|
|
"epoch": 0.7737870195337114,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.000494649519845597,
|
|
"loss": 5.62,
|
|
"mean_token_accuracy": 0.15363839864730836,
|
|
"num_tokens": 17002563.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 5.80807056427002,
|
|
"epoch": 0.7742070993488763,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004946430260339409,
|
|
"loss": 5.6284,
|
|
"mean_token_accuracy": 0.15289961099624633,
|
|
"num_tokens": 17011805.0,
|
|
"step": 9215
|
|
},
|
|
{
|
|
"entropy": 5.779461622238159,
|
|
"epoch": 0.7746271791640411,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004946365283314216,
|
|
"loss": 5.5989,
|
|
"mean_token_accuracy": 0.1561885267496109,
|
|
"num_tokens": 17020398.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 5.694942331314087,
|
|
"epoch": 0.775047258979206,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004946300267381545,
|
|
"loss": 5.5753,
|
|
"mean_token_accuracy": 0.15811678916215896,
|
|
"num_tokens": 17030805.0,
|
|
"step": 9225
|
|
},
|
|
{
|
|
"entropy": 5.794308614730835,
|
|
"epoch": 0.7754673387943709,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004946235212542544,
|
|
"loss": 5.597,
|
|
"mean_token_accuracy": 0.1565954014658928,
|
|
"num_tokens": 17040164.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 5.77291522026062,
|
|
"epoch": 0.7758874186095358,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004946170118798367,
|
|
"loss": 5.67,
|
|
"mean_token_accuracy": 0.14761753827333451,
|
|
"num_tokens": 17049519.0,
|
|
"step": 9235
|
|
},
|
|
{
|
|
"entropy": 5.802110385894776,
|
|
"epoch": 0.7763074984247007,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004946104986150167,
|
|
"loss": 5.5979,
|
|
"mean_token_accuracy": 0.15635768324136734,
|
|
"num_tokens": 17058042.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 5.77113904953003,
|
|
"epoch": 0.7767275782398656,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004946039814599099,
|
|
"loss": 5.624,
|
|
"mean_token_accuracy": 0.15740283727645873,
|
|
"num_tokens": 17067107.0,
|
|
"step": 9245
|
|
},
|
|
{
|
|
"entropy": 5.784947872161865,
|
|
"epoch": 0.7771476580550305,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004945974604146316,
|
|
"loss": 5.7176,
|
|
"mean_token_accuracy": 0.15673644915223123,
|
|
"num_tokens": 17076975.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 5.760613203048706,
|
|
"epoch": 0.7775677378701953,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004945909354792974,
|
|
"loss": 5.5674,
|
|
"mean_token_accuracy": 0.15634535551071166,
|
|
"num_tokens": 17086405.0,
|
|
"step": 9255
|
|
},
|
|
{
|
|
"entropy": 5.718491649627685,
|
|
"epoch": 0.7779878176853602,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004945844066540229,
|
|
"loss": 5.6449,
|
|
"mean_token_accuracy": 0.1455477386713028,
|
|
"num_tokens": 17095333.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 5.7345335483551025,
|
|
"epoch": 0.7784078975005251,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004945778739389236,
|
|
"loss": 5.684,
|
|
"mean_token_accuracy": 0.150144724547863,
|
|
"num_tokens": 17103631.0,
|
|
"step": 9265
|
|
},
|
|
{
|
|
"entropy": 5.794864368438721,
|
|
"epoch": 0.77882797731569,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004945713373341152,
|
|
"loss": 5.5715,
|
|
"mean_token_accuracy": 0.15383470058441162,
|
|
"num_tokens": 17112612.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 5.823299360275269,
|
|
"epoch": 0.7792480571308549,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 0.0004945647968397139,
|
|
"loss": 5.6242,
|
|
"mean_token_accuracy": 0.15435410290956497,
|
|
"num_tokens": 17121592.0,
|
|
"step": 9275
|
|
},
|
|
{
|
|
"entropy": 5.742037677764893,
|
|
"epoch": 0.7796681369460198,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004945582524558352,
|
|
"loss": 5.6497,
|
|
"mean_token_accuracy": 0.15522131621837615,
|
|
"num_tokens": 17131003.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 5.8117687702178955,
|
|
"epoch": 0.7800882167611847,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000494551704182595,
|
|
"loss": 5.6434,
|
|
"mean_token_accuracy": 0.1501818783581257,
|
|
"num_tokens": 17140013.0,
|
|
"step": 9285
|
|
},
|
|
{
|
|
"entropy": 5.904456377029419,
|
|
"epoch": 0.7805082965763495,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004945451520201095,
|
|
"loss": 5.7995,
|
|
"mean_token_accuracy": 0.1440419152379036,
|
|
"num_tokens": 17150406.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 5.804939079284668,
|
|
"epoch": 0.7809283763915144,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004945385959684947,
|
|
"loss": 5.643,
|
|
"mean_token_accuracy": 0.15583974719047547,
|
|
"num_tokens": 17159757.0,
|
|
"step": 9295
|
|
},
|
|
{
|
|
"entropy": 5.787489128112793,
|
|
"epoch": 0.7813484562066793,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004945320360278667,
|
|
"loss": 5.6665,
|
|
"mean_token_accuracy": 0.15916707813739778,
|
|
"num_tokens": 17169317.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 5.814616775512695,
|
|
"epoch": 0.7817685360218442,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004945254721983416,
|
|
"loss": 5.6676,
|
|
"mean_token_accuracy": 0.1608291007578373,
|
|
"num_tokens": 17178410.0,
|
|
"step": 9305
|
|
},
|
|
{
|
|
"entropy": 5.825447988510132,
|
|
"epoch": 0.782188615837009,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.000494518904480036,
|
|
"loss": 5.5898,
|
|
"mean_token_accuracy": 0.15593952387571336,
|
|
"num_tokens": 17186922.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 5.802917385101319,
|
|
"epoch": 0.782608695652174,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004945123328730659,
|
|
"loss": 5.6666,
|
|
"mean_token_accuracy": 0.1478397913277149,
|
|
"num_tokens": 17197125.0,
|
|
"step": 9315
|
|
},
|
|
{
|
|
"entropy": 5.739556694030762,
|
|
"epoch": 0.7830287754673388,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.000494505757377548,
|
|
"loss": 5.597,
|
|
"mean_token_accuracy": 0.15432032942771912,
|
|
"num_tokens": 17206169.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 5.679258155822754,
|
|
"epoch": 0.7834488552825036,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004944991779935985,
|
|
"loss": 5.538,
|
|
"mean_token_accuracy": 0.15561339557170867,
|
|
"num_tokens": 17214607.0,
|
|
"step": 9325
|
|
},
|
|
{
|
|
"entropy": 5.689110612869262,
|
|
"epoch": 0.7838689350976685,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.000494492594721334,
|
|
"loss": 5.5188,
|
|
"mean_token_accuracy": 0.15666710287332536,
|
|
"num_tokens": 17223616.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 5.764066362380982,
|
|
"epoch": 0.7842890149128334,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004944860075608715,
|
|
"loss": 5.607,
|
|
"mean_token_accuracy": 0.15148743987083435,
|
|
"num_tokens": 17232729.0,
|
|
"step": 9335
|
|
},
|
|
{
|
|
"entropy": 5.747860622406006,
|
|
"epoch": 0.7847090947279983,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004944794165123272,
|
|
"loss": 5.6633,
|
|
"mean_token_accuracy": 0.1552363008260727,
|
|
"num_tokens": 17242128.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 5.7937798500061035,
|
|
"epoch": 0.7851291745431632,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.000494472821575818,
|
|
"loss": 5.572,
|
|
"mean_token_accuracy": 0.15619071274995805,
|
|
"num_tokens": 17250806.0,
|
|
"step": 9345
|
|
},
|
|
{
|
|
"entropy": 5.884761095046997,
|
|
"epoch": 0.7855492543583281,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004944662227514609,
|
|
"loss": 5.796,
|
|
"mean_token_accuracy": 0.14290329068899155,
|
|
"num_tokens": 17260888.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 5.765118503570557,
|
|
"epoch": 0.785969334173493,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004944596200393726,
|
|
"loss": 5.5632,
|
|
"mean_token_accuracy": 0.1571262151002884,
|
|
"num_tokens": 17270387.0,
|
|
"step": 9355
|
|
},
|
|
{
|
|
"entropy": 5.790839576721192,
|
|
"epoch": 0.7863894139886578,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004944530134396702,
|
|
"loss": 5.5971,
|
|
"mean_token_accuracy": 0.1504202328622341,
|
|
"num_tokens": 17279866.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 5.764979267120362,
|
|
"epoch": 0.7868094938038227,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004944464029524707,
|
|
"loss": 5.5927,
|
|
"mean_token_accuracy": 0.15793014466762542,
|
|
"num_tokens": 17289233.0,
|
|
"step": 9365
|
|
},
|
|
{
|
|
"entropy": 5.78815860748291,
|
|
"epoch": 0.7872295736189876,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000494439788577891,
|
|
"loss": 5.6811,
|
|
"mean_token_accuracy": 0.15233502089977263,
|
|
"num_tokens": 17298705.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 5.803197431564331,
|
|
"epoch": 0.7876496534341525,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004944331703160486,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.1556847333908081,
|
|
"num_tokens": 17307793.0,
|
|
"step": 9375
|
|
},
|
|
{
|
|
"entropy": 5.768749332427978,
|
|
"epoch": 0.7880697332493174,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004944265481670605,
|
|
"loss": 5.7109,
|
|
"mean_token_accuracy": 0.14565183371305465,
|
|
"num_tokens": 17318248.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 5.781773900985717,
|
|
"epoch": 0.7884898130644823,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004944199221310441,
|
|
"loss": 5.6174,
|
|
"mean_token_accuracy": 0.15221924781799318,
|
|
"num_tokens": 17327281.0,
|
|
"step": 9385
|
|
},
|
|
{
|
|
"entropy": 5.823486852645874,
|
|
"epoch": 0.7889098928796471,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004944132922081168,
|
|
"loss": 5.6269,
|
|
"mean_token_accuracy": 0.15858044922351838,
|
|
"num_tokens": 17336805.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 5.736378765106201,
|
|
"epoch": 0.789329972694812,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004944066583983961,
|
|
"loss": 5.5747,
|
|
"mean_token_accuracy": 0.15340599566698074,
|
|
"num_tokens": 17346024.0,
|
|
"step": 9395
|
|
},
|
|
{
|
|
"entropy": 5.739033269882202,
|
|
"epoch": 0.7897500525099769,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004944000207019992,
|
|
"loss": 5.6743,
|
|
"mean_token_accuracy": 0.15382137894630432,
|
|
"num_tokens": 17355100.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 5.865094900131226,
|
|
"epoch": 0.7901701323251418,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004943933791190441,
|
|
"loss": 5.7171,
|
|
"mean_token_accuracy": 0.14582199305295945,
|
|
"num_tokens": 17364769.0,
|
|
"step": 9405
|
|
},
|
|
{
|
|
"entropy": 5.805460023880005,
|
|
"epoch": 0.7905902121403067,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004943867336496482,
|
|
"loss": 5.5593,
|
|
"mean_token_accuracy": 0.156871497631073,
|
|
"num_tokens": 17374082.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 5.704965591430664,
|
|
"epoch": 0.7910102919554716,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004943800842939293,
|
|
"loss": 5.6061,
|
|
"mean_token_accuracy": 0.1573358103632927,
|
|
"num_tokens": 17383570.0,
|
|
"step": 9415
|
|
},
|
|
{
|
|
"entropy": 5.762260246276855,
|
|
"epoch": 0.7914303717706365,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.000494373431052005,
|
|
"loss": 5.6136,
|
|
"mean_token_accuracy": 0.15585907325148582,
|
|
"num_tokens": 17392105.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 5.754047203063965,
|
|
"epoch": 0.7918504515858013,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004943667739239935,
|
|
"loss": 5.5694,
|
|
"mean_token_accuracy": 0.1567780628800392,
|
|
"num_tokens": 17401363.0,
|
|
"step": 9425
|
|
},
|
|
{
|
|
"entropy": 5.831571578979492,
|
|
"epoch": 0.7922705314009661,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004943601129100125,
|
|
"loss": 5.5907,
|
|
"mean_token_accuracy": 0.15667269229888917,
|
|
"num_tokens": 17411333.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 5.808466386795044,
|
|
"epoch": 0.792690611216131,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004943534480101801,
|
|
"loss": 5.6449,
|
|
"mean_token_accuracy": 0.1564931645989418,
|
|
"num_tokens": 17421162.0,
|
|
"step": 9435
|
|
},
|
|
{
|
|
"entropy": 5.764466953277588,
|
|
"epoch": 0.793110691031296,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.0004943467792246142,
|
|
"loss": 5.5917,
|
|
"mean_token_accuracy": 0.1545848786830902,
|
|
"num_tokens": 17430119.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 5.799530792236328,
|
|
"epoch": 0.7935307708464608,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004943401065534332,
|
|
"loss": 5.6028,
|
|
"mean_token_accuracy": 0.1538163974881172,
|
|
"num_tokens": 17439617.0,
|
|
"step": 9445
|
|
},
|
|
{
|
|
"entropy": 5.715310573577881,
|
|
"epoch": 0.7939508506616257,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004943334299967551,
|
|
"loss": 5.7132,
|
|
"mean_token_accuracy": 0.14998757019639014,
|
|
"num_tokens": 17448720.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 5.697250175476074,
|
|
"epoch": 0.7943709304767906,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004943267495546982,
|
|
"loss": 5.5917,
|
|
"mean_token_accuracy": 0.16172372549772263,
|
|
"num_tokens": 17457458.0,
|
|
"step": 9455
|
|
},
|
|
{
|
|
"entropy": 5.81586275100708,
|
|
"epoch": 0.7947910102919554,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004943200652273809,
|
|
"loss": 5.6191,
|
|
"mean_token_accuracy": 0.15560947209596634,
|
|
"num_tokens": 17467095.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 5.75570330619812,
|
|
"epoch": 0.7952110901071203,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004943133770149216,
|
|
"loss": 5.657,
|
|
"mean_token_accuracy": 0.14877953082323075,
|
|
"num_tokens": 17476247.0,
|
|
"step": 9465
|
|
},
|
|
{
|
|
"entropy": 5.799701309204101,
|
|
"epoch": 0.7956311699222852,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004943066849174386,
|
|
"loss": 5.6635,
|
|
"mean_token_accuracy": 0.1575782373547554,
|
|
"num_tokens": 17486352.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 5.821471929550171,
|
|
"epoch": 0.7960512497374501,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004942999889350508,
|
|
"loss": 5.6216,
|
|
"mean_token_accuracy": 0.15541253834962845,
|
|
"num_tokens": 17495633.0,
|
|
"step": 9475
|
|
},
|
|
{
|
|
"entropy": 5.826534175872803,
|
|
"epoch": 0.796471329552615,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004942932890678765,
|
|
"loss": 5.6665,
|
|
"mean_token_accuracy": 0.14694230481982232,
|
|
"num_tokens": 17504325.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 5.7822521209716795,
|
|
"epoch": 0.7968914093677799,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004942865853160346,
|
|
"loss": 5.6862,
|
|
"mean_token_accuracy": 0.1536302775144577,
|
|
"num_tokens": 17513265.0,
|
|
"step": 9485
|
|
},
|
|
{
|
|
"entropy": 5.799659156799317,
|
|
"epoch": 0.7973114891829448,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004942798776796436,
|
|
"loss": 5.6811,
|
|
"mean_token_accuracy": 0.1501378260552883,
|
|
"num_tokens": 17522939.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 5.848496150970459,
|
|
"epoch": 0.7977315689981096,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004942731661588226,
|
|
"loss": 5.699,
|
|
"mean_token_accuracy": 0.1455768197774887,
|
|
"num_tokens": 17532250.0,
|
|
"step": 9495
|
|
},
|
|
{
|
|
"entropy": 5.859736204147339,
|
|
"epoch": 0.7981516488132745,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004942664507536904,
|
|
"loss": 5.7145,
|
|
"mean_token_accuracy": 0.1528845690190792,
|
|
"num_tokens": 17541368.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 5.775320148468017,
|
|
"epoch": 0.7985717286284394,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004942597314643659,
|
|
"loss": 5.6473,
|
|
"mean_token_accuracy": 0.15444121211767198,
|
|
"num_tokens": 17550871.0,
|
|
"step": 9505
|
|
},
|
|
{
|
|
"entropy": 5.797231960296631,
|
|
"epoch": 0.7989918084436043,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004942530082909681,
|
|
"loss": 5.5808,
|
|
"mean_token_accuracy": 0.16195199489593506,
|
|
"num_tokens": 17559683.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 5.815419673919678,
|
|
"epoch": 0.7994118882587692,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0004942462812336163,
|
|
"loss": 5.5933,
|
|
"mean_token_accuracy": 0.1550535589456558,
|
|
"num_tokens": 17568877.0,
|
|
"step": 9515
|
|
},
|
|
{
|
|
"entropy": 5.879213762283325,
|
|
"epoch": 0.7998319680739341,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004942395502924293,
|
|
"loss": 5.7466,
|
|
"mean_token_accuracy": 0.14571947157382964,
|
|
"num_tokens": 17578202.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 5.797115516662598,
|
|
"epoch": 0.800252047889099,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004942328154675268,
|
|
"loss": 5.577,
|
|
"mean_token_accuracy": 0.15988959819078447,
|
|
"num_tokens": 17587342.0,
|
|
"step": 9525
|
|
},
|
|
{
|
|
"entropy": 5.750249338150025,
|
|
"epoch": 0.8006721277042638,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004942260767590277,
|
|
"loss": 5.4334,
|
|
"mean_token_accuracy": 0.16428305059671403,
|
|
"num_tokens": 17595671.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 5.749629020690918,
|
|
"epoch": 0.8010922075194287,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004942193341670516,
|
|
"loss": 5.7607,
|
|
"mean_token_accuracy": 0.1483020693063736,
|
|
"num_tokens": 17605649.0,
|
|
"step": 9535
|
|
},
|
|
{
|
|
"entropy": 5.763780164718628,
|
|
"epoch": 0.8015122873345936,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 0.0004942125876917178,
|
|
"loss": 5.6478,
|
|
"mean_token_accuracy": 0.1507388584315777,
|
|
"num_tokens": 17615286.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 5.7505041599273685,
|
|
"epoch": 0.8019323671497585,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.000494205837333146,
|
|
"loss": 5.636,
|
|
"mean_token_accuracy": 0.1552906632423401,
|
|
"num_tokens": 17624583.0,
|
|
"step": 9545
|
|
},
|
|
{
|
|
"entropy": 5.812001085281372,
|
|
"epoch": 0.8023524469649234,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004941990830914557,
|
|
"loss": 5.6149,
|
|
"mean_token_accuracy": 0.15935958474874495,
|
|
"num_tokens": 17633894.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 5.8303131580352785,
|
|
"epoch": 0.8027725267800883,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004941923249667663,
|
|
"loss": 5.71,
|
|
"mean_token_accuracy": 0.149199178814888,
|
|
"num_tokens": 17643172.0,
|
|
"step": 9555
|
|
},
|
|
{
|
|
"entropy": 5.764499855041504,
|
|
"epoch": 0.803192606595253,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004941855629591979,
|
|
"loss": 5.5945,
|
|
"mean_token_accuracy": 0.15305460765957832,
|
|
"num_tokens": 17651901.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 5.755572938919068,
|
|
"epoch": 0.8036126864104179,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004941787970688701,
|
|
"loss": 5.5957,
|
|
"mean_token_accuracy": 0.15799273997545243,
|
|
"num_tokens": 17660806.0,
|
|
"step": 9565
|
|
},
|
|
{
|
|
"entropy": 5.837345361709595,
|
|
"epoch": 0.8040327662255828,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 0.0004941720272959027,
|
|
"loss": 5.6559,
|
|
"mean_token_accuracy": 0.16126096546649932,
|
|
"num_tokens": 17669157.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 5.719307231903076,
|
|
"epoch": 0.8044528460407477,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004941652536404157,
|
|
"loss": 5.5632,
|
|
"mean_token_accuracy": 0.15555428415536882,
|
|
"num_tokens": 17678664.0,
|
|
"step": 9575
|
|
},
|
|
{
|
|
"entropy": 5.79267258644104,
|
|
"epoch": 0.8048729258559126,
|
|
"grad_norm": 4.625,
|
|
"learning_rate": 0.0004941584761025291,
|
|
"loss": 5.6044,
|
|
"mean_token_accuracy": 0.15480156391859054,
|
|
"num_tokens": 17688252.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 5.7286498069763185,
|
|
"epoch": 0.8052930056710775,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.000494151694682363,
|
|
"loss": 5.6334,
|
|
"mean_token_accuracy": 0.1567763715982437,
|
|
"num_tokens": 17696473.0,
|
|
"step": 9585
|
|
},
|
|
{
|
|
"entropy": 5.752206754684448,
|
|
"epoch": 0.8057130854862424,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004941449093800374,
|
|
"loss": 5.6529,
|
|
"mean_token_accuracy": 0.15852190256118776,
|
|
"num_tokens": 17706177.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 5.7514872550964355,
|
|
"epoch": 0.8061331653014072,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004941381201956726,
|
|
"loss": 5.5015,
|
|
"mean_token_accuracy": 0.16315654218196868,
|
|
"num_tokens": 17715355.0,
|
|
"step": 9595
|
|
},
|
|
{
|
|
"entropy": 5.747640895843506,
|
|
"epoch": 0.8065532451165721,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004941313271293889,
|
|
"loss": 5.6008,
|
|
"mean_token_accuracy": 0.1622050292789936,
|
|
"num_tokens": 17724345.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 5.7168864727020265,
|
|
"epoch": 0.806973324931737,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004941245301813065,
|
|
"loss": 5.5143,
|
|
"mean_token_accuracy": 0.16414132565259934,
|
|
"num_tokens": 17732805.0,
|
|
"step": 9605
|
|
},
|
|
{
|
|
"entropy": 5.737041282653808,
|
|
"epoch": 0.8073934047469019,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004941177293515459,
|
|
"loss": 5.5799,
|
|
"mean_token_accuracy": 0.157880100607872,
|
|
"num_tokens": 17741963.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 5.705282735824585,
|
|
"epoch": 0.8078134845620668,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004941109246402275,
|
|
"loss": 5.5938,
|
|
"mean_token_accuracy": 0.151243394613266,
|
|
"num_tokens": 17751858.0,
|
|
"step": 9615
|
|
},
|
|
{
|
|
"entropy": 5.866819715499878,
|
|
"epoch": 0.8082335643772317,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0004941041160474721,
|
|
"loss": 5.7059,
|
|
"mean_token_accuracy": 0.1499703124165535,
|
|
"num_tokens": 17761152.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 5.859082937240601,
|
|
"epoch": 0.8086536441923966,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004940973035733999,
|
|
"loss": 5.6428,
|
|
"mean_token_accuracy": 0.15314959064126016,
|
|
"num_tokens": 17770493.0,
|
|
"step": 9625
|
|
},
|
|
{
|
|
"entropy": 5.887163877487183,
|
|
"epoch": 0.8090737240075614,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004940904872181318,
|
|
"loss": 5.6534,
|
|
"mean_token_accuracy": 0.15107578188180923,
|
|
"num_tokens": 17779871.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 5.8648134708404545,
|
|
"epoch": 0.8094938038227263,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004940836669817887,
|
|
"loss": 5.6633,
|
|
"mean_token_accuracy": 0.1496044009923935,
|
|
"num_tokens": 17788606.0,
|
|
"step": 9635
|
|
},
|
|
{
|
|
"entropy": 5.72215781211853,
|
|
"epoch": 0.8099138836378912,
|
|
"grad_norm": 4.96875,
|
|
"learning_rate": 0.0004940768428644911,
|
|
"loss": 5.5938,
|
|
"mean_token_accuracy": 0.1555838018655777,
|
|
"num_tokens": 17797458.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 5.666493082046509,
|
|
"epoch": 0.8103339634530561,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004940700148663601,
|
|
"loss": 5.5519,
|
|
"mean_token_accuracy": 0.1551619812846184,
|
|
"num_tokens": 17806902.0,
|
|
"step": 9645
|
|
},
|
|
{
|
|
"entropy": 5.774869537353515,
|
|
"epoch": 0.810754043268221,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004940631829875165,
|
|
"loss": 5.6878,
|
|
"mean_token_accuracy": 0.1476306848227978,
|
|
"num_tokens": 17816374.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 5.793194580078125,
|
|
"epoch": 0.8111741230833859,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004940563472280815,
|
|
"loss": 5.6585,
|
|
"mean_token_accuracy": 0.15614343285560608,
|
|
"num_tokens": 17825267.0,
|
|
"step": 9655
|
|
},
|
|
{
|
|
"entropy": 5.768211507797242,
|
|
"epoch": 0.8115942028985508,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004940495075881761,
|
|
"loss": 5.5722,
|
|
"mean_token_accuracy": 0.15710717141628266,
|
|
"num_tokens": 17834027.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 5.728369903564453,
|
|
"epoch": 0.8120142827137156,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004940426640679214,
|
|
"loss": 5.5753,
|
|
"mean_token_accuracy": 0.15249805226922036,
|
|
"num_tokens": 17843587.0,
|
|
"step": 9665
|
|
},
|
|
{
|
|
"entropy": 5.8064950466156,
|
|
"epoch": 0.8124343625288805,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004940358166674388,
|
|
"loss": 5.6147,
|
|
"mean_token_accuracy": 0.15565043687820435,
|
|
"num_tokens": 17852284.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 5.845684242248535,
|
|
"epoch": 0.8128544423440454,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004940289653868494,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.15545963644981384,
|
|
"num_tokens": 17860896.0,
|
|
"step": 9675
|
|
},
|
|
{
|
|
"entropy": 5.703367519378662,
|
|
"epoch": 0.8132745221592103,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004940221102262747,
|
|
"loss": 5.5942,
|
|
"mean_token_accuracy": 0.15159963369369506,
|
|
"num_tokens": 17870796.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 5.789257049560547,
|
|
"epoch": 0.8136946019743752,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.0004940152511858361,
|
|
"loss": 5.6788,
|
|
"mean_token_accuracy": 0.14908051788806914,
|
|
"num_tokens": 17880016.0,
|
|
"step": 9685
|
|
},
|
|
{
|
|
"entropy": 5.866326093673706,
|
|
"epoch": 0.81411468178954,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004940083882656551,
|
|
"loss": 5.7101,
|
|
"mean_token_accuracy": 0.14765079468488693,
|
|
"num_tokens": 17889348.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 5.818946790695191,
|
|
"epoch": 0.814534761604705,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004940015214658532,
|
|
"loss": 5.5647,
|
|
"mean_token_accuracy": 0.16243199706077577,
|
|
"num_tokens": 17898392.0,
|
|
"step": 9695
|
|
},
|
|
{
|
|
"entropy": 5.796739816665649,
|
|
"epoch": 0.8149548414198697,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004939946507865522,
|
|
"loss": 5.6743,
|
|
"mean_token_accuracy": 0.1524437814950943,
|
|
"num_tokens": 17907141.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 5.688076829910278,
|
|
"epoch": 0.8153749212350346,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004939877762278737,
|
|
"loss": 5.5548,
|
|
"mean_token_accuracy": 0.15888291895389556,
|
|
"num_tokens": 17915792.0,
|
|
"step": 9705
|
|
},
|
|
{
|
|
"entropy": 5.819617366790771,
|
|
"epoch": 0.8157950010501995,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004939808977899396,
|
|
"loss": 5.7061,
|
|
"mean_token_accuracy": 0.14910464882850646,
|
|
"num_tokens": 17925603.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 5.840267324447632,
|
|
"epoch": 0.8162150808653644,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004939740154728716,
|
|
"loss": 5.6424,
|
|
"mean_token_accuracy": 0.15840867161750793,
|
|
"num_tokens": 17934436.0,
|
|
"step": 9715
|
|
},
|
|
{
|
|
"entropy": 5.819521951675415,
|
|
"epoch": 0.8166351606805293,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004939671292767915,
|
|
"loss": 5.595,
|
|
"mean_token_accuracy": 0.16303292959928511,
|
|
"num_tokens": 17942969.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 5.824506616592407,
|
|
"epoch": 0.8170552404956942,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004939602392018216,
|
|
"loss": 5.6782,
|
|
"mean_token_accuracy": 0.15368429720401763,
|
|
"num_tokens": 17952053.0,
|
|
"step": 9725
|
|
},
|
|
{
|
|
"entropy": 5.7638860702514645,
|
|
"epoch": 0.817475320310859,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004939533452480839,
|
|
"loss": 5.6463,
|
|
"mean_token_accuracy": 0.15662853494286538,
|
|
"num_tokens": 17960707.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 5.850724697113037,
|
|
"epoch": 0.8178954001260239,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004939464474157003,
|
|
"loss": 5.7485,
|
|
"mean_token_accuracy": 0.143310609459877,
|
|
"num_tokens": 17971035.0,
|
|
"step": 9735
|
|
},
|
|
{
|
|
"entropy": 5.811854696273803,
|
|
"epoch": 0.8183154799411888,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004939395457047932,
|
|
"loss": 5.6147,
|
|
"mean_token_accuracy": 0.1503463938832283,
|
|
"num_tokens": 17980656.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 5.849935054779053,
|
|
"epoch": 0.8187355597563537,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004939326401154847,
|
|
"loss": 5.6425,
|
|
"mean_token_accuracy": 0.14753958508372306,
|
|
"num_tokens": 17990977.0,
|
|
"step": 9745
|
|
},
|
|
{
|
|
"entropy": 5.72039909362793,
|
|
"epoch": 0.8191556395715186,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004939257306478973,
|
|
"loss": 5.6386,
|
|
"mean_token_accuracy": 0.15436331778764725,
|
|
"num_tokens": 18000186.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 5.743032836914063,
|
|
"epoch": 0.8195757193866835,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004939188173021532,
|
|
"loss": 5.6294,
|
|
"mean_token_accuracy": 0.15535037443041802,
|
|
"num_tokens": 18010269.0,
|
|
"step": 9755
|
|
},
|
|
{
|
|
"entropy": 5.837440872192383,
|
|
"epoch": 0.8199957992018484,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004939119000783751,
|
|
"loss": 5.5548,
|
|
"mean_token_accuracy": 0.1628822222352028,
|
|
"num_tokens": 18018461.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 5.726272964477539,
|
|
"epoch": 0.8204158790170132,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004939049789766855,
|
|
"loss": 5.5727,
|
|
"mean_token_accuracy": 0.1559150367975235,
|
|
"num_tokens": 18027173.0,
|
|
"step": 9765
|
|
},
|
|
{
|
|
"entropy": 5.681005191802979,
|
|
"epoch": 0.8208359588321781,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004938980539972068,
|
|
"loss": 5.6704,
|
|
"mean_token_accuracy": 0.15305837988853455,
|
|
"num_tokens": 18036791.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 5.741180467605591,
|
|
"epoch": 0.821256038647343,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004938911251400617,
|
|
"loss": 5.6164,
|
|
"mean_token_accuracy": 0.15979565382003785,
|
|
"num_tokens": 18046908.0,
|
|
"step": 9775
|
|
},
|
|
{
|
|
"entropy": 5.690343570709229,
|
|
"epoch": 0.8216761184625079,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004938841924053731,
|
|
"loss": 5.5305,
|
|
"mean_token_accuracy": 0.166619610786438,
|
|
"num_tokens": 18055825.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 5.83678789138794,
|
|
"epoch": 0.8220961982776728,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004938772557932637,
|
|
"loss": 5.7218,
|
|
"mean_token_accuracy": 0.1443665809929371,
|
|
"num_tokens": 18065334.0,
|
|
"step": 9785
|
|
},
|
|
{
|
|
"entropy": 5.830995225906372,
|
|
"epoch": 0.8225162780928377,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004938703153038565,
|
|
"loss": 5.585,
|
|
"mean_token_accuracy": 0.15924161821603774,
|
|
"num_tokens": 18073999.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 5.664001035690307,
|
|
"epoch": 0.8229363579080026,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004938633709372744,
|
|
"loss": 5.6106,
|
|
"mean_token_accuracy": 0.15344761908054352,
|
|
"num_tokens": 18083665.0,
|
|
"step": 9795
|
|
},
|
|
{
|
|
"entropy": 5.735060787200927,
|
|
"epoch": 0.8233564377231674,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004938564226936403,
|
|
"loss": 5.6081,
|
|
"mean_token_accuracy": 0.15541263967752456,
|
|
"num_tokens": 18092501.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 5.726347208023071,
|
|
"epoch": 0.8237765175383323,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004938494705730773,
|
|
"loss": 5.5879,
|
|
"mean_token_accuracy": 0.15256380438804626,
|
|
"num_tokens": 18101320.0,
|
|
"step": 9805
|
|
},
|
|
{
|
|
"entropy": 5.76941032409668,
|
|
"epoch": 0.8241965973534972,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004938425145757087,
|
|
"loss": 5.6155,
|
|
"mean_token_accuracy": 0.15062929540872574,
|
|
"num_tokens": 18110190.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 5.77293291091919,
|
|
"epoch": 0.824616677168662,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004938355547016577,
|
|
"loss": 5.6121,
|
|
"mean_token_accuracy": 0.15612404122948648,
|
|
"num_tokens": 18119301.0,
|
|
"step": 9815
|
|
},
|
|
{
|
|
"entropy": 5.836658191680908,
|
|
"epoch": 0.825036756983827,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004938285909510474,
|
|
"loss": 5.6581,
|
|
"mean_token_accuracy": 0.15127312690019606,
|
|
"num_tokens": 18128959.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 5.728819894790649,
|
|
"epoch": 0.8254568367989918,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004938216233240014,
|
|
"loss": 5.6313,
|
|
"mean_token_accuracy": 0.15713003724813462,
|
|
"num_tokens": 18138156.0,
|
|
"step": 9825
|
|
},
|
|
{
|
|
"entropy": 5.834373140335083,
|
|
"epoch": 0.8258769166141567,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000493814651820643,
|
|
"loss": 5.6643,
|
|
"mean_token_accuracy": 0.14810227751731872,
|
|
"num_tokens": 18147244.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 5.870449686050415,
|
|
"epoch": 0.8262969964293215,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004938076764410956,
|
|
"loss": 5.6655,
|
|
"mean_token_accuracy": 0.15398952662944793,
|
|
"num_tokens": 18156040.0,
|
|
"step": 9835
|
|
},
|
|
{
|
|
"entropy": 5.834972286224366,
|
|
"epoch": 0.8267170762444864,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.000493800697185483,
|
|
"loss": 5.5916,
|
|
"mean_token_accuracy": 0.14924859553575515,
|
|
"num_tokens": 18165210.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 5.787695646286011,
|
|
"epoch": 0.8271371560596513,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004937937140539288,
|
|
"loss": 5.6591,
|
|
"mean_token_accuracy": 0.15217285007238388,
|
|
"num_tokens": 18174841.0,
|
|
"step": 9845
|
|
},
|
|
{
|
|
"entropy": 5.705031299591065,
|
|
"epoch": 0.8275572358748162,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004937867270465564,
|
|
"loss": 5.5282,
|
|
"mean_token_accuracy": 0.1546058475971222,
|
|
"num_tokens": 18184112.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 5.810121345520019,
|
|
"epoch": 0.8279773156899811,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004937797361634899,
|
|
"loss": 5.7327,
|
|
"mean_token_accuracy": 0.15010684877634048,
|
|
"num_tokens": 18193564.0,
|
|
"step": 9855
|
|
},
|
|
{
|
|
"entropy": 5.696271514892578,
|
|
"epoch": 0.828397395505146,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000493772741404853,
|
|
"loss": 5.4681,
|
|
"mean_token_accuracy": 0.1613880753517151,
|
|
"num_tokens": 18202836.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 5.766516923904419,
|
|
"epoch": 0.8288174753203108,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004937657427707698,
|
|
"loss": 5.5918,
|
|
"mean_token_accuracy": 0.16525972336530687,
|
|
"num_tokens": 18212098.0,
|
|
"step": 9865
|
|
},
|
|
{
|
|
"entropy": 5.790452575683593,
|
|
"epoch": 0.8292375551354757,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004937587402613639,
|
|
"loss": 5.6181,
|
|
"mean_token_accuracy": 0.15452115386724471,
|
|
"num_tokens": 18221541.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 5.693503141403198,
|
|
"epoch": 0.8296576349506406,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004937517338767597,
|
|
"loss": 5.6181,
|
|
"mean_token_accuracy": 0.14959986433386802,
|
|
"num_tokens": 18231015.0,
|
|
"step": 9875
|
|
},
|
|
{
|
|
"entropy": 5.776920127868652,
|
|
"epoch": 0.8300777147658055,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004937447236170811,
|
|
"loss": 5.6442,
|
|
"mean_token_accuracy": 0.15097325891256333,
|
|
"num_tokens": 18239729.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 5.846532917022705,
|
|
"epoch": 0.8304977945809704,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004937377094824523,
|
|
"loss": 5.6934,
|
|
"mean_token_accuracy": 0.14850014224648475,
|
|
"num_tokens": 18249773.0,
|
|
"step": 9885
|
|
},
|
|
{
|
|
"entropy": 5.829236078262329,
|
|
"epoch": 0.8309178743961353,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004937306914729977,
|
|
"loss": 5.6466,
|
|
"mean_token_accuracy": 0.14962287619709969,
|
|
"num_tokens": 18259179.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 5.640655469894409,
|
|
"epoch": 0.8313379542113002,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004937236695888416,
|
|
"loss": 5.5285,
|
|
"mean_token_accuracy": 0.16359366923570634,
|
|
"num_tokens": 18268164.0,
|
|
"step": 9895
|
|
},
|
|
{
|
|
"entropy": 5.7921144485473635,
|
|
"epoch": 0.831758034026465,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004937166438301082,
|
|
"loss": 5.7047,
|
|
"mean_token_accuracy": 0.15264711230993272,
|
|
"num_tokens": 18276259.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 5.795594167709351,
|
|
"epoch": 0.8321781138416299,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004937096141969221,
|
|
"loss": 5.6749,
|
|
"mean_token_accuracy": 0.15689299032092094,
|
|
"num_tokens": 18285729.0,
|
|
"step": 9905
|
|
},
|
|
{
|
|
"entropy": 5.905335474014282,
|
|
"epoch": 0.8325981936567948,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004937025806894077,
|
|
"loss": 5.8351,
|
|
"mean_token_accuracy": 0.139414294809103,
|
|
"num_tokens": 18295873.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 5.881864213943482,
|
|
"epoch": 0.8330182734719597,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004936955433076899,
|
|
"loss": 5.6606,
|
|
"mean_token_accuracy": 0.15778864026069642,
|
|
"num_tokens": 18305135.0,
|
|
"step": 9915
|
|
},
|
|
{
|
|
"entropy": 5.854172706604004,
|
|
"epoch": 0.8334383532871246,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.000493688502051893,
|
|
"loss": 5.7077,
|
|
"mean_token_accuracy": 0.15348225384950637,
|
|
"num_tokens": 18314251.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 5.7477837085723875,
|
|
"epoch": 0.8338584331022895,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004936814569221421,
|
|
"loss": 5.5373,
|
|
"mean_token_accuracy": 0.16807708740234376,
|
|
"num_tokens": 18322863.0,
|
|
"step": 9925
|
|
},
|
|
{
|
|
"entropy": 5.710943984985351,
|
|
"epoch": 0.8342785129174544,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004936744079185616,
|
|
"loss": 5.5515,
|
|
"mean_token_accuracy": 0.15136271864175796,
|
|
"num_tokens": 18332129.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 5.780642127990722,
|
|
"epoch": 0.8346985927326191,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004936673550412767,
|
|
"loss": 5.6502,
|
|
"mean_token_accuracy": 0.15562164932489395,
|
|
"num_tokens": 18341457.0,
|
|
"step": 9935
|
|
},
|
|
{
|
|
"entropy": 5.817247200012207,
|
|
"epoch": 0.835118672547784,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000493660298290412,
|
|
"loss": 5.6352,
|
|
"mean_token_accuracy": 0.14964016079902648,
|
|
"num_tokens": 18351397.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 5.7535981178283695,
|
|
"epoch": 0.8355387523629489,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004936532376660929,
|
|
"loss": 5.5601,
|
|
"mean_token_accuracy": 0.15686817914247514,
|
|
"num_tokens": 18360005.0,
|
|
"step": 9945
|
|
},
|
|
{
|
|
"entropy": 5.856048727035523,
|
|
"epoch": 0.8359588321781138,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004936461731684442,
|
|
"loss": 5.6621,
|
|
"mean_token_accuracy": 0.15645960420370103,
|
|
"num_tokens": 18369707.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 5.904961681365966,
|
|
"epoch": 0.8363789119932787,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 0.0004936391047975912,
|
|
"loss": 5.7951,
|
|
"mean_token_accuracy": 0.14975984990596772,
|
|
"num_tokens": 18379514.0,
|
|
"step": 9955
|
|
},
|
|
{
|
|
"entropy": 5.728058910369873,
|
|
"epoch": 0.8367989918084436,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004936320325536589,
|
|
"loss": 5.4989,
|
|
"mean_token_accuracy": 0.15699619948863983,
|
|
"num_tokens": 18388854.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 5.80841555595398,
|
|
"epoch": 0.8372190716236085,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004936249564367729,
|
|
"loss": 5.6713,
|
|
"mean_token_accuracy": 0.15378804504871368,
|
|
"num_tokens": 18397806.0,
|
|
"step": 9965
|
|
},
|
|
{
|
|
"entropy": 5.713347768783569,
|
|
"epoch": 0.8376391514387733,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004936178764470583,
|
|
"loss": 5.5296,
|
|
"mean_token_accuracy": 0.1534825384616852,
|
|
"num_tokens": 18406645.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 5.6835887908935545,
|
|
"epoch": 0.8380592312539382,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004936107925846405,
|
|
"loss": 5.5458,
|
|
"mean_token_accuracy": 0.15742876827716829,
|
|
"num_tokens": 18415730.0,
|
|
"step": 9975
|
|
},
|
|
{
|
|
"entropy": 5.7930676460266115,
|
|
"epoch": 0.8384793110691031,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004936037048496452,
|
|
"loss": 5.6499,
|
|
"mean_token_accuracy": 0.1560029774904251,
|
|
"num_tokens": 18424638.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 5.799233627319336,
|
|
"epoch": 0.838899390884268,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004935966132421977,
|
|
"loss": 5.6852,
|
|
"mean_token_accuracy": 0.14873172864317893,
|
|
"num_tokens": 18434090.0,
|
|
"step": 9985
|
|
},
|
|
{
|
|
"entropy": 5.67788405418396,
|
|
"epoch": 0.8393194706994329,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004935895177624239,
|
|
"loss": 5.5532,
|
|
"mean_token_accuracy": 0.1584454283118248,
|
|
"num_tokens": 18442965.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 5.811638116836548,
|
|
"epoch": 0.8397395505145978,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004935824184104493,
|
|
"loss": 5.5789,
|
|
"mean_token_accuracy": 0.1549446702003479,
|
|
"num_tokens": 18451553.0,
|
|
"step": 9995
|
|
},
|
|
{
|
|
"entropy": 5.778439950942993,
|
|
"epoch": 0.8401596303297627,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004935753151863997,
|
|
"loss": 5.6168,
|
|
"mean_token_accuracy": 0.15213518738746643,
|
|
"num_tokens": 18461325.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 5.781700515747071,
|
|
"epoch": 0.8405797101449275,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004935682080904009,
|
|
"loss": 5.6206,
|
|
"mean_token_accuracy": 0.16005493104457855,
|
|
"num_tokens": 18469977.0,
|
|
"step": 10005
|
|
},
|
|
{
|
|
"entropy": 5.758043384552002,
|
|
"epoch": 0.8409997899600924,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004935610971225789,
|
|
"loss": 5.5862,
|
|
"mean_token_accuracy": 0.1575999900698662,
|
|
"num_tokens": 18479534.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 5.688985300064087,
|
|
"epoch": 0.8414198697752573,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004935539822830597,
|
|
"loss": 5.6943,
|
|
"mean_token_accuracy": 0.14613962322473525,
|
|
"num_tokens": 18488800.0,
|
|
"step": 10015
|
|
},
|
|
{
|
|
"entropy": 5.7671685218811035,
|
|
"epoch": 0.8418399495904222,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000493546863571969,
|
|
"loss": 5.6557,
|
|
"mean_token_accuracy": 0.1554260805249214,
|
|
"num_tokens": 18498083.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 5.817663335800171,
|
|
"epoch": 0.8422600294055871,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004935397409894333,
|
|
"loss": 5.6099,
|
|
"mean_token_accuracy": 0.14785023778676987,
|
|
"num_tokens": 18508265.0,
|
|
"step": 10025
|
|
},
|
|
{
|
|
"entropy": 5.810160112380982,
|
|
"epoch": 0.842680109220752,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004935326145355787,
|
|
"loss": 5.6445,
|
|
"mean_token_accuracy": 0.15227773338556289,
|
|
"num_tokens": 18517283.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 5.775955724716186,
|
|
"epoch": 0.8431001890359168,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004935254842105311,
|
|
"loss": 5.6577,
|
|
"mean_token_accuracy": 0.158540278673172,
|
|
"num_tokens": 18526482.0,
|
|
"step": 10035
|
|
},
|
|
{
|
|
"entropy": 5.6810362339019775,
|
|
"epoch": 0.8435202688510817,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004935183500144173,
|
|
"loss": 5.4966,
|
|
"mean_token_accuracy": 0.16830503046512604,
|
|
"num_tokens": 18536150.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 5.821089220046997,
|
|
"epoch": 0.8439403486662466,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004935112119473634,
|
|
"loss": 5.6978,
|
|
"mean_token_accuracy": 0.15066490024328233,
|
|
"num_tokens": 18545168.0,
|
|
"step": 10045
|
|
},
|
|
{
|
|
"entropy": 5.785538864135742,
|
|
"epoch": 0.8443604284814115,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004935040700094959,
|
|
"loss": 5.6256,
|
|
"mean_token_accuracy": 0.15842598676681519,
|
|
"num_tokens": 18553363.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 5.740128374099731,
|
|
"epoch": 0.8447805082965764,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004934969242009412,
|
|
"loss": 5.5817,
|
|
"mean_token_accuracy": 0.15919749736785888,
|
|
"num_tokens": 18562546.0,
|
|
"step": 10055
|
|
},
|
|
{
|
|
"entropy": 5.705161762237549,
|
|
"epoch": 0.8452005881117413,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004934897745218262,
|
|
"loss": 5.6338,
|
|
"mean_token_accuracy": 0.15164628773927688,
|
|
"num_tokens": 18572149.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 5.729842662811279,
|
|
"epoch": 0.8456206679269062,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004934826209722772,
|
|
"loss": 5.5077,
|
|
"mean_token_accuracy": 0.1547485738992691,
|
|
"num_tokens": 18580842.0,
|
|
"step": 10065
|
|
},
|
|
{
|
|
"entropy": 5.7600654602050785,
|
|
"epoch": 0.8460407477420709,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004934754635524211,
|
|
"loss": 5.6115,
|
|
"mean_token_accuracy": 0.15985522121191026,
|
|
"num_tokens": 18589765.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 5.762496757507324,
|
|
"epoch": 0.8464608275572358,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004934683022623847,
|
|
"loss": 5.6401,
|
|
"mean_token_accuracy": 0.15011052042245865,
|
|
"num_tokens": 18599532.0,
|
|
"step": 10075
|
|
},
|
|
{
|
|
"entropy": 5.685576343536377,
|
|
"epoch": 0.8468809073724007,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004934611371022947,
|
|
"loss": 5.5281,
|
|
"mean_token_accuracy": 0.16043669879436492,
|
|
"num_tokens": 18608438.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 5.787454748153687,
|
|
"epoch": 0.8473009871875656,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004934539680722783,
|
|
"loss": 5.6793,
|
|
"mean_token_accuracy": 0.1521899461746216,
|
|
"num_tokens": 18617313.0,
|
|
"step": 10085
|
|
},
|
|
{
|
|
"entropy": 5.731491613388061,
|
|
"epoch": 0.8477210670027305,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004934467951724622,
|
|
"loss": 5.5123,
|
|
"mean_token_accuracy": 0.1605857416987419,
|
|
"num_tokens": 18625880.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 5.730096912384033,
|
|
"epoch": 0.8481411468178954,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004934396184029737,
|
|
"loss": 5.6046,
|
|
"mean_token_accuracy": 0.15527373552322388,
|
|
"num_tokens": 18635727.0,
|
|
"step": 10095
|
|
},
|
|
{
|
|
"entropy": 5.769042825698852,
|
|
"epoch": 0.8485612266330603,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004934324377639398,
|
|
"loss": 5.662,
|
|
"mean_token_accuracy": 0.15308721214532853,
|
|
"num_tokens": 18645619.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 5.736938428878784,
|
|
"epoch": 0.8489813064482251,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004934252532554878,
|
|
"loss": 5.5544,
|
|
"mean_token_accuracy": 0.1575164332985878,
|
|
"num_tokens": 18654901.0,
|
|
"step": 10105
|
|
},
|
|
{
|
|
"entropy": 5.844228029251099,
|
|
"epoch": 0.84940138626339,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004934180648777449,
|
|
"loss": 5.8122,
|
|
"mean_token_accuracy": 0.15224194526672363,
|
|
"num_tokens": 18664523.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 5.8306056499481205,
|
|
"epoch": 0.8498214660785549,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004934108726308384,
|
|
"loss": 5.6362,
|
|
"mean_token_accuracy": 0.14759955704212188,
|
|
"num_tokens": 18673685.0,
|
|
"step": 10115
|
|
},
|
|
{
|
|
"entropy": 5.767707586288452,
|
|
"epoch": 0.8502415458937198,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004934036765148958,
|
|
"loss": 5.6142,
|
|
"mean_token_accuracy": 0.14617660790681838,
|
|
"num_tokens": 18682889.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 5.758945083618164,
|
|
"epoch": 0.8506616257088847,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004933964765300446,
|
|
"loss": 5.6533,
|
|
"mean_token_accuracy": 0.15302490592002868,
|
|
"num_tokens": 18692978.0,
|
|
"step": 10125
|
|
},
|
|
{
|
|
"entropy": 5.750522422790527,
|
|
"epoch": 0.8510817055240496,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000493389272676412,
|
|
"loss": 5.5705,
|
|
"mean_token_accuracy": 0.1600403904914856,
|
|
"num_tokens": 18701846.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 5.79836106300354,
|
|
"epoch": 0.8515017853392145,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004933820649541262,
|
|
"loss": 5.5935,
|
|
"mean_token_accuracy": 0.16571370661258697,
|
|
"num_tokens": 18711492.0,
|
|
"step": 10135
|
|
},
|
|
{
|
|
"entropy": 5.670457267761231,
|
|
"epoch": 0.8519218651543793,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004933748533633145,
|
|
"loss": 5.5244,
|
|
"mean_token_accuracy": 0.16938419491052628,
|
|
"num_tokens": 18720407.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 5.713903999328613,
|
|
"epoch": 0.8523419449695442,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004933676379041045,
|
|
"loss": 5.5771,
|
|
"mean_token_accuracy": 0.1604509249329567,
|
|
"num_tokens": 18729968.0,
|
|
"step": 10145
|
|
},
|
|
{
|
|
"entropy": 5.8019672393798825,
|
|
"epoch": 0.8527620247847091,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004933604185766245,
|
|
"loss": 5.6939,
|
|
"mean_token_accuracy": 0.1484614282846451,
|
|
"num_tokens": 18739525.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 5.755314731597901,
|
|
"epoch": 0.853182104599874,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004933531953810019,
|
|
"loss": 5.5984,
|
|
"mean_token_accuracy": 0.15788624286651612,
|
|
"num_tokens": 18749087.0,
|
|
"step": 10155
|
|
},
|
|
{
|
|
"entropy": 5.818537855148316,
|
|
"epoch": 0.8536021844150389,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004933459683173652,
|
|
"loss": 5.6259,
|
|
"mean_token_accuracy": 0.1562245801091194,
|
|
"num_tokens": 18758174.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 5.796029376983642,
|
|
"epoch": 0.8540222642302038,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004933387373858418,
|
|
"loss": 5.6637,
|
|
"mean_token_accuracy": 0.15472310557961463,
|
|
"num_tokens": 18767679.0,
|
|
"step": 10165
|
|
},
|
|
{
|
|
"entropy": 5.743490171432495,
|
|
"epoch": 0.8544423440453687,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004933315025865602,
|
|
"loss": 5.5875,
|
|
"mean_token_accuracy": 0.15303896814584733,
|
|
"num_tokens": 18776749.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 5.814285850524902,
|
|
"epoch": 0.8548624238605335,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004933242639196485,
|
|
"loss": 5.7667,
|
|
"mean_token_accuracy": 0.14032013416290284,
|
|
"num_tokens": 18786313.0,
|
|
"step": 10175
|
|
},
|
|
{
|
|
"entropy": 5.87596173286438,
|
|
"epoch": 0.8552825036756984,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004933170213852348,
|
|
"loss": 5.632,
|
|
"mean_token_accuracy": 0.15269517451524733,
|
|
"num_tokens": 18795340.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 5.749491739273071,
|
|
"epoch": 0.8557025834908633,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004933097749834476,
|
|
"loss": 5.5675,
|
|
"mean_token_accuracy": 0.1547122523188591,
|
|
"num_tokens": 18804114.0,
|
|
"step": 10185
|
|
},
|
|
{
|
|
"entropy": 5.750264501571655,
|
|
"epoch": 0.8561226633060282,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.000493302524714415,
|
|
"loss": 5.5798,
|
|
"mean_token_accuracy": 0.1528068631887436,
|
|
"num_tokens": 18813797.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 5.751224422454834,
|
|
"epoch": 0.856542743121193,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004932952705782657,
|
|
"loss": 5.631,
|
|
"mean_token_accuracy": 0.15325366854667663,
|
|
"num_tokens": 18822410.0,
|
|
"step": 10195
|
|
},
|
|
{
|
|
"entropy": 5.709691667556763,
|
|
"epoch": 0.856962822936358,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000493288012575128,
|
|
"loss": 5.5632,
|
|
"mean_token_accuracy": 0.1608235776424408,
|
|
"num_tokens": 18832091.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 5.747391223907471,
|
|
"epoch": 0.8573829027515227,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004932807507051307,
|
|
"loss": 5.5981,
|
|
"mean_token_accuracy": 0.14849429577589035,
|
|
"num_tokens": 18841298.0,
|
|
"step": 10205
|
|
},
|
|
{
|
|
"entropy": 5.7065764427185055,
|
|
"epoch": 0.8578029825666876,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004932734849684022,
|
|
"loss": 5.5663,
|
|
"mean_token_accuracy": 0.15466026067733765,
|
|
"num_tokens": 18849683.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 5.744755029678345,
|
|
"epoch": 0.8582230623818525,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004932662153650712,
|
|
"loss": 5.5082,
|
|
"mean_token_accuracy": 0.15981326550245284,
|
|
"num_tokens": 18858832.0,
|
|
"step": 10215
|
|
},
|
|
{
|
|
"entropy": 5.647493553161621,
|
|
"epoch": 0.8586431421970174,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004932589418952668,
|
|
"loss": 5.5438,
|
|
"mean_token_accuracy": 0.15799610018730165,
|
|
"num_tokens": 18867652.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 5.78511266708374,
|
|
"epoch": 0.8590632220121823,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004932516645591175,
|
|
"loss": 5.6315,
|
|
"mean_token_accuracy": 0.1554282858967781,
|
|
"num_tokens": 18877282.0,
|
|
"step": 10225
|
|
},
|
|
{
|
|
"entropy": 5.833698844909668,
|
|
"epoch": 0.8594833018273472,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004932443833567524,
|
|
"loss": 5.7462,
|
|
"mean_token_accuracy": 0.1505351722240448,
|
|
"num_tokens": 18886565.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 5.777234220504761,
|
|
"epoch": 0.8599033816425121,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004932370982883003,
|
|
"loss": 5.6656,
|
|
"mean_token_accuracy": 0.15549270063638687,
|
|
"num_tokens": 18896440.0,
|
|
"step": 10235
|
|
},
|
|
{
|
|
"entropy": 5.8239048480987545,
|
|
"epoch": 0.8603234614576769,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004932298093538905,
|
|
"loss": 5.6887,
|
|
"mean_token_accuracy": 0.15299588292837143,
|
|
"num_tokens": 18906246.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 5.746791028976441,
|
|
"epoch": 0.8607435412728418,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000493222516553652,
|
|
"loss": 5.5925,
|
|
"mean_token_accuracy": 0.1533835083246231,
|
|
"num_tokens": 18915108.0,
|
|
"step": 10245
|
|
},
|
|
{
|
|
"entropy": 5.781469821929932,
|
|
"epoch": 0.8611636210880067,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004932152198877139,
|
|
"loss": 5.6,
|
|
"mean_token_accuracy": 0.15372219830751419,
|
|
"num_tokens": 18923664.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 5.7778332233428955,
|
|
"epoch": 0.8615837009031716,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004932079193562057,
|
|
"loss": 5.697,
|
|
"mean_token_accuracy": 0.15252179205417632,
|
|
"num_tokens": 18933496.0,
|
|
"step": 10255
|
|
},
|
|
{
|
|
"entropy": 5.733058881759644,
|
|
"epoch": 0.8620037807183365,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004932006149592564,
|
|
"loss": 5.5788,
|
|
"mean_token_accuracy": 0.15552108436822892,
|
|
"num_tokens": 18942222.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 5.810169363021851,
|
|
"epoch": 0.8624238605335014,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004931933066969957,
|
|
"loss": 5.5888,
|
|
"mean_token_accuracy": 0.15849068462848664,
|
|
"num_tokens": 18952057.0,
|
|
"step": 10265
|
|
},
|
|
{
|
|
"entropy": 5.738401651382446,
|
|
"epoch": 0.8628439403486663,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004931859945695528,
|
|
"loss": 5.6356,
|
|
"mean_token_accuracy": 0.15441264659166337,
|
|
"num_tokens": 18961664.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 5.665639925003052,
|
|
"epoch": 0.8632640201638311,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004931786785770575,
|
|
"loss": 5.429,
|
|
"mean_token_accuracy": 0.16940231174230574,
|
|
"num_tokens": 18969900.0,
|
|
"step": 10275
|
|
},
|
|
{
|
|
"entropy": 5.793166017532348,
|
|
"epoch": 0.863684099978996,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004931713587196392,
|
|
"loss": 5.7206,
|
|
"mean_token_accuracy": 0.1475231796503067,
|
|
"num_tokens": 18979286.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 5.855304002761841,
|
|
"epoch": 0.8641041797941609,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004931640349974275,
|
|
"loss": 5.603,
|
|
"mean_token_accuracy": 0.1532246984541416,
|
|
"num_tokens": 18987553.0,
|
|
"step": 10285
|
|
},
|
|
{
|
|
"entropy": 5.77991795539856,
|
|
"epoch": 0.8645242596093258,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004931567074105524,
|
|
"loss": 5.6872,
|
|
"mean_token_accuracy": 0.15210114121437074,
|
|
"num_tokens": 18996354.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 5.688443899154663,
|
|
"epoch": 0.8649443394244907,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 0.0004931493759591435,
|
|
"loss": 5.5749,
|
|
"mean_token_accuracy": 0.15452788174152374,
|
|
"num_tokens": 19005150.0,
|
|
"step": 10295
|
|
},
|
|
{
|
|
"entropy": 5.801825380325317,
|
|
"epoch": 0.8653644192396556,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004931420406433308,
|
|
"loss": 5.5793,
|
|
"mean_token_accuracy": 0.15020548403263093,
|
|
"num_tokens": 19014572.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 5.703862047195434,
|
|
"epoch": 0.8657844990548205,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000493134701463244,
|
|
"loss": 5.4508,
|
|
"mean_token_accuracy": 0.16280461698770524,
|
|
"num_tokens": 19023462.0,
|
|
"step": 10305
|
|
},
|
|
{
|
|
"entropy": 5.649288606643677,
|
|
"epoch": 0.8662045788699853,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004931273584190135,
|
|
"loss": 5.5405,
|
|
"mean_token_accuracy": 0.15991990268230438,
|
|
"num_tokens": 19032460.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 5.731163692474365,
|
|
"epoch": 0.8666246586851502,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004931200115107691,
|
|
"loss": 5.579,
|
|
"mean_token_accuracy": 0.16041069328784943,
|
|
"num_tokens": 19041734.0,
|
|
"step": 10315
|
|
},
|
|
{
|
|
"entropy": 5.697036027908325,
|
|
"epoch": 0.867044738500315,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000493112660738641,
|
|
"loss": 5.5608,
|
|
"mean_token_accuracy": 0.15314172506332396,
|
|
"num_tokens": 19050867.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 5.708456945419312,
|
|
"epoch": 0.86746481831548,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004931053061027594,
|
|
"loss": 5.5539,
|
|
"mean_token_accuracy": 0.15272417664527893,
|
|
"num_tokens": 19060518.0,
|
|
"step": 10325
|
|
},
|
|
{
|
|
"entropy": 5.742541694641114,
|
|
"epoch": 0.8678848981306448,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004930979476032546,
|
|
"loss": 5.5539,
|
|
"mean_token_accuracy": 0.15664585381746293,
|
|
"num_tokens": 19069588.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 5.725212717056275,
|
|
"epoch": 0.8683049779458097,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000493090585240257,
|
|
"loss": 5.6005,
|
|
"mean_token_accuracy": 0.14247507825493813,
|
|
"num_tokens": 19079060.0,
|
|
"step": 10335
|
|
},
|
|
{
|
|
"entropy": 5.6803240299224855,
|
|
"epoch": 0.8687250577609746,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004930832190138969,
|
|
"loss": 5.533,
|
|
"mean_token_accuracy": 0.15190561562776567,
|
|
"num_tokens": 19087721.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 5.769875383377075,
|
|
"epoch": 0.8691451375761394,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000493075848924305,
|
|
"loss": 5.5676,
|
|
"mean_token_accuracy": 0.1551969662308693,
|
|
"num_tokens": 19096800.0,
|
|
"step": 10345
|
|
},
|
|
{
|
|
"entropy": 5.790397357940674,
|
|
"epoch": 0.8695652173913043,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004930684749716117,
|
|
"loss": 5.6411,
|
|
"mean_token_accuracy": 0.15215054303407669,
|
|
"num_tokens": 19106774.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 5.751374912261963,
|
|
"epoch": 0.8699852972064692,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004930610971559476,
|
|
"loss": 5.5861,
|
|
"mean_token_accuracy": 0.1551279380917549,
|
|
"num_tokens": 19116413.0,
|
|
"step": 10355
|
|
},
|
|
{
|
|
"entropy": 5.739291095733643,
|
|
"epoch": 0.8704053770216341,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004930537154774436,
|
|
"loss": 5.6015,
|
|
"mean_token_accuracy": 0.15086202025413514,
|
|
"num_tokens": 19125363.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 5.794745826721192,
|
|
"epoch": 0.870825456836799,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004930463299362302,
|
|
"loss": 5.6984,
|
|
"mean_token_accuracy": 0.14360912814736365,
|
|
"num_tokens": 19135461.0,
|
|
"step": 10365
|
|
},
|
|
{
|
|
"entropy": 5.806246614456176,
|
|
"epoch": 0.8712455366519639,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004930389405324383,
|
|
"loss": 5.5582,
|
|
"mean_token_accuracy": 0.16600679904222487,
|
|
"num_tokens": 19144085.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 5.762925720214843,
|
|
"epoch": 0.8716656164671287,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004930315472661987,
|
|
"loss": 5.5741,
|
|
"mean_token_accuracy": 0.15904655829071998,
|
|
"num_tokens": 19153291.0,
|
|
"step": 10375
|
|
},
|
|
{
|
|
"entropy": 5.732652473449707,
|
|
"epoch": 0.8720856962822936,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004930241501376428,
|
|
"loss": 5.5947,
|
|
"mean_token_accuracy": 0.15122335851192475,
|
|
"num_tokens": 19163514.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 5.602568197250366,
|
|
"epoch": 0.8725057760974585,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004930167491469013,
|
|
"loss": 5.4792,
|
|
"mean_token_accuracy": 0.1624978721141815,
|
|
"num_tokens": 19172103.0,
|
|
"step": 10385
|
|
},
|
|
{
|
|
"entropy": 5.75473918914795,
|
|
"epoch": 0.8729258559126234,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004930093442941053,
|
|
"loss": 5.5509,
|
|
"mean_token_accuracy": 0.15365159437060355,
|
|
"num_tokens": 19180893.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 5.764384841918945,
|
|
"epoch": 0.8733459357277883,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004930019355793858,
|
|
"loss": 5.4714,
|
|
"mean_token_accuracy": 0.1572717860341072,
|
|
"num_tokens": 19190495.0,
|
|
"step": 10395
|
|
},
|
|
{
|
|
"entropy": 5.709274530410767,
|
|
"epoch": 0.8737660155429532,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004929945230028746,
|
|
"loss": 5.5633,
|
|
"mean_token_accuracy": 0.16117294877767563,
|
|
"num_tokens": 19198988.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 5.656596994400024,
|
|
"epoch": 0.8741860953581181,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004929871065647024,
|
|
"loss": 5.4723,
|
|
"mean_token_accuracy": 0.1623318910598755,
|
|
"num_tokens": 19208014.0,
|
|
"step": 10405
|
|
},
|
|
{
|
|
"entropy": 5.754249525070191,
|
|
"epoch": 0.8746061751732829,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004929796862650011,
|
|
"loss": 5.6686,
|
|
"mean_token_accuracy": 0.15798502415418625,
|
|
"num_tokens": 19218220.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 5.750339126586914,
|
|
"epoch": 0.8750262549884478,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004929722621039018,
|
|
"loss": 5.5613,
|
|
"mean_token_accuracy": 0.1570570647716522,
|
|
"num_tokens": 19227176.0,
|
|
"step": 10415
|
|
},
|
|
{
|
|
"entropy": 5.721258115768433,
|
|
"epoch": 0.8754463348036127,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004929648340815362,
|
|
"loss": 5.5929,
|
|
"mean_token_accuracy": 0.15091799348592758,
|
|
"num_tokens": 19236085.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 5.767314195632935,
|
|
"epoch": 0.8758664146187776,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004929574021980355,
|
|
"loss": 5.643,
|
|
"mean_token_accuracy": 0.1486381933093071,
|
|
"num_tokens": 19246671.0,
|
|
"step": 10425
|
|
},
|
|
{
|
|
"entropy": 5.76701602935791,
|
|
"epoch": 0.8762864944339425,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004929499664535319,
|
|
"loss": 5.5492,
|
|
"mean_token_accuracy": 0.15346565693616868,
|
|
"num_tokens": 19256321.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 5.763290786743164,
|
|
"epoch": 0.8767065742491074,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004929425268481569,
|
|
"loss": 5.5126,
|
|
"mean_token_accuracy": 0.1608709618449211,
|
|
"num_tokens": 19265518.0,
|
|
"step": 10435
|
|
},
|
|
{
|
|
"entropy": 5.718894052505493,
|
|
"epoch": 0.8771266540642723,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004929350833820422,
|
|
"loss": 5.5147,
|
|
"mean_token_accuracy": 0.15873141810297967,
|
|
"num_tokens": 19274120.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 5.731625127792358,
|
|
"epoch": 0.877546733879437,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004929276360553197,
|
|
"loss": 5.5882,
|
|
"mean_token_accuracy": 0.16043589189648627,
|
|
"num_tokens": 19284377.0,
|
|
"step": 10445
|
|
},
|
|
{
|
|
"entropy": 5.711872720718384,
|
|
"epoch": 0.8779668136946019,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004929201848681213,
|
|
"loss": 5.4576,
|
|
"mean_token_accuracy": 0.15541169792413712,
|
|
"num_tokens": 19293326.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 5.690513658523559,
|
|
"epoch": 0.8783868935097668,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004929127298205792,
|
|
"loss": 5.5079,
|
|
"mean_token_accuracy": 0.1659105733036995,
|
|
"num_tokens": 19302086.0,
|
|
"step": 10455
|
|
},
|
|
{
|
|
"entropy": 5.804715394973755,
|
|
"epoch": 0.8788069733249317,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004929052709128251,
|
|
"loss": 5.5488,
|
|
"mean_token_accuracy": 0.1627936765551567,
|
|
"num_tokens": 19310124.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 5.633396434783935,
|
|
"epoch": 0.8792270531400966,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004928978081449914,
|
|
"loss": 5.5709,
|
|
"mean_token_accuracy": 0.15216370820999145,
|
|
"num_tokens": 19321269.0,
|
|
"step": 10465
|
|
},
|
|
{
|
|
"entropy": 5.696399784088134,
|
|
"epoch": 0.8796471329552615,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004928903415172103,
|
|
"loss": 5.5728,
|
|
"mean_token_accuracy": 0.15912040174007416,
|
|
"num_tokens": 19330390.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 5.818605709075928,
|
|
"epoch": 0.8800672127704264,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.000492882871029614,
|
|
"loss": 5.5743,
|
|
"mean_token_accuracy": 0.15722174644470216,
|
|
"num_tokens": 19339457.0,
|
|
"step": 10475
|
|
},
|
|
{
|
|
"entropy": 5.749679517745972,
|
|
"epoch": 0.8804872925855912,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004928753966823348,
|
|
"loss": 5.638,
|
|
"mean_token_accuracy": 0.15191923528909684,
|
|
"num_tokens": 19348710.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 5.747959899902344,
|
|
"epoch": 0.8809073724007561,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004928679184755051,
|
|
"loss": 5.6689,
|
|
"mean_token_accuracy": 0.15637236088514328,
|
|
"num_tokens": 19357215.0,
|
|
"step": 10485
|
|
},
|
|
{
|
|
"entropy": 5.747460222244262,
|
|
"epoch": 0.881327452215921,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004928604364092574,
|
|
"loss": 5.6071,
|
|
"mean_token_accuracy": 0.15696584284305573,
|
|
"num_tokens": 19366043.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 5.8075761795043945,
|
|
"epoch": 0.8817475320310859,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004928529504837243,
|
|
"loss": 5.6882,
|
|
"mean_token_accuracy": 0.15294934064149857,
|
|
"num_tokens": 19375468.0,
|
|
"step": 10495
|
|
},
|
|
{
|
|
"entropy": 5.845993375778198,
|
|
"epoch": 0.8821676118462508,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004928454606990383,
|
|
"loss": 5.5475,
|
|
"mean_token_accuracy": 0.16165847033262254,
|
|
"num_tokens": 19384467.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 5.70394549369812,
|
|
"epoch": 0.8825876916614157,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004928379670553322,
|
|
"loss": 5.5885,
|
|
"mean_token_accuracy": 0.15876393169164657,
|
|
"num_tokens": 19393618.0,
|
|
"step": 10505
|
|
},
|
|
{
|
|
"entropy": 5.758576488494873,
|
|
"epoch": 0.8830077714765806,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004928304695527387,
|
|
"loss": 5.6432,
|
|
"mean_token_accuracy": 0.15267120897769929,
|
|
"num_tokens": 19402921.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 5.864232301712036,
|
|
"epoch": 0.8834278512917454,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004928229681913905,
|
|
"loss": 5.6261,
|
|
"mean_token_accuracy": 0.15496253222227097,
|
|
"num_tokens": 19412048.0,
|
|
"step": 10515
|
|
},
|
|
{
|
|
"entropy": 5.862086200714112,
|
|
"epoch": 0.8838479311069103,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 0.0004928154629714207,
|
|
"loss": 5.6081,
|
|
"mean_token_accuracy": 0.15387734174728393,
|
|
"num_tokens": 19420993.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 5.727069139480591,
|
|
"epoch": 0.8842680109220752,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.000492807953892962,
|
|
"loss": 5.5841,
|
|
"mean_token_accuracy": 0.15330443829298018,
|
|
"num_tokens": 19430145.0,
|
|
"step": 10525
|
|
},
|
|
{
|
|
"entropy": 5.723509407043457,
|
|
"epoch": 0.8846880907372401,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004928004409561476,
|
|
"loss": 5.4892,
|
|
"mean_token_accuracy": 0.15867023319005966,
|
|
"num_tokens": 19438918.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 5.691130256652832,
|
|
"epoch": 0.885108170552405,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004927929241611106,
|
|
"loss": 5.5303,
|
|
"mean_token_accuracy": 0.1610460638999939,
|
|
"num_tokens": 19448490.0,
|
|
"step": 10535
|
|
},
|
|
{
|
|
"entropy": 5.709879350662232,
|
|
"epoch": 0.8855282503675699,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000492785403507984,
|
|
"loss": 5.6012,
|
|
"mean_token_accuracy": 0.1556025877594948,
|
|
"num_tokens": 19457098.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 5.761733865737915,
|
|
"epoch": 0.8859483301827347,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004927778789969012,
|
|
"loss": 5.5863,
|
|
"mean_token_accuracy": 0.15728465467691422,
|
|
"num_tokens": 19466419.0,
|
|
"step": 10545
|
|
},
|
|
{
|
|
"entropy": 5.740839338302612,
|
|
"epoch": 0.8863684099978996,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004927703506279955,
|
|
"loss": 5.6421,
|
|
"mean_token_accuracy": 0.14617049992084502,
|
|
"num_tokens": 19475882.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 5.88862247467041,
|
|
"epoch": 0.8867884898130645,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004927628184014,
|
|
"loss": 5.6836,
|
|
"mean_token_accuracy": 0.15036097317934036,
|
|
"num_tokens": 19485917.0,
|
|
"step": 10555
|
|
},
|
|
{
|
|
"entropy": 5.807638216018677,
|
|
"epoch": 0.8872085696282294,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004927552823172483,
|
|
"loss": 5.608,
|
|
"mean_token_accuracy": 0.1534525066614151,
|
|
"num_tokens": 19494984.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 5.803097820281982,
|
|
"epoch": 0.8876286494433943,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000492747742375674,
|
|
"loss": 5.5521,
|
|
"mean_token_accuracy": 0.16029339879751206,
|
|
"num_tokens": 19504087.0,
|
|
"step": 10565
|
|
},
|
|
{
|
|
"entropy": 5.809068632125855,
|
|
"epoch": 0.8880487292585592,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004927401985768106,
|
|
"loss": 5.6142,
|
|
"mean_token_accuracy": 0.15856605321168898,
|
|
"num_tokens": 19512880.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 5.732918643951416,
|
|
"epoch": 0.888468809073724,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004927326509207915,
|
|
"loss": 5.5741,
|
|
"mean_token_accuracy": 0.1594431221485138,
|
|
"num_tokens": 19521723.0,
|
|
"step": 10575
|
|
},
|
|
{
|
|
"entropy": 5.782747840881347,
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004927250994077508,
|
|
"loss": 5.66,
|
|
"mean_token_accuracy": 0.15072188079357146,
|
|
"num_tokens": 19531352.0,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"entropy": 5.858024024963379,
|
|
"epoch": 0.8893089687040537,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.000492717544037822,
|
|
"loss": 5.7545,
|
|
"mean_token_accuracy": 0.15927736610174179,
|
|
"num_tokens": 19540943.0,
|
|
"step": 10585
|
|
},
|
|
{
|
|
"entropy": 5.770633697509766,
|
|
"epoch": 0.8897290485192186,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000492709984811139,
|
|
"loss": 5.5227,
|
|
"mean_token_accuracy": 0.1598847970366478,
|
|
"num_tokens": 19550527.0,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"entropy": 5.72091121673584,
|
|
"epoch": 0.8901491283343835,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004927024217278358,
|
|
"loss": 5.5219,
|
|
"mean_token_accuracy": 0.16189746409654618,
|
|
"num_tokens": 19559746.0,
|
|
"step": 10595
|
|
},
|
|
{
|
|
"entropy": 5.759682607650757,
|
|
"epoch": 0.8905692081495484,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004926948547880462,
|
|
"loss": 5.6816,
|
|
"mean_token_accuracy": 0.14713766053318977,
|
|
"num_tokens": 19569286.0,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"entropy": 5.684707307815552,
|
|
"epoch": 0.8909892879647133,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004926872839919044,
|
|
"loss": 5.5681,
|
|
"mean_token_accuracy": 0.15598509460687637,
|
|
"num_tokens": 19578245.0,
|
|
"step": 10605
|
|
},
|
|
{
|
|
"entropy": 5.722670841217041,
|
|
"epoch": 0.8914093677798782,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004926797093395446,
|
|
"loss": 5.5325,
|
|
"mean_token_accuracy": 0.16016458123922347,
|
|
"num_tokens": 19587244.0,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"entropy": 5.762173748016357,
|
|
"epoch": 0.891829447595043,
|
|
"grad_norm": 2.953125,
|
|
"learning_rate": 0.0004926721308311006,
|
|
"loss": 5.615,
|
|
"mean_token_accuracy": 0.15994844064116479,
|
|
"num_tokens": 19596932.0,
|
|
"step": 10615
|
|
},
|
|
{
|
|
"entropy": 5.879995727539063,
|
|
"epoch": 0.8922495274102079,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004926645484667069,
|
|
"loss": 5.7186,
|
|
"mean_token_accuracy": 0.14976566582918166,
|
|
"num_tokens": 19606256.0,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"entropy": 5.882073593139649,
|
|
"epoch": 0.8926696072253728,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004926569622464979,
|
|
"loss": 5.7089,
|
|
"mean_token_accuracy": 0.15212067142128943,
|
|
"num_tokens": 19615726.0,
|
|
"step": 10625
|
|
},
|
|
{
|
|
"entropy": 5.8041211605072025,
|
|
"epoch": 0.8930896870405377,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0004926493721706079,
|
|
"loss": 5.5764,
|
|
"mean_token_accuracy": 0.1547590583562851,
|
|
"num_tokens": 19624037.0,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"entropy": 5.756782007217407,
|
|
"epoch": 0.8935097668557026,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004926417782391713,
|
|
"loss": 5.5781,
|
|
"mean_token_accuracy": 0.16269729286432266,
|
|
"num_tokens": 19632882.0,
|
|
"step": 10635
|
|
},
|
|
{
|
|
"entropy": 5.793702459335327,
|
|
"epoch": 0.8939298466708675,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004926341804523227,
|
|
"loss": 5.6828,
|
|
"mean_token_accuracy": 0.15286366492509842,
|
|
"num_tokens": 19642686.0,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"entropy": 5.759325933456421,
|
|
"epoch": 0.8943499264860324,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004926265788101966,
|
|
"loss": 5.5821,
|
|
"mean_token_accuracy": 0.15535581558942796,
|
|
"num_tokens": 19651380.0,
|
|
"step": 10645
|
|
},
|
|
{
|
|
"entropy": 5.718085050582886,
|
|
"epoch": 0.8947700063011972,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004926189733129278,
|
|
"loss": 5.5035,
|
|
"mean_token_accuracy": 0.15965501517057418,
|
|
"num_tokens": 19660136.0,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"entropy": 5.696755981445312,
|
|
"epoch": 0.8951900861163621,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004926113639606509,
|
|
"loss": 5.5569,
|
|
"mean_token_accuracy": 0.16951919198036194,
|
|
"num_tokens": 19669146.0,
|
|
"step": 10655
|
|
},
|
|
{
|
|
"entropy": 5.8493866443634035,
|
|
"epoch": 0.895610165931527,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004926037507535008,
|
|
"loss": 5.6893,
|
|
"mean_token_accuracy": 0.15577448457479476,
|
|
"num_tokens": 19678627.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 5.76816759109497,
|
|
"epoch": 0.8960302457466919,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004925961336916122,
|
|
"loss": 5.6246,
|
|
"mean_token_accuracy": 0.15917440131306648,
|
|
"num_tokens": 19688033.0,
|
|
"step": 10665
|
|
},
|
|
{
|
|
"entropy": 5.772870635986328,
|
|
"epoch": 0.8964503255618568,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004925885127751202,
|
|
"loss": 5.6191,
|
|
"mean_token_accuracy": 0.15711403042078018,
|
|
"num_tokens": 19696523.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 5.815629243850708,
|
|
"epoch": 0.8968704053770217,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004925808880041596,
|
|
"loss": 5.5466,
|
|
"mean_token_accuracy": 0.1619081273674965,
|
|
"num_tokens": 19706339.0,
|
|
"step": 10675
|
|
},
|
|
{
|
|
"entropy": 5.771422576904297,
|
|
"epoch": 0.8972904851921865,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004925732593788658,
|
|
"loss": 5.5756,
|
|
"mean_token_accuracy": 0.15582350715994836,
|
|
"num_tokens": 19714779.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 5.788242959976197,
|
|
"epoch": 0.8977105650073514,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004925656268993737,
|
|
"loss": 5.6434,
|
|
"mean_token_accuracy": 0.15538930594921113,
|
|
"num_tokens": 19723727.0,
|
|
"step": 10685
|
|
},
|
|
{
|
|
"entropy": 5.679297971725464,
|
|
"epoch": 0.8981306448225163,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004925579905658185,
|
|
"loss": 5.6078,
|
|
"mean_token_accuracy": 0.15833698809146882,
|
|
"num_tokens": 19732783.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 5.815406656265258,
|
|
"epoch": 0.8985507246376812,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004925503503783355,
|
|
"loss": 5.5923,
|
|
"mean_token_accuracy": 0.14969452172517778,
|
|
"num_tokens": 19741268.0,
|
|
"step": 10695
|
|
},
|
|
{
|
|
"entropy": 5.8421392917633055,
|
|
"epoch": 0.898970804452846,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004925427063370601,
|
|
"loss": 5.5229,
|
|
"mean_token_accuracy": 0.1585152953863144,
|
|
"num_tokens": 19751490.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 5.75557165145874,
|
|
"epoch": 0.899390884268011,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004925350584421278,
|
|
"loss": 5.5722,
|
|
"mean_token_accuracy": 0.15308883041143417,
|
|
"num_tokens": 19760487.0,
|
|
"step": 10705
|
|
},
|
|
{
|
|
"entropy": 5.820067501068115,
|
|
"epoch": 0.8998109640831758,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004925274066936738,
|
|
"loss": 5.5441,
|
|
"mean_token_accuracy": 0.16286628544330597,
|
|
"num_tokens": 19768984.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 5.693412828445434,
|
|
"epoch": 0.9002310438983406,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004925197510918339,
|
|
"loss": 5.5163,
|
|
"mean_token_accuracy": 0.1612228661775589,
|
|
"num_tokens": 19778335.0,
|
|
"step": 10715
|
|
},
|
|
{
|
|
"entropy": 5.740248203277588,
|
|
"epoch": 0.9006511237135055,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004925120916367435,
|
|
"loss": 5.66,
|
|
"mean_token_accuracy": 0.14562905877828597,
|
|
"num_tokens": 19789082.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 5.676235198974609,
|
|
"epoch": 0.9010712035286704,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004925044283285384,
|
|
"loss": 5.3958,
|
|
"mean_token_accuracy": 0.17226272374391555,
|
|
"num_tokens": 19797902.0,
|
|
"step": 10725
|
|
},
|
|
{
|
|
"entropy": 5.674381303787231,
|
|
"epoch": 0.9014912833438353,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004924967611673544,
|
|
"loss": 5.567,
|
|
"mean_token_accuracy": 0.15973830968141556,
|
|
"num_tokens": 19806481.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 5.625586986541748,
|
|
"epoch": 0.9019113631590002,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004924890901533273,
|
|
"loss": 5.4518,
|
|
"mean_token_accuracy": 0.16687363982200623,
|
|
"num_tokens": 19815226.0,
|
|
"step": 10735
|
|
},
|
|
{
|
|
"entropy": 5.865736722946167,
|
|
"epoch": 0.9023314429741651,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004924814152865929,
|
|
"loss": 5.6794,
|
|
"mean_token_accuracy": 0.14995559379458429,
|
|
"num_tokens": 19824577.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 5.814121675491333,
|
|
"epoch": 0.90275152278933,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004924737365672873,
|
|
"loss": 5.5908,
|
|
"mean_token_accuracy": 0.15056767463684081,
|
|
"num_tokens": 19832936.0,
|
|
"step": 10745
|
|
},
|
|
{
|
|
"entropy": 5.820723390579223,
|
|
"epoch": 0.9031716026044948,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004924660539955463,
|
|
"loss": 5.7351,
|
|
"mean_token_accuracy": 0.15998328030109404,
|
|
"num_tokens": 19841946.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 5.750902080535889,
|
|
"epoch": 0.9035916824196597,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004924583675715063,
|
|
"loss": 5.6077,
|
|
"mean_token_accuracy": 0.15404654592275618,
|
|
"num_tokens": 19851469.0,
|
|
"step": 10755
|
|
},
|
|
{
|
|
"entropy": 5.799461030960083,
|
|
"epoch": 0.9040117622348246,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004924506772953031,
|
|
"loss": 5.678,
|
|
"mean_token_accuracy": 0.15529222413897514,
|
|
"num_tokens": 19860731.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 5.758323049545288,
|
|
"epoch": 0.9044318420499895,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004924429831670733,
|
|
"loss": 5.6852,
|
|
"mean_token_accuracy": 0.14765787720680237,
|
|
"num_tokens": 19869717.0,
|
|
"step": 10765
|
|
},
|
|
{
|
|
"entropy": 5.825065422058105,
|
|
"epoch": 0.9048519218651544,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000492435285186953,
|
|
"loss": 5.6377,
|
|
"mean_token_accuracy": 0.15890030115842818,
|
|
"num_tokens": 19879229.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 5.873213052749634,
|
|
"epoch": 0.9052720016803193,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004924275833550785,
|
|
"loss": 5.6228,
|
|
"mean_token_accuracy": 0.1515662133693695,
|
|
"num_tokens": 19888260.0,
|
|
"step": 10775
|
|
},
|
|
{
|
|
"entropy": 5.827171325683594,
|
|
"epoch": 0.9056920814954842,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004924198776715865,
|
|
"loss": 5.6436,
|
|
"mean_token_accuracy": 0.16024628281593323,
|
|
"num_tokens": 19897070.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 5.7876802444458,
|
|
"epoch": 0.906112161310649,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004924121681366132,
|
|
"loss": 5.6284,
|
|
"mean_token_accuracy": 0.15037994906306268,
|
|
"num_tokens": 19907170.0,
|
|
"step": 10785
|
|
},
|
|
{
|
|
"entropy": 5.804350471496582,
|
|
"epoch": 0.9065322411258139,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004924044547502951,
|
|
"loss": 5.5682,
|
|
"mean_token_accuracy": 0.1583652213215828,
|
|
"num_tokens": 19917220.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 5.744189023971558,
|
|
"epoch": 0.9069523209409788,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004923967375127692,
|
|
"loss": 5.6334,
|
|
"mean_token_accuracy": 0.15887839794158937,
|
|
"num_tokens": 19926724.0,
|
|
"step": 10795
|
|
},
|
|
{
|
|
"entropy": 5.845341348648072,
|
|
"epoch": 0.9073724007561437,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.000492389016424172,
|
|
"loss": 5.7404,
|
|
"mean_token_accuracy": 0.15144012570381166,
|
|
"num_tokens": 19936429.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 5.758127307891845,
|
|
"epoch": 0.9077924805713086,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004923812914846404,
|
|
"loss": 5.5099,
|
|
"mean_token_accuracy": 0.15872399806976317,
|
|
"num_tokens": 19945096.0,
|
|
"step": 10805
|
|
},
|
|
{
|
|
"entropy": 5.708646059036255,
|
|
"epoch": 0.9082125603864735,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004923735626943111,
|
|
"loss": 5.5856,
|
|
"mean_token_accuracy": 0.16495954543352126,
|
|
"num_tokens": 19953560.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 5.765500879287719,
|
|
"epoch": 0.9086326402016384,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004923658300533211,
|
|
"loss": 5.5682,
|
|
"mean_token_accuracy": 0.1555124580860138,
|
|
"num_tokens": 19962669.0,
|
|
"step": 10815
|
|
},
|
|
{
|
|
"entropy": 5.802539348602295,
|
|
"epoch": 0.9090527200168032,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004923580935618073,
|
|
"loss": 5.612,
|
|
"mean_token_accuracy": 0.1580589756369591,
|
|
"num_tokens": 19971990.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 5.759839391708374,
|
|
"epoch": 0.909472799831968,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004923503532199069,
|
|
"loss": 5.6108,
|
|
"mean_token_accuracy": 0.15835360288619996,
|
|
"num_tokens": 19981850.0,
|
|
"step": 10825
|
|
},
|
|
{
|
|
"entropy": 5.804291439056397,
|
|
"epoch": 0.909892879647133,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004923426090277567,
|
|
"loss": 5.6433,
|
|
"mean_token_accuracy": 0.15101254507899284,
|
|
"num_tokens": 19991574.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 5.788902282714844,
|
|
"epoch": 0.9103129594622978,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004923348609854943,
|
|
"loss": 5.6121,
|
|
"mean_token_accuracy": 0.16281114518642426,
|
|
"num_tokens": 20001392.0,
|
|
"step": 10835
|
|
},
|
|
{
|
|
"entropy": 5.778925085067749,
|
|
"epoch": 0.9107330392774627,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004923271090932566,
|
|
"loss": 5.6512,
|
|
"mean_token_accuracy": 0.1461693450808525,
|
|
"num_tokens": 20011277.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 5.704980707168579,
|
|
"epoch": 0.9111531190926276,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004923193533511812,
|
|
"loss": 5.5568,
|
|
"mean_token_accuracy": 0.1563573181629181,
|
|
"num_tokens": 20021171.0,
|
|
"step": 10845
|
|
},
|
|
{
|
|
"entropy": 5.873466444015503,
|
|
"epoch": 0.9115731989077924,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004923115937594053,
|
|
"loss": 5.6403,
|
|
"mean_token_accuracy": 0.15872172266244888,
|
|
"num_tokens": 20030189.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 5.826998519897461,
|
|
"epoch": 0.9119932787229573,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004923038303180664,
|
|
"loss": 5.6089,
|
|
"mean_token_accuracy": 0.16154826879501344,
|
|
"num_tokens": 20038287.0,
|
|
"step": 10855
|
|
},
|
|
{
|
|
"entropy": 5.704780101776123,
|
|
"epoch": 0.9124133585381222,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000492296063027302,
|
|
"loss": 5.6242,
|
|
"mean_token_accuracy": 0.1486751489341259,
|
|
"num_tokens": 20047653.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 5.720272779464722,
|
|
"epoch": 0.9128334383532871,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004922882918872498,
|
|
"loss": 5.611,
|
|
"mean_token_accuracy": 0.15257783234119415,
|
|
"num_tokens": 20057415.0,
|
|
"step": 10865
|
|
},
|
|
{
|
|
"entropy": 5.843525409698486,
|
|
"epoch": 0.913253518168452,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004922805168980475,
|
|
"loss": 5.6436,
|
|
"mean_token_accuracy": 0.1583248570561409,
|
|
"num_tokens": 20065996.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 5.765225791931153,
|
|
"epoch": 0.9136735979836169,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004922727380598326,
|
|
"loss": 5.5794,
|
|
"mean_token_accuracy": 0.15503590703010559,
|
|
"num_tokens": 20075376.0,
|
|
"step": 10875
|
|
},
|
|
{
|
|
"entropy": 5.7751857280731205,
|
|
"epoch": 0.9140936777987818,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.000492264955372743,
|
|
"loss": 5.6108,
|
|
"mean_token_accuracy": 0.14910593926906585,
|
|
"num_tokens": 20084950.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 5.851974725723267,
|
|
"epoch": 0.9145137576139466,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004922571688369165,
|
|
"loss": 5.5881,
|
|
"mean_token_accuracy": 0.1583369717001915,
|
|
"num_tokens": 20094011.0,
|
|
"step": 10885
|
|
},
|
|
{
|
|
"entropy": 5.7428583145141605,
|
|
"epoch": 0.9149338374291115,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004922493784524914,
|
|
"loss": 5.56,
|
|
"mean_token_accuracy": 0.1584095723927021,
|
|
"num_tokens": 20103037.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 5.741644382476807,
|
|
"epoch": 0.9153539172442764,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004922415842196052,
|
|
"loss": 5.7116,
|
|
"mean_token_accuracy": 0.14545977264642715,
|
|
"num_tokens": 20112727.0,
|
|
"step": 10895
|
|
},
|
|
{
|
|
"entropy": 5.699281311035156,
|
|
"epoch": 0.9157739970594413,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004922337861383963,
|
|
"loss": 5.522,
|
|
"mean_token_accuracy": 0.1605138972401619,
|
|
"num_tokens": 20122341.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 5.7933906555175785,
|
|
"epoch": 0.9161940768746062,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004922259842090027,
|
|
"loss": 5.5088,
|
|
"mean_token_accuracy": 0.15630880296230315,
|
|
"num_tokens": 20131354.0,
|
|
"step": 10905
|
|
},
|
|
{
|
|
"entropy": 5.752344179153442,
|
|
"epoch": 0.9166141566897711,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004922181784315627,
|
|
"loss": 5.5565,
|
|
"mean_token_accuracy": 0.1608913227915764,
|
|
"num_tokens": 20140440.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 5.673103618621826,
|
|
"epoch": 0.917034236504936,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004922103688062145,
|
|
"loss": 5.556,
|
|
"mean_token_accuracy": 0.1585061579942703,
|
|
"num_tokens": 20149331.0,
|
|
"step": 10915
|
|
},
|
|
{
|
|
"entropy": 5.721803379058838,
|
|
"epoch": 0.9174543163201008,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004922025553330964,
|
|
"loss": 5.5308,
|
|
"mean_token_accuracy": 0.16434049159288405,
|
|
"num_tokens": 20158566.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 5.820386266708374,
|
|
"epoch": 0.9178743961352657,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000492194738012347,
|
|
"loss": 5.6422,
|
|
"mean_token_accuracy": 0.15888736993074418,
|
|
"num_tokens": 20168339.0,
|
|
"step": 10925
|
|
},
|
|
{
|
|
"entropy": 5.8344615459442135,
|
|
"epoch": 0.9182944759504306,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004921869168441045,
|
|
"loss": 5.6482,
|
|
"mean_token_accuracy": 0.15219517126679422,
|
|
"num_tokens": 20177967.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 5.748171138763428,
|
|
"epoch": 0.9187145557655955,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004921790918285077,
|
|
"loss": 5.6339,
|
|
"mean_token_accuracy": 0.1555405542254448,
|
|
"num_tokens": 20187279.0,
|
|
"step": 10935
|
|
},
|
|
{
|
|
"entropy": 5.775131797790527,
|
|
"epoch": 0.9191346355807604,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004921712629656951,
|
|
"loss": 5.7308,
|
|
"mean_token_accuracy": 0.16442956626415253,
|
|
"num_tokens": 20195324.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 5.864813995361328,
|
|
"epoch": 0.9195547153959253,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004921634302558054,
|
|
"loss": 5.6618,
|
|
"mean_token_accuracy": 0.1532442182302475,
|
|
"num_tokens": 20204985.0,
|
|
"step": 10945
|
|
},
|
|
{
|
|
"entropy": 5.7605233669281,
|
|
"epoch": 0.9199747952110902,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004921555936989773,
|
|
"loss": 5.6693,
|
|
"mean_token_accuracy": 0.15000807642936706,
|
|
"num_tokens": 20214553.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 5.815750789642334,
|
|
"epoch": 0.9203948750262549,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004921477532953497,
|
|
"loss": 5.5867,
|
|
"mean_token_accuracy": 0.15734840780496598,
|
|
"num_tokens": 20224118.0,
|
|
"step": 10955
|
|
},
|
|
{
|
|
"entropy": 5.770743799209595,
|
|
"epoch": 0.9208149548414198,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004921399090450616,
|
|
"loss": 5.5348,
|
|
"mean_token_accuracy": 0.15028709322214126,
|
|
"num_tokens": 20233719.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 5.767902183532715,
|
|
"epoch": 0.9212350346565847,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004921320609482517,
|
|
"loss": 5.6305,
|
|
"mean_token_accuracy": 0.15462984144687653,
|
|
"num_tokens": 20242311.0,
|
|
"step": 10965
|
|
},
|
|
{
|
|
"entropy": 5.807157325744629,
|
|
"epoch": 0.9216551144717496,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004921242090050591,
|
|
"loss": 5.6595,
|
|
"mean_token_accuracy": 0.14994974732398986,
|
|
"num_tokens": 20252998.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 5.818349123001099,
|
|
"epoch": 0.9220751942869145,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.000492116353215623,
|
|
"loss": 5.7205,
|
|
"mean_token_accuracy": 0.15557870492339135,
|
|
"num_tokens": 20262456.0,
|
|
"step": 10975
|
|
},
|
|
{
|
|
"entropy": 5.695267009735107,
|
|
"epoch": 0.9224952741020794,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004921084935800825,
|
|
"loss": 5.4788,
|
|
"mean_token_accuracy": 0.16470759660005568,
|
|
"num_tokens": 20271516.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 5.735180997848511,
|
|
"epoch": 0.9229153539172443,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004921006300985768,
|
|
"loss": 5.5278,
|
|
"mean_token_accuracy": 0.1622763454914093,
|
|
"num_tokens": 20280373.0,
|
|
"step": 10985
|
|
},
|
|
{
|
|
"entropy": 5.715144777297974,
|
|
"epoch": 0.9233354337324091,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004920927627712453,
|
|
"loss": 5.5267,
|
|
"mean_token_accuracy": 0.1575745850801468,
|
|
"num_tokens": 20289426.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 5.809565830230713,
|
|
"epoch": 0.923755513547574,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004920848915982273,
|
|
"loss": 5.6718,
|
|
"mean_token_accuracy": 0.15313809663057326,
|
|
"num_tokens": 20298045.0,
|
|
"step": 10995
|
|
},
|
|
{
|
|
"entropy": 5.710923767089843,
|
|
"epoch": 0.9241755933627389,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004920770165796622,
|
|
"loss": 5.5569,
|
|
"mean_token_accuracy": 0.1600003331899643,
|
|
"num_tokens": 20307352.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 5.757216310501098,
|
|
"epoch": 0.9245956731779038,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004920691377156895,
|
|
"loss": 5.5865,
|
|
"mean_token_accuracy": 0.15644698292016984,
|
|
"num_tokens": 20316448.0,
|
|
"step": 11005
|
|
},
|
|
{
|
|
"entropy": 5.867019748687744,
|
|
"epoch": 0.9250157529930687,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004920612550064488,
|
|
"loss": 5.7449,
|
|
"mean_token_accuracy": 0.1475832186639309,
|
|
"num_tokens": 20326440.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 5.769907808303833,
|
|
"epoch": 0.9254358328082336,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004920533684520797,
|
|
"loss": 5.5086,
|
|
"mean_token_accuracy": 0.15823858827352524,
|
|
"num_tokens": 20335447.0,
|
|
"step": 11015
|
|
},
|
|
{
|
|
"entropy": 5.750536823272705,
|
|
"epoch": 0.9258559126233984,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000492045478052722,
|
|
"loss": 5.6596,
|
|
"mean_token_accuracy": 0.15351206958293914,
|
|
"num_tokens": 20344523.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 5.741793203353882,
|
|
"epoch": 0.9262759924385633,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004920375838085154,
|
|
"loss": 5.6171,
|
|
"mean_token_accuracy": 0.1559000790119171,
|
|
"num_tokens": 20354267.0,
|
|
"step": 11025
|
|
},
|
|
{
|
|
"entropy": 5.798118543624878,
|
|
"epoch": 0.9266960722537282,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004920296857195998,
|
|
"loss": 5.6771,
|
|
"mean_token_accuracy": 0.15482696294784545,
|
|
"num_tokens": 20364137.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 5.799237871170044,
|
|
"epoch": 0.9271161520688931,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000492021783786115,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.16075632423162461,
|
|
"num_tokens": 20372583.0,
|
|
"step": 11035
|
|
},
|
|
{
|
|
"entropy": 5.6686241149902346,
|
|
"epoch": 0.927536231884058,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004920138780082011,
|
|
"loss": 5.5397,
|
|
"mean_token_accuracy": 0.15648741349577905,
|
|
"num_tokens": 20382050.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 5.725726461410522,
|
|
"epoch": 0.9279563116992229,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004920059683859981,
|
|
"loss": 5.4955,
|
|
"mean_token_accuracy": 0.1606592372059822,
|
|
"num_tokens": 20391425.0,
|
|
"step": 11045
|
|
},
|
|
{
|
|
"entropy": 5.798936271667481,
|
|
"epoch": 0.9283763915143878,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004919980549196461,
|
|
"loss": 5.6647,
|
|
"mean_token_accuracy": 0.15349570661783218,
|
|
"num_tokens": 20400559.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 5.767499828338623,
|
|
"epoch": 0.9287964713295526,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004919901376092853,
|
|
"loss": 5.5783,
|
|
"mean_token_accuracy": 0.16081294864416124,
|
|
"num_tokens": 20408985.0,
|
|
"step": 11055
|
|
},
|
|
{
|
|
"entropy": 5.7440389633178714,
|
|
"epoch": 0.9292165511447175,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004919822164550559,
|
|
"loss": 5.6773,
|
|
"mean_token_accuracy": 0.14321673214435576,
|
|
"num_tokens": 20417855.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 5.744246864318848,
|
|
"epoch": 0.9296366309598824,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004919742914570983,
|
|
"loss": 5.6304,
|
|
"mean_token_accuracy": 0.1557525396347046,
|
|
"num_tokens": 20426191.0,
|
|
"step": 11065
|
|
},
|
|
{
|
|
"entropy": 5.765243244171143,
|
|
"epoch": 0.9300567107750473,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000491966362615553,
|
|
"loss": 5.6006,
|
|
"mean_token_accuracy": 0.15035101026296616,
|
|
"num_tokens": 20435592.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 5.85240740776062,
|
|
"epoch": 0.9304767905902122,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00049195842993056,
|
|
"loss": 5.634,
|
|
"mean_token_accuracy": 0.15329790860414505,
|
|
"num_tokens": 20445504.0,
|
|
"step": 11075
|
|
},
|
|
{
|
|
"entropy": 5.803719425201416,
|
|
"epoch": 0.930896870405377,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004919504934022604,
|
|
"loss": 5.578,
|
|
"mean_token_accuracy": 0.15457095801830292,
|
|
"num_tokens": 20455153.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 5.7237049579620365,
|
|
"epoch": 0.931316950220542,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004919425530307943,
|
|
"loss": 5.5681,
|
|
"mean_token_accuracy": 0.15656672269105912,
|
|
"num_tokens": 20465101.0,
|
|
"step": 11085
|
|
},
|
|
{
|
|
"entropy": 5.742412662506103,
|
|
"epoch": 0.9317370300357067,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004919346088163028,
|
|
"loss": 5.615,
|
|
"mean_token_accuracy": 0.1582319989800453,
|
|
"num_tokens": 20474700.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 5.835652637481689,
|
|
"epoch": 0.9321571098508716,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004919266607589263,
|
|
"loss": 5.6564,
|
|
"mean_token_accuracy": 0.15037914365530014,
|
|
"num_tokens": 20483945.0,
|
|
"step": 11095
|
|
},
|
|
{
|
|
"entropy": 5.8025538444519045,
|
|
"epoch": 0.9325771896660365,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004919187088588057,
|
|
"loss": 5.6307,
|
|
"mean_token_accuracy": 0.15815725028514863,
|
|
"num_tokens": 20493307.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 5.722408819198608,
|
|
"epoch": 0.9329972694812014,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004919107531160819,
|
|
"loss": 5.5552,
|
|
"mean_token_accuracy": 0.1643086478114128,
|
|
"num_tokens": 20501889.0,
|
|
"step": 11105
|
|
},
|
|
{
|
|
"entropy": 5.729394769668579,
|
|
"epoch": 0.9334173492963663,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004919027935308957,
|
|
"loss": 5.5785,
|
|
"mean_token_accuracy": 0.15731487423181534,
|
|
"num_tokens": 20510577.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 5.651753997802734,
|
|
"epoch": 0.9338374291115312,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004918948301033884,
|
|
"loss": 5.5583,
|
|
"mean_token_accuracy": 0.15677412003278732,
|
|
"num_tokens": 20520025.0,
|
|
"step": 11115
|
|
},
|
|
{
|
|
"entropy": 5.799631404876709,
|
|
"epoch": 0.9342575089266961,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004918868628337007,
|
|
"loss": 5.6042,
|
|
"mean_token_accuracy": 0.15233962684869767,
|
|
"num_tokens": 20528989.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 5.779157257080078,
|
|
"epoch": 0.9346775887418609,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004918788917219739,
|
|
"loss": 5.5609,
|
|
"mean_token_accuracy": 0.15591868460178376,
|
|
"num_tokens": 20538328.0,
|
|
"step": 11125
|
|
},
|
|
{
|
|
"entropy": 5.714973402023316,
|
|
"epoch": 0.9350976685570258,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004918709167683493,
|
|
"loss": 5.686,
|
|
"mean_token_accuracy": 0.15123260617256165,
|
|
"num_tokens": 20548069.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 5.690325927734375,
|
|
"epoch": 0.9355177483721907,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004918629379729681,
|
|
"loss": 5.4379,
|
|
"mean_token_accuracy": 0.16827901899814607,
|
|
"num_tokens": 20557128.0,
|
|
"step": 11135
|
|
},
|
|
{
|
|
"entropy": 5.725959730148316,
|
|
"epoch": 0.9359378281873556,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004918549553359715,
|
|
"loss": 5.5616,
|
|
"mean_token_accuracy": 0.15457266718149185,
|
|
"num_tokens": 20566352.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 5.780063915252685,
|
|
"epoch": 0.9363579080025205,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004918469688575012,
|
|
"loss": 5.6077,
|
|
"mean_token_accuracy": 0.1547131732106209,
|
|
"num_tokens": 20575814.0,
|
|
"step": 11145
|
|
},
|
|
{
|
|
"entropy": 5.752800464630127,
|
|
"epoch": 0.9367779878176854,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004918389785376983,
|
|
"loss": 5.4704,
|
|
"mean_token_accuracy": 0.16297883689403533,
|
|
"num_tokens": 20584715.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 5.691038417816162,
|
|
"epoch": 0.9371980676328503,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004918309843767047,
|
|
"loss": 5.563,
|
|
"mean_token_accuracy": 0.15457476824522018,
|
|
"num_tokens": 20594630.0,
|
|
"step": 11155
|
|
},
|
|
{
|
|
"entropy": 5.705981302261352,
|
|
"epoch": 0.9376181474480151,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004918229863746618,
|
|
"loss": 5.5344,
|
|
"mean_token_accuracy": 0.15329102724790572,
|
|
"num_tokens": 20603653.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 5.809178400039673,
|
|
"epoch": 0.93803822726318,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004918149845317114,
|
|
"loss": 5.6041,
|
|
"mean_token_accuracy": 0.15675780922174454,
|
|
"num_tokens": 20612188.0,
|
|
"step": 11165
|
|
},
|
|
{
|
|
"entropy": 5.743681907653809,
|
|
"epoch": 0.9384583070783449,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004918069788479952,
|
|
"loss": 5.5291,
|
|
"mean_token_accuracy": 0.16411179453134536,
|
|
"num_tokens": 20620933.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 5.689119815826416,
|
|
"epoch": 0.9388783868935098,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004917989693236549,
|
|
"loss": 5.5733,
|
|
"mean_token_accuracy": 0.1595962554216385,
|
|
"num_tokens": 20629919.0,
|
|
"step": 11175
|
|
},
|
|
{
|
|
"entropy": 5.739494895935058,
|
|
"epoch": 0.9392984667086747,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004917909559588326,
|
|
"loss": 5.5402,
|
|
"mean_token_accuracy": 0.1560191825032234,
|
|
"num_tokens": 20638475.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 5.911345434188843,
|
|
"epoch": 0.9397185465238396,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00049178293875367,
|
|
"loss": 5.6769,
|
|
"mean_token_accuracy": 0.1469906136393547,
|
|
"num_tokens": 20648105.0,
|
|
"step": 11185
|
|
},
|
|
{
|
|
"entropy": 5.764797687530518,
|
|
"epoch": 0.9401386263390044,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004917749177083094,
|
|
"loss": 5.5703,
|
|
"mean_token_accuracy": 0.1515391141176224,
|
|
"num_tokens": 20657527.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 5.723624420166016,
|
|
"epoch": 0.9405587061541693,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004917668928228927,
|
|
"loss": 5.5763,
|
|
"mean_token_accuracy": 0.1612919121980667,
|
|
"num_tokens": 20666375.0,
|
|
"step": 11195
|
|
},
|
|
{
|
|
"entropy": 5.723942565917969,
|
|
"epoch": 0.9409787859693342,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004917588640975622,
|
|
"loss": 5.5232,
|
|
"mean_token_accuracy": 0.1613648310303688,
|
|
"num_tokens": 20675350.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 5.659457588195801,
|
|
"epoch": 0.941398865784499,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00049175083153246,
|
|
"loss": 5.4574,
|
|
"mean_token_accuracy": 0.15883690416812896,
|
|
"num_tokens": 20684072.0,
|
|
"step": 11205
|
|
},
|
|
{
|
|
"entropy": 5.678450441360473,
|
|
"epoch": 0.941818945599664,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004917427951277284,
|
|
"loss": 5.5619,
|
|
"mean_token_accuracy": 0.16161370575428008,
|
|
"num_tokens": 20692989.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 5.763214254379273,
|
|
"epoch": 0.9422390254148288,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004917347548835097,
|
|
"loss": 5.5035,
|
|
"mean_token_accuracy": 0.16200087666511537,
|
|
"num_tokens": 20701269.0,
|
|
"step": 11215
|
|
},
|
|
{
|
|
"entropy": 5.759133005142212,
|
|
"epoch": 0.9426591052299937,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004917267107999466,
|
|
"loss": 5.6106,
|
|
"mean_token_accuracy": 0.15289842039346696,
|
|
"num_tokens": 20709739.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 5.739570665359497,
|
|
"epoch": 0.9430791850451585,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004917186628771812,
|
|
"loss": 5.5576,
|
|
"mean_token_accuracy": 0.16139040291309356,
|
|
"num_tokens": 20718950.0,
|
|
"step": 11225
|
|
},
|
|
{
|
|
"entropy": 5.755300760269165,
|
|
"epoch": 0.9434992648603234,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004917106111153565,
|
|
"loss": 5.5673,
|
|
"mean_token_accuracy": 0.1547436758875847,
|
|
"num_tokens": 20729469.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 5.775959253311157,
|
|
"epoch": 0.9439193446754883,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004917025555146148,
|
|
"loss": 5.5744,
|
|
"mean_token_accuracy": 0.1662562906742096,
|
|
"num_tokens": 20738231.0,
|
|
"step": 11235
|
|
},
|
|
{
|
|
"entropy": 5.756017684936523,
|
|
"epoch": 0.9443394244906532,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000491694496075099,
|
|
"loss": 5.7704,
|
|
"mean_token_accuracy": 0.14358580783009528,
|
|
"num_tokens": 20748578.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 5.837352752685547,
|
|
"epoch": 0.9447595043058181,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004916864327969517,
|
|
"loss": 5.7026,
|
|
"mean_token_accuracy": 0.14462515115737914,
|
|
"num_tokens": 20759284.0,
|
|
"step": 11245
|
|
},
|
|
{
|
|
"entropy": 5.8536529541015625,
|
|
"epoch": 0.945179584120983,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004916783656803158,
|
|
"loss": 5.6316,
|
|
"mean_token_accuracy": 0.15945006310939788,
|
|
"num_tokens": 20768186.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 5.695327425003052,
|
|
"epoch": 0.9455996639361479,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004916702947253342,
|
|
"loss": 5.5009,
|
|
"mean_token_accuracy": 0.16529642790555954,
|
|
"num_tokens": 20776711.0,
|
|
"step": 11255
|
|
},
|
|
{
|
|
"entropy": 5.7685582637786865,
|
|
"epoch": 0.9460197437513127,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004916622199321501,
|
|
"loss": 5.5766,
|
|
"mean_token_accuracy": 0.15894216895103455,
|
|
"num_tokens": 20785154.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 5.80894103050232,
|
|
"epoch": 0.9464398235664776,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004916541413009062,
|
|
"loss": 5.5194,
|
|
"mean_token_accuracy": 0.16128009110689162,
|
|
"num_tokens": 20794114.0,
|
|
"step": 11265
|
|
},
|
|
{
|
|
"entropy": 5.7742784976959225,
|
|
"epoch": 0.9468599033816425,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004916460588317458,
|
|
"loss": 5.6258,
|
|
"mean_token_accuracy": 0.14817884638905526,
|
|
"num_tokens": 20803892.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 5.641349744796753,
|
|
"epoch": 0.9472799831968074,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004916379725248118,
|
|
"loss": 5.511,
|
|
"mean_token_accuracy": 0.16303833425045014,
|
|
"num_tokens": 20812892.0,
|
|
"step": 11275
|
|
},
|
|
{
|
|
"entropy": 5.743069410324097,
|
|
"epoch": 0.9477000630119723,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004916298823802479,
|
|
"loss": 5.5676,
|
|
"mean_token_accuracy": 0.1500309720635414,
|
|
"num_tokens": 20821934.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 5.725360774993897,
|
|
"epoch": 0.9481201428271372,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004916217883981971,
|
|
"loss": 5.4977,
|
|
"mean_token_accuracy": 0.15707524865865707,
|
|
"num_tokens": 20830100.0,
|
|
"step": 11285
|
|
},
|
|
{
|
|
"entropy": 5.692885828018189,
|
|
"epoch": 0.9485402226423021,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004916136905788029,
|
|
"loss": 5.5708,
|
|
"mean_token_accuracy": 0.15760752707719802,
|
|
"num_tokens": 20839890.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 5.779399299621582,
|
|
"epoch": 0.9489603024574669,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004916055889222087,
|
|
"loss": 5.6962,
|
|
"mean_token_accuracy": 0.14309152886271476,
|
|
"num_tokens": 20848670.0,
|
|
"step": 11295
|
|
},
|
|
{
|
|
"entropy": 5.753837442398071,
|
|
"epoch": 0.9493803822726318,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.000491597483428558,
|
|
"loss": 5.5372,
|
|
"mean_token_accuracy": 0.16605689823627473,
|
|
"num_tokens": 20857291.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 5.653014183044434,
|
|
"epoch": 0.9498004620877967,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004915893740979944,
|
|
"loss": 5.4998,
|
|
"mean_token_accuracy": 0.16381447315216063,
|
|
"num_tokens": 20865341.0,
|
|
"step": 11305
|
|
},
|
|
{
|
|
"entropy": 5.805274391174317,
|
|
"epoch": 0.9502205419029616,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004915812609306617,
|
|
"loss": 5.6431,
|
|
"mean_token_accuracy": 0.15660493373870848,
|
|
"num_tokens": 20875194.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 5.8244353294372555,
|
|
"epoch": 0.9506406217181265,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004915731439267034,
|
|
"loss": 5.5483,
|
|
"mean_token_accuracy": 0.1535589724779129,
|
|
"num_tokens": 20884831.0,
|
|
"step": 11315
|
|
},
|
|
{
|
|
"entropy": 5.66036376953125,
|
|
"epoch": 0.9510607015332914,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004915650230862634,
|
|
"loss": 5.431,
|
|
"mean_token_accuracy": 0.16502011716365814,
|
|
"num_tokens": 20893790.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 5.638264322280884,
|
|
"epoch": 0.9514807813484563,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004915568984094854,
|
|
"loss": 5.5594,
|
|
"mean_token_accuracy": 0.15512819588184357,
|
|
"num_tokens": 20902175.0,
|
|
"step": 11325
|
|
},
|
|
{
|
|
"entropy": 5.824262189865112,
|
|
"epoch": 0.951900861163621,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004915487698965136,
|
|
"loss": 5.694,
|
|
"mean_token_accuracy": 0.14529131203889847,
|
|
"num_tokens": 20911484.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 5.88162055015564,
|
|
"epoch": 0.952320940978786,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004915406375474917,
|
|
"loss": 5.6445,
|
|
"mean_token_accuracy": 0.14643194004893303,
|
|
"num_tokens": 20920916.0,
|
|
"step": 11335
|
|
},
|
|
{
|
|
"entropy": 5.807900476455688,
|
|
"epoch": 0.9527410207939508,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000491532501362564,
|
|
"loss": 5.6522,
|
|
"mean_token_accuracy": 0.15773532688617706,
|
|
"num_tokens": 20930219.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 5.679394388198853,
|
|
"epoch": 0.9531611006091157,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004915243613418745,
|
|
"loss": 5.482,
|
|
"mean_token_accuracy": 0.16191438734531402,
|
|
"num_tokens": 20939591.0,
|
|
"step": 11345
|
|
},
|
|
{
|
|
"entropy": 5.767440366744995,
|
|
"epoch": 0.9535811804242806,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004915162174855675,
|
|
"loss": 5.6543,
|
|
"mean_token_accuracy": 0.15383701771497726,
|
|
"num_tokens": 20950035.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 5.7626283168792725,
|
|
"epoch": 0.9540012602394455,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004915080697937872,
|
|
"loss": 5.5616,
|
|
"mean_token_accuracy": 0.15663446485996246,
|
|
"num_tokens": 20959168.0,
|
|
"step": 11355
|
|
},
|
|
{
|
|
"entropy": 5.7188849449157715,
|
|
"epoch": 0.9544213400546103,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004914999182666779,
|
|
"loss": 5.4866,
|
|
"mean_token_accuracy": 0.1626068413257599,
|
|
"num_tokens": 20967887.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 5.763808012008667,
|
|
"epoch": 0.9548414198697752,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004914917629043839,
|
|
"loss": 5.5862,
|
|
"mean_token_accuracy": 0.15319453924894333,
|
|
"num_tokens": 20977558.0,
|
|
"step": 11365
|
|
},
|
|
{
|
|
"entropy": 5.650288105010986,
|
|
"epoch": 0.9552614996849401,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00049148360370705,
|
|
"loss": 5.5436,
|
|
"mean_token_accuracy": 0.16097380816936493,
|
|
"num_tokens": 20986118.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 5.754954528808594,
|
|
"epoch": 0.955681579500105,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004914754406748204,
|
|
"loss": 5.4839,
|
|
"mean_token_accuracy": 0.16297108978033065,
|
|
"num_tokens": 20994623.0,
|
|
"step": 11375
|
|
},
|
|
{
|
|
"entropy": 5.77275652885437,
|
|
"epoch": 0.9561016593152699,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00049146727380784,
|
|
"loss": 5.6615,
|
|
"mean_token_accuracy": 0.15102900862693786,
|
|
"num_tokens": 21004193.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 5.690639591217041,
|
|
"epoch": 0.9565217391304348,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004914591031062531,
|
|
"loss": 5.4908,
|
|
"mean_token_accuracy": 0.16743318736553192,
|
|
"num_tokens": 21013125.0,
|
|
"step": 11385
|
|
},
|
|
{
|
|
"entropy": 5.655840444564819,
|
|
"epoch": 0.9569418189455997,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004914509285702048,
|
|
"loss": 5.4135,
|
|
"mean_token_accuracy": 0.16690310835838318,
|
|
"num_tokens": 21021402.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 5.710651922225952,
|
|
"epoch": 0.9573618987607645,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004914427501998397,
|
|
"loss": 5.5028,
|
|
"mean_token_accuracy": 0.15886269211769105,
|
|
"num_tokens": 21029639.0,
|
|
"step": 11395
|
|
},
|
|
{
|
|
"entropy": 5.7445274829864506,
|
|
"epoch": 0.9577819785759294,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004914345679953027,
|
|
"loss": 5.5347,
|
|
"mean_token_accuracy": 0.16092265099287034,
|
|
"num_tokens": 21037525.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 5.767073345184326,
|
|
"epoch": 0.9582020583910943,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004914263819567388,
|
|
"loss": 5.6295,
|
|
"mean_token_accuracy": 0.1497710943222046,
|
|
"num_tokens": 21047702.0,
|
|
"step": 11405
|
|
},
|
|
{
|
|
"entropy": 5.790897989273072,
|
|
"epoch": 0.9586221382062592,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000491418192084293,
|
|
"loss": 5.5474,
|
|
"mean_token_accuracy": 0.16184664964675904,
|
|
"num_tokens": 21056379.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 5.7481053352355955,
|
|
"epoch": 0.9590422180214241,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004914099983781104,
|
|
"loss": 5.5489,
|
|
"mean_token_accuracy": 0.16056970357894898,
|
|
"num_tokens": 21065283.0,
|
|
"step": 11415
|
|
},
|
|
{
|
|
"entropy": 5.769461059570313,
|
|
"epoch": 0.959462297836589,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.000491401800838336,
|
|
"loss": 5.6633,
|
|
"mean_token_accuracy": 0.15242091715335845,
|
|
"num_tokens": 21074938.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 5.7138519287109375,
|
|
"epoch": 0.9598823776517539,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004913935994651153,
|
|
"loss": 5.514,
|
|
"mean_token_accuracy": 0.16224966198205948,
|
|
"num_tokens": 21084729.0,
|
|
"step": 11425
|
|
},
|
|
{
|
|
"entropy": 5.642538785934448,
|
|
"epoch": 0.9603024574669187,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004913853942585932,
|
|
"loss": 5.4117,
|
|
"mean_token_accuracy": 0.16488994657993317,
|
|
"num_tokens": 21093456.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 5.686456680297852,
|
|
"epoch": 0.9607225372820836,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004913771852189155,
|
|
"loss": 5.5451,
|
|
"mean_token_accuracy": 0.15687460005283355,
|
|
"num_tokens": 21102980.0,
|
|
"step": 11435
|
|
},
|
|
{
|
|
"entropy": 5.848186016082764,
|
|
"epoch": 0.9611426170972485,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004913689723462271,
|
|
"loss": 5.7858,
|
|
"mean_token_accuracy": 0.16201310455799103,
|
|
"num_tokens": 21112777.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 5.803880500793457,
|
|
"epoch": 0.9615626969124134,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000491360755640674,
|
|
"loss": 5.6636,
|
|
"mean_token_accuracy": 0.15397086888551711,
|
|
"num_tokens": 21122139.0,
|
|
"step": 11445
|
|
},
|
|
{
|
|
"entropy": 5.7530255794525145,
|
|
"epoch": 0.9619827767275783,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004913525351024014,
|
|
"loss": 5.5361,
|
|
"mean_token_accuracy": 0.15754189491271972,
|
|
"num_tokens": 21131425.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 5.708436107635498,
|
|
"epoch": 0.9624028565427432,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004913443107315552,
|
|
"loss": 5.5081,
|
|
"mean_token_accuracy": 0.15728521049022676,
|
|
"num_tokens": 21140784.0,
|
|
"step": 11455
|
|
},
|
|
{
|
|
"entropy": 5.700329685211182,
|
|
"epoch": 0.962822936357908,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004913360825282807,
|
|
"loss": 5.5271,
|
|
"mean_token_accuracy": 0.16611011624336242,
|
|
"num_tokens": 21150408.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 5.716848659515381,
|
|
"epoch": 0.9632430161730728,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.000491327850492724,
|
|
"loss": 5.6151,
|
|
"mean_token_accuracy": 0.1602442279458046,
|
|
"num_tokens": 21158915.0,
|
|
"step": 11465
|
|
},
|
|
{
|
|
"entropy": 5.630804204940796,
|
|
"epoch": 0.9636630959882377,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004913196146250309,
|
|
"loss": 5.4172,
|
|
"mean_token_accuracy": 0.16883472204208375,
|
|
"num_tokens": 21167336.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 5.830715799331665,
|
|
"epoch": 0.9640831758034026,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004913113749253472,
|
|
"loss": 5.7333,
|
|
"mean_token_accuracy": 0.15114703625440598,
|
|
"num_tokens": 21177499.0,
|
|
"step": 11475
|
|
},
|
|
{
|
|
"entropy": 5.8909022331237795,
|
|
"epoch": 0.9645032556185675,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004913031313938188,
|
|
"loss": 5.6341,
|
|
"mean_token_accuracy": 0.15465849339962007,
|
|
"num_tokens": 21186961.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 5.749186849594116,
|
|
"epoch": 0.9649233354337324,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004912948840305919,
|
|
"loss": 5.5207,
|
|
"mean_token_accuracy": 0.1647267997264862,
|
|
"num_tokens": 21196364.0,
|
|
"step": 11485
|
|
},
|
|
{
|
|
"entropy": 5.710730838775635,
|
|
"epoch": 0.9653434152488973,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004912866328358125,
|
|
"loss": 5.5995,
|
|
"mean_token_accuracy": 0.15677765160799026,
|
|
"num_tokens": 21206376.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 5.779667091369629,
|
|
"epoch": 0.9657634950640621,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004912783778096266,
|
|
"loss": 5.5689,
|
|
"mean_token_accuracy": 0.16532181650400163,
|
|
"num_tokens": 21215889.0,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"entropy": 5.756943035125732,
|
|
"epoch": 0.966183574879227,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004912701189521808,
|
|
"loss": 5.5847,
|
|
"mean_token_accuracy": 0.16167923510074617,
|
|
"num_tokens": 21224959.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 5.824455404281617,
|
|
"epoch": 0.9666036546943919,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004912618562636211,
|
|
"loss": 5.6996,
|
|
"mean_token_accuracy": 0.1518427163362503,
|
|
"num_tokens": 21234495.0,
|
|
"step": 11505
|
|
},
|
|
{
|
|
"entropy": 5.704730606079101,
|
|
"epoch": 0.9670237345095568,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000491253589744094,
|
|
"loss": 5.5344,
|
|
"mean_token_accuracy": 0.1582339495420456,
|
|
"num_tokens": 21244555.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 5.786595106124878,
|
|
"epoch": 0.9674438143247217,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004912453193937459,
|
|
"loss": 5.6929,
|
|
"mean_token_accuracy": 0.1545358881354332,
|
|
"num_tokens": 21254199.0,
|
|
"step": 11515
|
|
},
|
|
{
|
|
"entropy": 5.79626088142395,
|
|
"epoch": 0.9678638941398866,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004912370452127234,
|
|
"loss": 5.5811,
|
|
"mean_token_accuracy": 0.15614356994628906,
|
|
"num_tokens": 21262723.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 5.749001598358154,
|
|
"epoch": 0.9682839739550515,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004912287672011728,
|
|
"loss": 5.498,
|
|
"mean_token_accuracy": 0.16639503091573715,
|
|
"num_tokens": 21271283.0,
|
|
"step": 11525
|
|
},
|
|
{
|
|
"entropy": 5.694228219985962,
|
|
"epoch": 0.9687040537702163,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004912204853592411,
|
|
"loss": 5.5549,
|
|
"mean_token_accuracy": 0.1661546677350998,
|
|
"num_tokens": 21279542.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 5.738241815567017,
|
|
"epoch": 0.9691241335853812,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004912121996870748,
|
|
"loss": 5.5345,
|
|
"mean_token_accuracy": 0.16057475954294204,
|
|
"num_tokens": 21288678.0,
|
|
"step": 11535
|
|
},
|
|
{
|
|
"entropy": 5.781418895721435,
|
|
"epoch": 0.9695442134005461,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004912039101848207,
|
|
"loss": 5.6681,
|
|
"mean_token_accuracy": 0.1558816574513912,
|
|
"num_tokens": 21298982.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 5.759183168411255,
|
|
"epoch": 0.969964293215711,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004911956168526257,
|
|
"loss": 5.609,
|
|
"mean_token_accuracy": 0.1565386489033699,
|
|
"num_tokens": 21307663.0,
|
|
"step": 11545
|
|
},
|
|
{
|
|
"entropy": 5.845695209503174,
|
|
"epoch": 0.9703843730308759,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004911873196906366,
|
|
"loss": 5.6214,
|
|
"mean_token_accuracy": 0.1554511606693268,
|
|
"num_tokens": 21318004.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 5.676923847198486,
|
|
"epoch": 0.9708044528460408,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004911790186990005,
|
|
"loss": 5.4377,
|
|
"mean_token_accuracy": 0.16938215047121047,
|
|
"num_tokens": 21327373.0,
|
|
"step": 11555
|
|
},
|
|
{
|
|
"entropy": 5.664393568038941,
|
|
"epoch": 0.9712245326612057,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004911707138778643,
|
|
"loss": 5.5261,
|
|
"mean_token_accuracy": 0.15850266367197036,
|
|
"num_tokens": 21335654.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 5.805261135101318,
|
|
"epoch": 0.9716446124763705,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004911624052273754,
|
|
"loss": 5.5917,
|
|
"mean_token_accuracy": 0.15714938044548035,
|
|
"num_tokens": 21344464.0,
|
|
"step": 11565
|
|
},
|
|
{
|
|
"entropy": 5.811971664428711,
|
|
"epoch": 0.9720646922915354,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004911540927476807,
|
|
"loss": 5.6846,
|
|
"mean_token_accuracy": 0.15539554506540298,
|
|
"num_tokens": 21354121.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 5.761470699310303,
|
|
"epoch": 0.9724847721067003,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004911457764389275,
|
|
"loss": 5.6129,
|
|
"mean_token_accuracy": 0.16058044135570526,
|
|
"num_tokens": 21363395.0,
|
|
"step": 11575
|
|
},
|
|
{
|
|
"entropy": 5.740648984909058,
|
|
"epoch": 0.9729048519218652,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004911374563012633,
|
|
"loss": 5.5736,
|
|
"mean_token_accuracy": 0.15647933781147003,
|
|
"num_tokens": 21372126.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 5.771029758453369,
|
|
"epoch": 0.97332493173703,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004911291323348352,
|
|
"loss": 5.6557,
|
|
"mean_token_accuracy": 0.14915687441825867,
|
|
"num_tokens": 21380554.0,
|
|
"step": 11585
|
|
},
|
|
{
|
|
"entropy": 5.70338454246521,
|
|
"epoch": 0.973745011552195,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004911208045397909,
|
|
"loss": 5.5306,
|
|
"mean_token_accuracy": 0.15759393125772475,
|
|
"num_tokens": 21389317.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 5.784313058853149,
|
|
"epoch": 0.9741650913673598,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004911124729162778,
|
|
"loss": 5.66,
|
|
"mean_token_accuracy": 0.1539946123957634,
|
|
"num_tokens": 21398926.0,
|
|
"step": 11595
|
|
},
|
|
{
|
|
"entropy": 5.741526746749878,
|
|
"epoch": 0.9745851711825246,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004911041374644435,
|
|
"loss": 5.4636,
|
|
"mean_token_accuracy": 0.16005127429962157,
|
|
"num_tokens": 21406962.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 5.734314489364624,
|
|
"epoch": 0.9750052509976895,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004910957981844357,
|
|
"loss": 5.5654,
|
|
"mean_token_accuracy": 0.16124276220798492,
|
|
"num_tokens": 21415868.0,
|
|
"step": 11605
|
|
},
|
|
{
|
|
"entropy": 5.803146696090698,
|
|
"epoch": 0.9754253308128544,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004910874550764022,
|
|
"loss": 5.6967,
|
|
"mean_token_accuracy": 0.15788596943020822,
|
|
"num_tokens": 21424544.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 5.653631067276001,
|
|
"epoch": 0.9758454106280193,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004910791081404907,
|
|
"loss": 5.5587,
|
|
"mean_token_accuracy": 0.16439975649118424,
|
|
"num_tokens": 21433589.0,
|
|
"step": 11615
|
|
},
|
|
{
|
|
"entropy": 5.75174469947815,
|
|
"epoch": 0.9762654904431842,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004910707573768489,
|
|
"loss": 5.6188,
|
|
"mean_token_accuracy": 0.15351523384451865,
|
|
"num_tokens": 21442084.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 5.711339998245239,
|
|
"epoch": 0.9766855702583491,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004910624027856251,
|
|
"loss": 5.5242,
|
|
"mean_token_accuracy": 0.15779978781938553,
|
|
"num_tokens": 21450962.0,
|
|
"step": 11625
|
|
},
|
|
{
|
|
"entropy": 5.761394453048706,
|
|
"epoch": 0.977105650073514,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004910540443669669,
|
|
"loss": 5.616,
|
|
"mean_token_accuracy": 0.15358125492930413,
|
|
"num_tokens": 21461322.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 5.790155363082886,
|
|
"epoch": 0.9775257298886788,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004910456821210227,
|
|
"loss": 5.5963,
|
|
"mean_token_accuracy": 0.16139813885092735,
|
|
"num_tokens": 21470800.0,
|
|
"step": 11635
|
|
},
|
|
{
|
|
"entropy": 5.705955171585083,
|
|
"epoch": 0.9779458097038437,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004910373160479404,
|
|
"loss": 5.4578,
|
|
"mean_token_accuracy": 0.1623155578970909,
|
|
"num_tokens": 21479707.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 5.705592966079712,
|
|
"epoch": 0.9783658895190086,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004910289461478683,
|
|
"loss": 5.6531,
|
|
"mean_token_accuracy": 0.14903590232133865,
|
|
"num_tokens": 21489469.0,
|
|
"step": 11645
|
|
},
|
|
{
|
|
"entropy": 5.782165670394898,
|
|
"epoch": 0.9787859693341735,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004910205724209547,
|
|
"loss": 5.6102,
|
|
"mean_token_accuracy": 0.15439205691218377,
|
|
"num_tokens": 21499226.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 5.662615633010864,
|
|
"epoch": 0.9792060491493384,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004910121948673478,
|
|
"loss": 5.4725,
|
|
"mean_token_accuracy": 0.16271869242191314,
|
|
"num_tokens": 21508129.0,
|
|
"step": 11655
|
|
},
|
|
{
|
|
"entropy": 5.677742385864258,
|
|
"epoch": 0.9796261289645033,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004910038134871962,
|
|
"loss": 5.5133,
|
|
"mean_token_accuracy": 0.16307872533798218,
|
|
"num_tokens": 21516293.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 5.8114800453186035,
|
|
"epoch": 0.9800462087796681,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004909954282806482,
|
|
"loss": 5.663,
|
|
"mean_token_accuracy": 0.15625039413571357,
|
|
"num_tokens": 21525393.0,
|
|
"step": 11665
|
|
},
|
|
{
|
|
"entropy": 5.650265026092529,
|
|
"epoch": 0.980466288594833,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004909870392478524,
|
|
"loss": 5.5162,
|
|
"mean_token_accuracy": 0.15820949375629426,
|
|
"num_tokens": 21534585.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 5.637864255905152,
|
|
"epoch": 0.9808863684099979,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004909786463889575,
|
|
"loss": 5.4578,
|
|
"mean_token_accuracy": 0.16383379697799683,
|
|
"num_tokens": 21542947.0,
|
|
"step": 11675
|
|
},
|
|
{
|
|
"entropy": 5.737944889068603,
|
|
"epoch": 0.9813064482251628,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004909702497041121,
|
|
"loss": 5.5743,
|
|
"mean_token_accuracy": 0.16033673286437988,
|
|
"num_tokens": 21552168.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 5.723841714859009,
|
|
"epoch": 0.9817265280403277,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004909618491934648,
|
|
"loss": 5.577,
|
|
"mean_token_accuracy": 0.16168997883796693,
|
|
"num_tokens": 21562131.0,
|
|
"step": 11685
|
|
},
|
|
{
|
|
"entropy": 5.690407085418701,
|
|
"epoch": 0.9821466078554926,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004909534448571647,
|
|
"loss": 5.5295,
|
|
"mean_token_accuracy": 0.1657412603497505,
|
|
"num_tokens": 21571363.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 5.723976564407349,
|
|
"epoch": 0.9825666876706575,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004909450366953604,
|
|
"loss": 5.5015,
|
|
"mean_token_accuracy": 0.16331232860684394,
|
|
"num_tokens": 21580754.0,
|
|
"step": 11695
|
|
},
|
|
{
|
|
"entropy": 5.728280305862427,
|
|
"epoch": 0.9829867674858223,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000490936624708201,
|
|
"loss": 5.6055,
|
|
"mean_token_accuracy": 0.15559826791286469,
|
|
"num_tokens": 21590053.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 5.720153570175171,
|
|
"epoch": 0.9834068473009872,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004909282088958356,
|
|
"loss": 5.5648,
|
|
"mean_token_accuracy": 0.1572035074234009,
|
|
"num_tokens": 21598681.0,
|
|
"step": 11705
|
|
},
|
|
{
|
|
"entropy": 5.809522724151611,
|
|
"epoch": 0.983826927116152,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000490919789258413,
|
|
"loss": 5.5901,
|
|
"mean_token_accuracy": 0.1646919757127762,
|
|
"num_tokens": 21607465.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 5.760197687149048,
|
|
"epoch": 0.984247006931317,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004909113657960826,
|
|
"loss": 5.6859,
|
|
"mean_token_accuracy": 0.1438727371394634,
|
|
"num_tokens": 21617480.0,
|
|
"step": 11715
|
|
},
|
|
{
|
|
"entropy": 5.747771978378296,
|
|
"epoch": 0.9846670867464818,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004909029385089935,
|
|
"loss": 5.5799,
|
|
"mean_token_accuracy": 0.16191355288028716,
|
|
"num_tokens": 21626434.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 5.759758377075196,
|
|
"epoch": 0.9850871665616467,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.000490894507397295,
|
|
"loss": 5.5507,
|
|
"mean_token_accuracy": 0.1621351957321167,
|
|
"num_tokens": 21635627.0,
|
|
"step": 11725
|
|
},
|
|
{
|
|
"entropy": 5.738065910339356,
|
|
"epoch": 0.9855072463768116,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004908860724611365,
|
|
"loss": 5.5608,
|
|
"mean_token_accuracy": 0.1566981017589569,
|
|
"num_tokens": 21644789.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 5.63734655380249,
|
|
"epoch": 0.9859273261919764,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004908776337006675,
|
|
"loss": 5.5664,
|
|
"mean_token_accuracy": 0.15962323546409607,
|
|
"num_tokens": 21653696.0,
|
|
"step": 11735
|
|
},
|
|
{
|
|
"entropy": 5.737686443328857,
|
|
"epoch": 0.9863474060071413,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004908691911160373,
|
|
"loss": 5.5614,
|
|
"mean_token_accuracy": 0.15139272063970566,
|
|
"num_tokens": 21664420.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 5.753671407699585,
|
|
"epoch": 0.9867674858223062,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004908607447073954,
|
|
"loss": 5.5481,
|
|
"mean_token_accuracy": 0.15880379527807237,
|
|
"num_tokens": 21673716.0,
|
|
"step": 11745
|
|
},
|
|
{
|
|
"entropy": 5.73064112663269,
|
|
"epoch": 0.9871875656374711,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004908522944748917,
|
|
"loss": 5.5493,
|
|
"mean_token_accuracy": 0.16386302858591079,
|
|
"num_tokens": 21682860.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 5.609640121459961,
|
|
"epoch": 0.987607645452636,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004908438404186758,
|
|
"loss": 5.5444,
|
|
"mean_token_accuracy": 0.1676987513899803,
|
|
"num_tokens": 21691915.0,
|
|
"step": 11755
|
|
},
|
|
{
|
|
"entropy": 5.773650169372559,
|
|
"epoch": 0.9880277252678009,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004908353825388973,
|
|
"loss": 5.6686,
|
|
"mean_token_accuracy": 0.1477293998003006,
|
|
"num_tokens": 21701666.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 5.837761163711548,
|
|
"epoch": 0.9884478050829658,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004908269208357062,
|
|
"loss": 5.6005,
|
|
"mean_token_accuracy": 0.16498832553625106,
|
|
"num_tokens": 21709267.0,
|
|
"step": 11765
|
|
},
|
|
{
|
|
"entropy": 5.687007045745849,
|
|
"epoch": 0.9888678848981306,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004908184553092523,
|
|
"loss": 5.4664,
|
|
"mean_token_accuracy": 0.16219521760940553,
|
|
"num_tokens": 21718117.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 5.74579439163208,
|
|
"epoch": 0.9892879647132955,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004908099859596856,
|
|
"loss": 5.6226,
|
|
"mean_token_accuracy": 0.16140222251415254,
|
|
"num_tokens": 21727952.0,
|
|
"step": 11775
|
|
},
|
|
{
|
|
"entropy": 5.798332405090332,
|
|
"epoch": 0.9897080445284604,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004908015127871561,
|
|
"loss": 5.5076,
|
|
"mean_token_accuracy": 0.15737968385219575,
|
|
"num_tokens": 21737878.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 5.66026086807251,
|
|
"epoch": 0.9901281243436253,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000490793035791814,
|
|
"loss": 5.4333,
|
|
"mean_token_accuracy": 0.16479237079620362,
|
|
"num_tokens": 21747391.0,
|
|
"step": 11785
|
|
},
|
|
{
|
|
"entropy": 5.639067459106445,
|
|
"epoch": 0.9905482041587902,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004907845549738093,
|
|
"loss": 5.4825,
|
|
"mean_token_accuracy": 0.1608181118965149,
|
|
"num_tokens": 21756791.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 5.626802778244018,
|
|
"epoch": 0.9909682839739551,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004907760703332923,
|
|
"loss": 5.514,
|
|
"mean_token_accuracy": 0.16045965999364853,
|
|
"num_tokens": 21766020.0,
|
|
"step": 11795
|
|
},
|
|
{
|
|
"entropy": 5.7946771621704105,
|
|
"epoch": 0.99138836378912,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004907675818704134,
|
|
"loss": 5.6332,
|
|
"mean_token_accuracy": 0.15098711997270584,
|
|
"num_tokens": 21775895.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 5.720692729949951,
|
|
"epoch": 0.9918084436042848,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004907590895853228,
|
|
"loss": 5.5368,
|
|
"mean_token_accuracy": 0.16272979229688644,
|
|
"num_tokens": 21784543.0,
|
|
"step": 11805
|
|
},
|
|
{
|
|
"entropy": 5.734677982330322,
|
|
"epoch": 0.9922285234194497,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004907505934781712,
|
|
"loss": 5.5898,
|
|
"mean_token_accuracy": 0.15340466499328614,
|
|
"num_tokens": 21793938.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 5.73793478012085,
|
|
"epoch": 0.9926486032346146,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004907420935491087,
|
|
"loss": 5.5694,
|
|
"mean_token_accuracy": 0.15643597394227982,
|
|
"num_tokens": 21803641.0,
|
|
"step": 11815
|
|
},
|
|
{
|
|
"entropy": 5.734190273284912,
|
|
"epoch": 0.9930686830497795,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004907335897982862,
|
|
"loss": 5.4978,
|
|
"mean_token_accuracy": 0.1619450032711029,
|
|
"num_tokens": 21812542.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 5.653626728057861,
|
|
"epoch": 0.9934887628649444,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004907250822258543,
|
|
"loss": 5.5806,
|
|
"mean_token_accuracy": 0.15819441080093383,
|
|
"num_tokens": 21821847.0,
|
|
"step": 11825
|
|
},
|
|
{
|
|
"entropy": 5.8374409675598145,
|
|
"epoch": 0.9939088426801093,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004907165708319637,
|
|
"loss": 5.6198,
|
|
"mean_token_accuracy": 0.15984491556882857,
|
|
"num_tokens": 21830799.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 5.780798053741455,
|
|
"epoch": 0.994328922495274,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004907080556167651,
|
|
"loss": 5.5597,
|
|
"mean_token_accuracy": 0.15932203084230423,
|
|
"num_tokens": 21840202.0,
|
|
"step": 11835
|
|
},
|
|
{
|
|
"entropy": 5.827149820327759,
|
|
"epoch": 0.994749002310439,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004906995365804093,
|
|
"loss": 5.665,
|
|
"mean_token_accuracy": 0.15373467579483985,
|
|
"num_tokens": 21849701.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 5.745590162277222,
|
|
"epoch": 0.9951690821256038,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004906910137230472,
|
|
"loss": 5.5375,
|
|
"mean_token_accuracy": 0.161653570830822,
|
|
"num_tokens": 21859191.0,
|
|
"step": 11845
|
|
},
|
|
{
|
|
"entropy": 5.722856521606445,
|
|
"epoch": 0.9955891619407687,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00049068248704483,
|
|
"loss": 5.5202,
|
|
"mean_token_accuracy": 0.157266703248024,
|
|
"num_tokens": 21867944.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 5.644532155990601,
|
|
"epoch": 0.9960092417559336,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004906739565459085,
|
|
"loss": 5.5632,
|
|
"mean_token_accuracy": 0.15848701894283296,
|
|
"num_tokens": 21876368.0,
|
|
"step": 11855
|
|
},
|
|
{
|
|
"entropy": 5.863846969604492,
|
|
"epoch": 0.9964293215710985,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000490665422226434,
|
|
"loss": 5.6436,
|
|
"mean_token_accuracy": 0.1514528512954712,
|
|
"num_tokens": 21885634.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 5.6821434020996096,
|
|
"epoch": 0.9968494013862634,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004906568840865576,
|
|
"loss": 5.4504,
|
|
"mean_token_accuracy": 0.16308265626430513,
|
|
"num_tokens": 21894315.0,
|
|
"step": 11865
|
|
},
|
|
{
|
|
"entropy": 5.626799726486206,
|
|
"epoch": 0.9972694812014282,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004906483421264305,
|
|
"loss": 5.5695,
|
|
"mean_token_accuracy": 0.159691222012043,
|
|
"num_tokens": 21903342.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 5.7634326934814455,
|
|
"epoch": 0.9976895610165931,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.000490639796346204,
|
|
"loss": 5.686,
|
|
"mean_token_accuracy": 0.15302741080522536,
|
|
"num_tokens": 21914158.0,
|
|
"step": 11875
|
|
},
|
|
{
|
|
"entropy": 5.901743459701538,
|
|
"epoch": 0.998109640831758,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004906312467460297,
|
|
"loss": 5.5633,
|
|
"mean_token_accuracy": 0.16004915833473204,
|
|
"num_tokens": 21922639.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 5.736720323562622,
|
|
"epoch": 0.9985297206469229,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004906226933260588,
|
|
"loss": 5.5645,
|
|
"mean_token_accuracy": 0.15823576152324675,
|
|
"num_tokens": 21931385.0,
|
|
"step": 11885
|
|
},
|
|
{
|
|
"entropy": 5.78201150894165,
|
|
"epoch": 0.9989498004620878,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004906141360864429,
|
|
"loss": 5.5746,
|
|
"mean_token_accuracy": 0.15795834213495255,
|
|
"num_tokens": 21940788.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 5.749546051025391,
|
|
"epoch": 0.9993698802772527,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004906055750273336,
|
|
"loss": 5.5854,
|
|
"mean_token_accuracy": 0.15715595483779907,
|
|
"num_tokens": 21950309.0,
|
|
"step": 11895
|
|
},
|
|
{
|
|
"entropy": 5.691565322875976,
|
|
"epoch": 0.9997899600924176,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004905970101488826,
|
|
"loss": 5.5724,
|
|
"mean_token_accuracy": 0.15797929465770721,
|
|
"num_tokens": 21959141.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 5.813778877258301,
|
|
"epoch": 1.000168031926066,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004905884414512416,
|
|
"loss": 5.6073,
|
|
"mean_token_accuracy": 0.15993836356533897,
|
|
"num_tokens": 21966665.0,
|
|
"step": 11905
|
|
},
|
|
{
|
|
"entropy": 5.747717571258545,
|
|
"epoch": 1.0005881117412307,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004905798689345623,
|
|
"loss": 5.5985,
|
|
"mean_token_accuracy": 0.15958280488848686,
|
|
"num_tokens": 21976728.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 5.70471978187561,
|
|
"epoch": 1.0010081915563958,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004905712925989968,
|
|
"loss": 5.4321,
|
|
"mean_token_accuracy": 0.1577399954199791,
|
|
"num_tokens": 21985915.0,
|
|
"step": 11915
|
|
},
|
|
{
|
|
"entropy": 5.717014789581299,
|
|
"epoch": 1.0014282713715605,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004905627124446967,
|
|
"loss": 5.4817,
|
|
"mean_token_accuracy": 0.16125397384166718,
|
|
"num_tokens": 21995826.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 5.673809146881103,
|
|
"epoch": 1.0018483511867255,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004905541284718142,
|
|
"loss": 5.441,
|
|
"mean_token_accuracy": 0.16078125834465026,
|
|
"num_tokens": 22005299.0,
|
|
"step": 11925
|
|
},
|
|
{
|
|
"entropy": 5.688680934906006,
|
|
"epoch": 1.0022684310018903,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004905455406805011,
|
|
"loss": 5.48,
|
|
"mean_token_accuracy": 0.160285322368145,
|
|
"num_tokens": 22014499.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 5.827605724334717,
|
|
"epoch": 1.0026885108170553,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00049053694907091,
|
|
"loss": 5.6404,
|
|
"mean_token_accuracy": 0.15083224773406984,
|
|
"num_tokens": 22024531.0,
|
|
"step": 11935
|
|
},
|
|
{
|
|
"entropy": 5.765188217163086,
|
|
"epoch": 1.0031085906322201,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004905283536431928,
|
|
"loss": 5.546,
|
|
"mean_token_accuracy": 0.16389428079128265,
|
|
"num_tokens": 22034036.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 5.673288774490357,
|
|
"epoch": 1.003528670447385,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004905197543975017,
|
|
"loss": 5.4413,
|
|
"mean_token_accuracy": 0.16298594772815705,
|
|
"num_tokens": 22042910.0,
|
|
"step": 11945
|
|
},
|
|
{
|
|
"entropy": 5.742687463760376,
|
|
"epoch": 1.00394875026255,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004905111513339892,
|
|
"loss": 5.5236,
|
|
"mean_token_accuracy": 0.16467590481042862,
|
|
"num_tokens": 22052242.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 5.723882246017456,
|
|
"epoch": 1.0043688300777147,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004905025444528076,
|
|
"loss": 5.4865,
|
|
"mean_token_accuracy": 0.15788668096065522,
|
|
"num_tokens": 22061467.0,
|
|
"step": 11955
|
|
},
|
|
{
|
|
"entropy": 5.6063799381256105,
|
|
"epoch": 1.0047889098928797,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004904939337541093,
|
|
"loss": 5.3608,
|
|
"mean_token_accuracy": 0.1663319230079651,
|
|
"num_tokens": 22070300.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 5.7507532119750975,
|
|
"epoch": 1.0052089897080445,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004904853192380472,
|
|
"loss": 5.5215,
|
|
"mean_token_accuracy": 0.158057052642107,
|
|
"num_tokens": 22078960.0,
|
|
"step": 11965
|
|
},
|
|
{
|
|
"entropy": 5.719160795211792,
|
|
"epoch": 1.0056290695232095,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004904767009047733,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.1630512699484825,
|
|
"num_tokens": 22088135.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 5.731142950057984,
|
|
"epoch": 1.0060491493383743,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004904680787544408,
|
|
"loss": 5.582,
|
|
"mean_token_accuracy": 0.15549475252628325,
|
|
"num_tokens": 22098004.0,
|
|
"step": 11975
|
|
},
|
|
{
|
|
"entropy": 5.818147802352906,
|
|
"epoch": 1.006469229153539,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004904594527872022,
|
|
"loss": 5.5522,
|
|
"mean_token_accuracy": 0.15604811310768127,
|
|
"num_tokens": 22107680.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 5.7786630153656,
|
|
"epoch": 1.006889308968704,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004904508230032103,
|
|
"loss": 5.5677,
|
|
"mean_token_accuracy": 0.1585972711443901,
|
|
"num_tokens": 22118004.0,
|
|
"step": 11985
|
|
},
|
|
{
|
|
"entropy": 5.7285055160522464,
|
|
"epoch": 1.0073093887838689,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000490442189402618,
|
|
"loss": 5.5151,
|
|
"mean_token_accuracy": 0.17011249363422393,
|
|
"num_tokens": 22127825.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 5.711953926086426,
|
|
"epoch": 1.007729468599034,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004904335519855783,
|
|
"loss": 5.4227,
|
|
"mean_token_accuracy": 0.16442998498678207,
|
|
"num_tokens": 22136448.0,
|
|
"step": 11995
|
|
},
|
|
{
|
|
"entropy": 5.657416820526123,
|
|
"epoch": 1.0081495484141987,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004904249107522442,
|
|
"loss": 5.5436,
|
|
"mean_token_accuracy": 0.15949945598840715,
|
|
"num_tokens": 22146415.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 1.0081495484141987,
|
|
"eval_entropy": 5.525661500662507,
|
|
"eval_loss": 5.590455532073975,
|
|
"eval_mean_token_accuracy": 0.16449697244313435,
|
|
"eval_num_tokens": 22146415.0,
|
|
"eval_runtime": 27.36,
|
|
"eval_samples_per_second": 1365.715,
|
|
"eval_steps_per_second": 170.724,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 5.816870403289795,
|
|
"epoch": 1.0085696282293637,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004904162657027685,
|
|
"loss": 5.6473,
|
|
"mean_token_accuracy": 0.1565300554037094,
|
|
"num_tokens": 22156327.0,
|
|
"step": 12005
|
|
},
|
|
{
|
|
"entropy": 5.738042402267456,
|
|
"epoch": 1.0089897080445285,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004904076168373049,
|
|
"loss": 5.4672,
|
|
"mean_token_accuracy": 0.1601177304983139,
|
|
"num_tokens": 22165677.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 5.727717494964599,
|
|
"epoch": 1.0094097878596933,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004903989641560061,
|
|
"loss": 5.5975,
|
|
"mean_token_accuracy": 0.1590371698141098,
|
|
"num_tokens": 22175232.0,
|
|
"step": 12015
|
|
},
|
|
{
|
|
"entropy": 5.758626651763916,
|
|
"epoch": 1.0098298676748583,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004903903076590256,
|
|
"loss": 5.473,
|
|
"mean_token_accuracy": 0.15314906388521193,
|
|
"num_tokens": 22184026.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 5.663096857070923,
|
|
"epoch": 1.010249947490023,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004903816473465167,
|
|
"loss": 5.3778,
|
|
"mean_token_accuracy": 0.1727016821503639,
|
|
"num_tokens": 22192020.0,
|
|
"step": 12025
|
|
},
|
|
{
|
|
"entropy": 5.613332319259643,
|
|
"epoch": 1.010670027305188,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004903729832186328,
|
|
"loss": 5.3511,
|
|
"mean_token_accuracy": 0.16883303374052047,
|
|
"num_tokens": 22200060.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 5.620872068405151,
|
|
"epoch": 1.0110901071203529,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004903643152755274,
|
|
"loss": 5.407,
|
|
"mean_token_accuracy": 0.1603987216949463,
|
|
"num_tokens": 22208625.0,
|
|
"step": 12035
|
|
},
|
|
{
|
|
"entropy": 5.685234689712525,
|
|
"epoch": 1.0115101869355176,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004903556435173541,
|
|
"loss": 5.3922,
|
|
"mean_token_accuracy": 0.1666228473186493,
|
|
"num_tokens": 22217781.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 5.746535110473633,
|
|
"epoch": 1.0119302667506826,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004903469679442665,
|
|
"loss": 5.5318,
|
|
"mean_token_accuracy": 0.16123737245798112,
|
|
"num_tokens": 22226432.0,
|
|
"step": 12045
|
|
},
|
|
{
|
|
"entropy": 5.652414417266845,
|
|
"epoch": 1.0123503465658474,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004903382885564181,
|
|
"loss": 5.5297,
|
|
"mean_token_accuracy": 0.16408767104148864,
|
|
"num_tokens": 22234811.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 5.5869992733001705,
|
|
"epoch": 1.0127704263810124,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000490329605353963,
|
|
"loss": 5.4075,
|
|
"mean_token_accuracy": 0.17152390927076339,
|
|
"num_tokens": 22242808.0,
|
|
"step": 12055
|
|
},
|
|
{
|
|
"entropy": 5.75869345664978,
|
|
"epoch": 1.0131905061961772,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004903209183370547,
|
|
"loss": 5.4738,
|
|
"mean_token_accuracy": 0.1645299270749092,
|
|
"num_tokens": 22251371.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 5.830525541305542,
|
|
"epoch": 1.0136105860113422,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004903122275058472,
|
|
"loss": 5.5546,
|
|
"mean_token_accuracy": 0.16162935346364976,
|
|
"num_tokens": 22260868.0,
|
|
"step": 12065
|
|
},
|
|
{
|
|
"entropy": 5.650126838684082,
|
|
"epoch": 1.014030665826507,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004903035328604944,
|
|
"loss": 5.4551,
|
|
"mean_token_accuracy": 0.16388770192861557,
|
|
"num_tokens": 22270554.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 5.599603319168091,
|
|
"epoch": 1.0144507456416718,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004902948344011506,
|
|
"loss": 5.4471,
|
|
"mean_token_accuracy": 0.16133227497339248,
|
|
"num_tokens": 22279170.0,
|
|
"step": 12075
|
|
},
|
|
{
|
|
"entropy": 5.739398241043091,
|
|
"epoch": 1.0148708254568368,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004902861321279694,
|
|
"loss": 5.6051,
|
|
"mean_token_accuracy": 0.1532390832901001,
|
|
"num_tokens": 22288788.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 5.6841353416442875,
|
|
"epoch": 1.0152909052720016,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004902774260411055,
|
|
"loss": 5.385,
|
|
"mean_token_accuracy": 0.1635892152786255,
|
|
"num_tokens": 22297501.0,
|
|
"step": 12085
|
|
},
|
|
{
|
|
"entropy": 5.612368249893189,
|
|
"epoch": 1.0157109850871666,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004902687161407126,
|
|
"loss": 5.3466,
|
|
"mean_token_accuracy": 0.17515814155340195,
|
|
"num_tokens": 22306181.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 5.670634174346924,
|
|
"epoch": 1.0161310649023314,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004902600024269454,
|
|
"loss": 5.5072,
|
|
"mean_token_accuracy": 0.16697340905666352,
|
|
"num_tokens": 22315762.0,
|
|
"step": 12095
|
|
},
|
|
{
|
|
"entropy": 5.626059675216675,
|
|
"epoch": 1.0165511447174964,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.000490251284899958,
|
|
"loss": 5.439,
|
|
"mean_token_accuracy": 0.16588278263807296,
|
|
"num_tokens": 22325127.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 5.649977350234986,
|
|
"epoch": 1.0169712245326612,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000490242563559905,
|
|
"loss": 5.5278,
|
|
"mean_token_accuracy": 0.15909326523542405,
|
|
"num_tokens": 22334038.0,
|
|
"step": 12105
|
|
},
|
|
{
|
|
"entropy": 5.681149196624756,
|
|
"epoch": 1.017391304347826,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004902338384069408,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.16700164079666138,
|
|
"num_tokens": 22342658.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 5.748837232589722,
|
|
"epoch": 1.017811384162991,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00049022510944122,
|
|
"loss": 5.5592,
|
|
"mean_token_accuracy": 0.1559050902724266,
|
|
"num_tokens": 22352559.0,
|
|
"step": 12115
|
|
},
|
|
{
|
|
"entropy": 5.741272211074829,
|
|
"epoch": 1.0182314639781558,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004902163766628972,
|
|
"loss": 5.4663,
|
|
"mean_token_accuracy": 0.16664180606603624,
|
|
"num_tokens": 22361455.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 5.761194944381714,
|
|
"epoch": 1.0186515437933208,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004902076400721271,
|
|
"loss": 5.5025,
|
|
"mean_token_accuracy": 0.15924629420042039,
|
|
"num_tokens": 22371163.0,
|
|
"step": 12125
|
|
},
|
|
{
|
|
"entropy": 5.786735534667969,
|
|
"epoch": 1.0190716236084856,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004901988996690645,
|
|
"loss": 5.4939,
|
|
"mean_token_accuracy": 0.16901676952838898,
|
|
"num_tokens": 22379975.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 5.794359588623047,
|
|
"epoch": 1.0194917034236506,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004901901554538641,
|
|
"loss": 5.5351,
|
|
"mean_token_accuracy": 0.16184651851654053,
|
|
"num_tokens": 22389657.0,
|
|
"step": 12135
|
|
},
|
|
{
|
|
"entropy": 5.626089334487915,
|
|
"epoch": 1.0199117832388154,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000490181407426681,
|
|
"loss": 5.3773,
|
|
"mean_token_accuracy": 0.16764698773622513,
|
|
"num_tokens": 22398320.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 5.705850219726562,
|
|
"epoch": 1.0203318630539802,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004901726555876701,
|
|
"loss": 5.573,
|
|
"mean_token_accuracy": 0.1539936549961567,
|
|
"num_tokens": 22406634.0,
|
|
"step": 12145
|
|
},
|
|
{
|
|
"entropy": 5.800102376937867,
|
|
"epoch": 1.0207519428691452,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004901638999369862,
|
|
"loss": 5.6111,
|
|
"mean_token_accuracy": 0.15667299777269364,
|
|
"num_tokens": 22415939.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 5.758721494674683,
|
|
"epoch": 1.02117202268431,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004901551404747847,
|
|
"loss": 5.5353,
|
|
"mean_token_accuracy": 0.1576780617237091,
|
|
"num_tokens": 22425256.0,
|
|
"step": 12155
|
|
},
|
|
{
|
|
"entropy": 5.758379936218262,
|
|
"epoch": 1.021592102499475,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004901463772012209,
|
|
"loss": 5.6105,
|
|
"mean_token_accuracy": 0.15414702594280244,
|
|
"num_tokens": 22434750.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 5.7319268703460695,
|
|
"epoch": 1.0220121823146397,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004901376101164495,
|
|
"loss": 5.4788,
|
|
"mean_token_accuracy": 0.16012528240680696,
|
|
"num_tokens": 22443426.0,
|
|
"step": 12165
|
|
},
|
|
{
|
|
"entropy": 5.718150901794433,
|
|
"epoch": 1.0224322621298048,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0004901288392206263,
|
|
"loss": 5.496,
|
|
"mean_token_accuracy": 0.15628497451543807,
|
|
"num_tokens": 22452778.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 5.683122968673706,
|
|
"epoch": 1.0228523419449695,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004901200645139064,
|
|
"loss": 5.4532,
|
|
"mean_token_accuracy": 0.1657660871744156,
|
|
"num_tokens": 22462864.0,
|
|
"step": 12175
|
|
},
|
|
{
|
|
"entropy": 5.715426301956176,
|
|
"epoch": 1.0232724217601343,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004901112859964454,
|
|
"loss": 5.515,
|
|
"mean_token_accuracy": 0.1562432289123535,
|
|
"num_tokens": 22472849.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 5.6747640609741214,
|
|
"epoch": 1.0236925015752993,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004901025036683987,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.15990415960550308,
|
|
"num_tokens": 22481693.0,
|
|
"step": 12185
|
|
},
|
|
{
|
|
"entropy": 5.717993688583374,
|
|
"epoch": 1.0241125813904641,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004900937175299219,
|
|
"loss": 5.4283,
|
|
"mean_token_accuracy": 0.16626838445663453,
|
|
"num_tokens": 22490934.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 5.723482513427735,
|
|
"epoch": 1.0245326612056291,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004900849275811707,
|
|
"loss": 5.488,
|
|
"mean_token_accuracy": 0.16016919761896134,
|
|
"num_tokens": 22500457.0,
|
|
"step": 12195
|
|
},
|
|
{
|
|
"entropy": 5.739189004898071,
|
|
"epoch": 1.024952741020794,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004900761338223007,
|
|
"loss": 5.4461,
|
|
"mean_token_accuracy": 0.15878349542617798,
|
|
"num_tokens": 22509641.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 5.648697996139527,
|
|
"epoch": 1.025372820835959,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004900673362534677,
|
|
"loss": 5.3597,
|
|
"mean_token_accuracy": 0.16653590351343156,
|
|
"num_tokens": 22518616.0,
|
|
"step": 12205
|
|
},
|
|
{
|
|
"entropy": 5.73726167678833,
|
|
"epoch": 1.0257929006511237,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004900585348748277,
|
|
"loss": 5.5152,
|
|
"mean_token_accuracy": 0.1678289592266083,
|
|
"num_tokens": 22527599.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 5.699249696731568,
|
|
"epoch": 1.0262129804662885,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004900497296865365,
|
|
"loss": 5.522,
|
|
"mean_token_accuracy": 0.15160454586148261,
|
|
"num_tokens": 22537399.0,
|
|
"step": 12215
|
|
},
|
|
{
|
|
"entropy": 5.909937381744385,
|
|
"epoch": 1.0266330602814535,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004900409206887499,
|
|
"loss": 5.7361,
|
|
"mean_token_accuracy": 0.1503012202680111,
|
|
"num_tokens": 22546746.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 5.758369112014771,
|
|
"epoch": 1.0270531400966183,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004900321078816243,
|
|
"loss": 5.4986,
|
|
"mean_token_accuracy": 0.16833491176366805,
|
|
"num_tokens": 22555735.0,
|
|
"step": 12225
|
|
},
|
|
{
|
|
"entropy": 5.747727394104004,
|
|
"epoch": 1.0274732199117833,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004900232912653156,
|
|
"loss": 5.5011,
|
|
"mean_token_accuracy": 0.16442441418766976,
|
|
"num_tokens": 22565010.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 5.755198526382446,
|
|
"epoch": 1.027893299726948,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00049001447083998,
|
|
"loss": 5.4897,
|
|
"mean_token_accuracy": 0.15753853023052217,
|
|
"num_tokens": 22573565.0,
|
|
"step": 12235
|
|
},
|
|
{
|
|
"entropy": 5.751472043991089,
|
|
"epoch": 1.028313379542113,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004900056466057737,
|
|
"loss": 5.4754,
|
|
"mean_token_accuracy": 0.15711085349321366,
|
|
"num_tokens": 22582549.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 5.6773108959198,
|
|
"epoch": 1.028733459357278,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004899968185628531,
|
|
"loss": 5.5407,
|
|
"mean_token_accuracy": 0.15574416965246202,
|
|
"num_tokens": 22592112.0,
|
|
"step": 12245
|
|
},
|
|
{
|
|
"entropy": 5.632958936691284,
|
|
"epoch": 1.0291535391724427,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004899879867113746,
|
|
"loss": 5.3852,
|
|
"mean_token_accuracy": 0.16619622707366943,
|
|
"num_tokens": 22600581.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 5.809205341339111,
|
|
"epoch": 1.0295736189876077,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004899791510514945,
|
|
"loss": 5.5897,
|
|
"mean_token_accuracy": 0.15455610007047654,
|
|
"num_tokens": 22610822.0,
|
|
"step": 12255
|
|
},
|
|
{
|
|
"entropy": 5.765147113800049,
|
|
"epoch": 1.0299936988027725,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004899703115833696,
|
|
"loss": 5.5785,
|
|
"mean_token_accuracy": 0.1628772124648094,
|
|
"num_tokens": 22619484.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 5.693212890625,
|
|
"epoch": 1.0304137786179375,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004899614683071563,
|
|
"loss": 5.4248,
|
|
"mean_token_accuracy": 0.16597820073366165,
|
|
"num_tokens": 22629038.0,
|
|
"step": 12265
|
|
},
|
|
{
|
|
"entropy": 5.7034484386444095,
|
|
"epoch": 1.0308338584331023,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004899526212230112,
|
|
"loss": 5.516,
|
|
"mean_token_accuracy": 0.15537500530481338,
|
|
"num_tokens": 22638619.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 5.658185815811157,
|
|
"epoch": 1.0312539382482673,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004899437703310912,
|
|
"loss": 5.5003,
|
|
"mean_token_accuracy": 0.16062938123941423,
|
|
"num_tokens": 22648065.0,
|
|
"step": 12275
|
|
},
|
|
{
|
|
"entropy": 5.798764753341675,
|
|
"epoch": 1.031674018063432,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004899349156315529,
|
|
"loss": 5.5658,
|
|
"mean_token_accuracy": 0.15393222272396087,
|
|
"num_tokens": 22658107.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 5.730508184432983,
|
|
"epoch": 1.0320940978785969,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004899260571245533,
|
|
"loss": 5.4466,
|
|
"mean_token_accuracy": 0.16231588870286942,
|
|
"num_tokens": 22667103.0,
|
|
"step": 12285
|
|
},
|
|
{
|
|
"entropy": 5.6667787551879885,
|
|
"epoch": 1.0325141776937619,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004899171948102492,
|
|
"loss": 5.4168,
|
|
"mean_token_accuracy": 0.16460922211408616,
|
|
"num_tokens": 22676792.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 5.648167705535888,
|
|
"epoch": 1.0329342575089266,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004899083286887977,
|
|
"loss": 5.452,
|
|
"mean_token_accuracy": 0.1632228210568428,
|
|
"num_tokens": 22685344.0,
|
|
"step": 12295
|
|
},
|
|
{
|
|
"entropy": 5.778263664245605,
|
|
"epoch": 1.0333543373240917,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004898994587603559,
|
|
"loss": 5.5131,
|
|
"mean_token_accuracy": 0.16273818016052247,
|
|
"num_tokens": 22694387.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 5.695818853378296,
|
|
"epoch": 1.0337744171392564,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004898905850250807,
|
|
"loss": 5.542,
|
|
"mean_token_accuracy": 0.16002353727817537,
|
|
"num_tokens": 22704203.0,
|
|
"step": 12305
|
|
},
|
|
{
|
|
"entropy": 5.760842561721802,
|
|
"epoch": 1.0341944969544214,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004898817074831295,
|
|
"loss": 5.5913,
|
|
"mean_token_accuracy": 0.1574055314064026,
|
|
"num_tokens": 22713518.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 5.756874465942383,
|
|
"epoch": 1.0346145767695862,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004898728261346595,
|
|
"loss": 5.593,
|
|
"mean_token_accuracy": 0.15683950930833818,
|
|
"num_tokens": 22722997.0,
|
|
"step": 12315
|
|
},
|
|
{
|
|
"entropy": 5.767385387420655,
|
|
"epoch": 1.035034656584751,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000489863940979828,
|
|
"loss": 5.534,
|
|
"mean_token_accuracy": 0.15978951305150985,
|
|
"num_tokens": 22732385.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 5.693596649169922,
|
|
"epoch": 1.035454736399916,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004898550520187925,
|
|
"loss": 5.4096,
|
|
"mean_token_accuracy": 0.16672670543193818,
|
|
"num_tokens": 22741148.0,
|
|
"step": 12325
|
|
},
|
|
{
|
|
"entropy": 5.675939607620239,
|
|
"epoch": 1.0358748162150808,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004898461592517103,
|
|
"loss": 5.4109,
|
|
"mean_token_accuracy": 0.16389408260583876,
|
|
"num_tokens": 22750239.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 5.783782148361206,
|
|
"epoch": 1.0362948960302458,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004898372626787391,
|
|
"loss": 5.5538,
|
|
"mean_token_accuracy": 0.15868894159793853,
|
|
"num_tokens": 22759290.0,
|
|
"step": 12335
|
|
},
|
|
{
|
|
"entropy": 5.806813049316406,
|
|
"epoch": 1.0367149758454106,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004898283623000364,
|
|
"loss": 5.5762,
|
|
"mean_token_accuracy": 0.15626893192529678,
|
|
"num_tokens": 22768450.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 5.7313658714294435,
|
|
"epoch": 1.0371350556605754,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004898194581157598,
|
|
"loss": 5.434,
|
|
"mean_token_accuracy": 0.15717112123966218,
|
|
"num_tokens": 22777711.0,
|
|
"step": 12345
|
|
},
|
|
{
|
|
"entropy": 5.695150518417359,
|
|
"epoch": 1.0375551354757404,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004898105501260671,
|
|
"loss": 5.5217,
|
|
"mean_token_accuracy": 0.16438411176204681,
|
|
"num_tokens": 22787153.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 5.763386631011963,
|
|
"epoch": 1.0379752152909052,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004898016383311163,
|
|
"loss": 5.525,
|
|
"mean_token_accuracy": 0.1668378531932831,
|
|
"num_tokens": 22797125.0,
|
|
"step": 12355
|
|
},
|
|
{
|
|
"entropy": 5.720566844940185,
|
|
"epoch": 1.0383952951060702,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.000489792722731065,
|
|
"loss": 5.5131,
|
|
"mean_token_accuracy": 0.1580943688750267,
|
|
"num_tokens": 22806478.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 5.739205694198608,
|
|
"epoch": 1.038815374921235,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004897838033260712,
|
|
"loss": 5.5264,
|
|
"mean_token_accuracy": 0.1504399910569191,
|
|
"num_tokens": 22815375.0,
|
|
"step": 12365
|
|
},
|
|
{
|
|
"entropy": 5.783780193328857,
|
|
"epoch": 1.0392354547364,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004897748801162929,
|
|
"loss": 5.4899,
|
|
"mean_token_accuracy": 0.16633763164281845,
|
|
"num_tokens": 22824401.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 5.735202741622925,
|
|
"epoch": 1.0396555345515648,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004897659531018882,
|
|
"loss": 5.6045,
|
|
"mean_token_accuracy": 0.16063894852995872,
|
|
"num_tokens": 22833933.0,
|
|
"step": 12375
|
|
},
|
|
{
|
|
"entropy": 5.695873117446899,
|
|
"epoch": 1.0400756143667296,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004897570222830152,
|
|
"loss": 5.4862,
|
|
"mean_token_accuracy": 0.15706607103347778,
|
|
"num_tokens": 22843779.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 5.765497493743896,
|
|
"epoch": 1.0404956941818946,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004897480876598322,
|
|
"loss": 5.5739,
|
|
"mean_token_accuracy": 0.15584344267845154,
|
|
"num_tokens": 22852951.0,
|
|
"step": 12385
|
|
},
|
|
{
|
|
"entropy": 5.787726879119873,
|
|
"epoch": 1.0409157739970594,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004897391492324974,
|
|
"loss": 5.5851,
|
|
"mean_token_accuracy": 0.15543406456708908,
|
|
"num_tokens": 22861398.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 5.713971185684204,
|
|
"epoch": 1.0413358538122244,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004897302070011691,
|
|
"loss": 5.4643,
|
|
"mean_token_accuracy": 0.1621120572090149,
|
|
"num_tokens": 22870518.0,
|
|
"step": 12395
|
|
},
|
|
{
|
|
"entropy": 5.669089317321777,
|
|
"epoch": 1.0417559336273892,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004897212609660058,
|
|
"loss": 5.533,
|
|
"mean_token_accuracy": 0.15619430541992188,
|
|
"num_tokens": 22879389.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 5.724472951889038,
|
|
"epoch": 1.0421760134425542,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004897123111271659,
|
|
"loss": 5.5315,
|
|
"mean_token_accuracy": 0.16127124577760696,
|
|
"num_tokens": 22888977.0,
|
|
"step": 12405
|
|
},
|
|
{
|
|
"entropy": 5.853266382217408,
|
|
"epoch": 1.042596093257719,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004897033574848079,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.16196577847003937,
|
|
"num_tokens": 22898446.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 5.70566611289978,
|
|
"epoch": 1.0430161730728837,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004896944000390907,
|
|
"loss": 5.5245,
|
|
"mean_token_accuracy": 0.16506237536668777,
|
|
"num_tokens": 22908044.0,
|
|
"step": 12415
|
|
},
|
|
{
|
|
"entropy": 5.778344535827637,
|
|
"epoch": 1.0434362528880488,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004896854387901725,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.15366130471229553,
|
|
"num_tokens": 22917330.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 5.807542943954468,
|
|
"epoch": 1.0438563327032135,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004896764737382124,
|
|
"loss": 5.5466,
|
|
"mean_token_accuracy": 0.16622493267059327,
|
|
"num_tokens": 22927160.0,
|
|
"step": 12425
|
|
},
|
|
{
|
|
"entropy": 5.792239236831665,
|
|
"epoch": 1.0442764125183785,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004896675048833691,
|
|
"loss": 5.4966,
|
|
"mean_token_accuracy": 0.1603910431265831,
|
|
"num_tokens": 22936755.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 5.707068347930909,
|
|
"epoch": 1.0446964923335433,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004896585322258014,
|
|
"loss": 5.4971,
|
|
"mean_token_accuracy": 0.16156308948993683,
|
|
"num_tokens": 22945699.0,
|
|
"step": 12435
|
|
},
|
|
{
|
|
"entropy": 5.714017152786255,
|
|
"epoch": 1.0451165721487083,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004896495557656685,
|
|
"loss": 5.4759,
|
|
"mean_token_accuracy": 0.17006804645061493,
|
|
"num_tokens": 22954001.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 5.808580160140991,
|
|
"epoch": 1.0455366519638731,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004896405755031293,
|
|
"loss": 5.5673,
|
|
"mean_token_accuracy": 0.15997690260410308,
|
|
"num_tokens": 22963805.0,
|
|
"step": 12445
|
|
},
|
|
{
|
|
"entropy": 5.6800487518310545,
|
|
"epoch": 1.045956731779038,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004896315914383427,
|
|
"loss": 5.5063,
|
|
"mean_token_accuracy": 0.15431105494499206,
|
|
"num_tokens": 22973542.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 5.643172407150269,
|
|
"epoch": 1.046376811594203,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004896226035714679,
|
|
"loss": 5.3786,
|
|
"mean_token_accuracy": 0.16473590731620788,
|
|
"num_tokens": 22982417.0,
|
|
"step": 12455
|
|
},
|
|
{
|
|
"entropy": 5.706238555908203,
|
|
"epoch": 1.0467968914093677,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004896136119026642,
|
|
"loss": 5.5078,
|
|
"mean_token_accuracy": 0.15882690697908403,
|
|
"num_tokens": 22992879.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 5.697173643112182,
|
|
"epoch": 1.0472169712245327,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004896046164320911,
|
|
"loss": 5.3948,
|
|
"mean_token_accuracy": 0.16620510965585708,
|
|
"num_tokens": 23001344.0,
|
|
"step": 12465
|
|
},
|
|
{
|
|
"entropy": 5.659090280532837,
|
|
"epoch": 1.0476370510396975,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004895956171599075,
|
|
"loss": 5.4216,
|
|
"mean_token_accuracy": 0.1704336553812027,
|
|
"num_tokens": 23010007.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 5.747759056091309,
|
|
"epoch": 1.0480571308548625,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004895866140862731,
|
|
"loss": 5.557,
|
|
"mean_token_accuracy": 0.15872435867786408,
|
|
"num_tokens": 23019120.0,
|
|
"step": 12475
|
|
},
|
|
{
|
|
"entropy": 5.71089186668396,
|
|
"epoch": 1.0484772106700273,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004895776072113473,
|
|
"loss": 5.5359,
|
|
"mean_token_accuracy": 0.16418685615062714,
|
|
"num_tokens": 23028562.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 5.689389705657959,
|
|
"epoch": 1.048897290485192,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004895685965352898,
|
|
"loss": 5.4731,
|
|
"mean_token_accuracy": 0.16231704950332643,
|
|
"num_tokens": 23037687.0,
|
|
"step": 12485
|
|
},
|
|
{
|
|
"entropy": 5.757169103622436,
|
|
"epoch": 1.049317370300357,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004895595820582601,
|
|
"loss": 5.4789,
|
|
"mean_token_accuracy": 0.15927850753068923,
|
|
"num_tokens": 23047475.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 5.649786186218262,
|
|
"epoch": 1.0497374501155219,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004895505637804177,
|
|
"loss": 5.5069,
|
|
"mean_token_accuracy": 0.15920701920986174,
|
|
"num_tokens": 23057475.0,
|
|
"step": 12495
|
|
},
|
|
{
|
|
"entropy": 5.599431371688842,
|
|
"epoch": 1.050157529930687,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004895415417019227,
|
|
"loss": 5.4847,
|
|
"mean_token_accuracy": 0.15794518887996672,
|
|
"num_tokens": 23066419.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 5.765432214736938,
|
|
"epoch": 1.0505776097458517,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004895325158229346,
|
|
"loss": 5.5385,
|
|
"mean_token_accuracy": 0.1619092509150505,
|
|
"num_tokens": 23075516.0,
|
|
"step": 12505
|
|
},
|
|
{
|
|
"entropy": 5.721098899841309,
|
|
"epoch": 1.0509976895610167,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004895234861436136,
|
|
"loss": 5.4198,
|
|
"mean_token_accuracy": 0.1697925642132759,
|
|
"num_tokens": 23084132.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 5.783330011367798,
|
|
"epoch": 1.0514177693761815,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004895144526641194,
|
|
"loss": 5.5043,
|
|
"mean_token_accuracy": 0.16086599081754685,
|
|
"num_tokens": 23093958.0,
|
|
"step": 12515
|
|
},
|
|
{
|
|
"entropy": 5.783671569824219,
|
|
"epoch": 1.0518378491913463,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004895054153846123,
|
|
"loss": 5.5409,
|
|
"mean_token_accuracy": 0.1583005540072918,
|
|
"num_tokens": 23103524.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 5.631361865997315,
|
|
"epoch": 1.0522579290065113,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004894963743052521,
|
|
"loss": 5.451,
|
|
"mean_token_accuracy": 0.15645991861820222,
|
|
"num_tokens": 23112445.0,
|
|
"step": 12525
|
|
},
|
|
{
|
|
"entropy": 5.749525880813598,
|
|
"epoch": 1.052678008821676,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004894873294261991,
|
|
"loss": 5.5179,
|
|
"mean_token_accuracy": 0.15921320170164108,
|
|
"num_tokens": 23121299.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 5.767481660842895,
|
|
"epoch": 1.053098088636841,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004894782807476134,
|
|
"loss": 5.5333,
|
|
"mean_token_accuracy": 0.1522089034318924,
|
|
"num_tokens": 23130260.0,
|
|
"step": 12535
|
|
},
|
|
{
|
|
"entropy": 5.74699182510376,
|
|
"epoch": 1.0535181684520059,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004894692282696555,
|
|
"loss": 5.4622,
|
|
"mean_token_accuracy": 0.16261095851659774,
|
|
"num_tokens": 23139335.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 5.656941652297974,
|
|
"epoch": 1.0539382482671709,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004894601719924857,
|
|
"loss": 5.4648,
|
|
"mean_token_accuracy": 0.16428751796483992,
|
|
"num_tokens": 23149299.0,
|
|
"step": 12545
|
|
},
|
|
{
|
|
"entropy": 5.598066186904907,
|
|
"epoch": 1.0543583280823356,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004894511119162644,
|
|
"loss": 5.414,
|
|
"mean_token_accuracy": 0.16812524497509002,
|
|
"num_tokens": 23158651.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 5.759066200256347,
|
|
"epoch": 1.0547784078975004,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000489442048041152,
|
|
"loss": 5.5022,
|
|
"mean_token_accuracy": 0.15415302515029908,
|
|
"num_tokens": 23167629.0,
|
|
"step": 12555
|
|
},
|
|
{
|
|
"entropy": 5.7550407409667965,
|
|
"epoch": 1.0551984877126654,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004894329803673092,
|
|
"loss": 5.4926,
|
|
"mean_token_accuracy": 0.15900574922561644,
|
|
"num_tokens": 23177026.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 5.696121501922607,
|
|
"epoch": 1.0556185675278302,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004894239088948964,
|
|
"loss": 5.4628,
|
|
"mean_token_accuracy": 0.1633963868021965,
|
|
"num_tokens": 23185297.0,
|
|
"step": 12565
|
|
},
|
|
{
|
|
"entropy": 5.640616607666016,
|
|
"epoch": 1.0560386473429952,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004894148336240747,
|
|
"loss": 5.4745,
|
|
"mean_token_accuracy": 0.1665568009018898,
|
|
"num_tokens": 23194804.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 5.749676895141602,
|
|
"epoch": 1.05645872715816,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004894057545550045,
|
|
"loss": 5.5094,
|
|
"mean_token_accuracy": 0.15964649617671967,
|
|
"num_tokens": 23205063.0,
|
|
"step": 12575
|
|
},
|
|
{
|
|
"entropy": 5.691761779785156,
|
|
"epoch": 1.056878806973325,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004893966716878467,
|
|
"loss": 5.4411,
|
|
"mean_token_accuracy": 0.15895105600357057,
|
|
"num_tokens": 23215038.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 5.763622140884399,
|
|
"epoch": 1.0572988867884898,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004893875850227624,
|
|
"loss": 5.614,
|
|
"mean_token_accuracy": 0.151802134513855,
|
|
"num_tokens": 23223530.0,
|
|
"step": 12585
|
|
},
|
|
{
|
|
"entropy": 5.740535259246826,
|
|
"epoch": 1.0577189666036546,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004893784945599124,
|
|
"loss": 5.5385,
|
|
"mean_token_accuracy": 0.16195468753576278,
|
|
"num_tokens": 23232547.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 5.704318332672119,
|
|
"epoch": 1.0581390464188196,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004893694002994577,
|
|
"loss": 5.5753,
|
|
"mean_token_accuracy": 0.16065402403473855,
|
|
"num_tokens": 23241305.0,
|
|
"step": 12595
|
|
},
|
|
{
|
|
"entropy": 5.854096460342407,
|
|
"epoch": 1.0585591262339844,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004893603022415595,
|
|
"loss": 5.6043,
|
|
"mean_token_accuracy": 0.1608058363199234,
|
|
"num_tokens": 23250708.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 5.792645645141602,
|
|
"epoch": 1.0589792060491494,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004893512003863788,
|
|
"loss": 5.5117,
|
|
"mean_token_accuracy": 0.15551188662648202,
|
|
"num_tokens": 23260161.0,
|
|
"step": 12605
|
|
},
|
|
{
|
|
"entropy": 5.682678604125977,
|
|
"epoch": 1.0593992858643142,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004893420947340771,
|
|
"loss": 5.4161,
|
|
"mean_token_accuracy": 0.1580376446247101,
|
|
"num_tokens": 23268932.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 5.680995082855224,
|
|
"epoch": 1.0598193656794792,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004893329852848155,
|
|
"loss": 5.5111,
|
|
"mean_token_accuracy": 0.16238304674625398,
|
|
"num_tokens": 23277741.0,
|
|
"step": 12615
|
|
},
|
|
{
|
|
"entropy": 5.706674957275391,
|
|
"epoch": 1.060239445494644,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004893238720387555,
|
|
"loss": 5.5094,
|
|
"mean_token_accuracy": 0.16012922972440718,
|
|
"num_tokens": 23286982.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 5.713710308074951,
|
|
"epoch": 1.0606595253098088,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004893147549960584,
|
|
"loss": 5.4361,
|
|
"mean_token_accuracy": 0.16573359668254853,
|
|
"num_tokens": 23296902.0,
|
|
"step": 12625
|
|
},
|
|
{
|
|
"entropy": 5.671449041366577,
|
|
"epoch": 1.0610796051249738,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004893056341568857,
|
|
"loss": 5.4432,
|
|
"mean_token_accuracy": 0.16855929046869278,
|
|
"num_tokens": 23305443.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 5.694199895858764,
|
|
"epoch": 1.0614996849401386,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004892965095213992,
|
|
"loss": 5.4203,
|
|
"mean_token_accuracy": 0.16460745334625243,
|
|
"num_tokens": 23315420.0,
|
|
"step": 12635
|
|
},
|
|
{
|
|
"entropy": 5.759862661361694,
|
|
"epoch": 1.0619197647553036,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004892873810897604,
|
|
"loss": 5.5089,
|
|
"mean_token_accuracy": 0.1558899015188217,
|
|
"num_tokens": 23324540.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 5.739316987991333,
|
|
"epoch": 1.0623398445704684,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004892782488621308,
|
|
"loss": 5.4567,
|
|
"mean_token_accuracy": 0.16644190847873688,
|
|
"num_tokens": 23334282.0,
|
|
"step": 12645
|
|
},
|
|
{
|
|
"entropy": 5.712379074096679,
|
|
"epoch": 1.0627599243856332,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004892691128386725,
|
|
"loss": 5.453,
|
|
"mean_token_accuracy": 0.1627206951379776,
|
|
"num_tokens": 23342836.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 5.702242517471314,
|
|
"epoch": 1.0631800042007982,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004892599730195471,
|
|
"loss": 5.4406,
|
|
"mean_token_accuracy": 0.16527725458145143,
|
|
"num_tokens": 23351863.0,
|
|
"step": 12655
|
|
},
|
|
{
|
|
"entropy": 5.809025621414184,
|
|
"epoch": 1.063600084015963,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004892508294049167,
|
|
"loss": 5.6074,
|
|
"mean_token_accuracy": 0.1642581820487976,
|
|
"num_tokens": 23361788.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 5.716249179840088,
|
|
"epoch": 1.064020163831128,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004892416819949431,
|
|
"loss": 5.4403,
|
|
"mean_token_accuracy": 0.15782576352357863,
|
|
"num_tokens": 23370175.0,
|
|
"step": 12665
|
|
},
|
|
{
|
|
"entropy": 5.668329477310181,
|
|
"epoch": 1.0644402436462927,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004892325307897886,
|
|
"loss": 5.4826,
|
|
"mean_token_accuracy": 0.16445921808481218,
|
|
"num_tokens": 23378835.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 5.684893798828125,
|
|
"epoch": 1.0648603234614578,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004892233757896149,
|
|
"loss": 5.4898,
|
|
"mean_token_accuracy": 0.16239043474197387,
|
|
"num_tokens": 23389390.0,
|
|
"step": 12675
|
|
},
|
|
{
|
|
"entropy": 5.731085300445557,
|
|
"epoch": 1.0652804032766225,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004892142169945845,
|
|
"loss": 5.4812,
|
|
"mean_token_accuracy": 0.15869970321655275,
|
|
"num_tokens": 23398802.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 5.663789510726929,
|
|
"epoch": 1.0657004830917876,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004892050544048596,
|
|
"loss": 5.4592,
|
|
"mean_token_accuracy": 0.16194516718387603,
|
|
"num_tokens": 23407731.0,
|
|
"step": 12685
|
|
},
|
|
{
|
|
"entropy": 5.708717679977417,
|
|
"epoch": 1.0661205629069523,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004891958880206024,
|
|
"loss": 5.5059,
|
|
"mean_token_accuracy": 0.15976526141166686,
|
|
"num_tokens": 23417046.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 5.7145740509033205,
|
|
"epoch": 1.0665406427221171,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004891867178419753,
|
|
"loss": 5.5009,
|
|
"mean_token_accuracy": 0.1623055413365364,
|
|
"num_tokens": 23426107.0,
|
|
"step": 12695
|
|
},
|
|
{
|
|
"entropy": 5.758947944641113,
|
|
"epoch": 1.0669607225372821,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004891775438691408,
|
|
"loss": 5.5391,
|
|
"mean_token_accuracy": 0.1586405709385872,
|
|
"num_tokens": 23435523.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 5.691416501998901,
|
|
"epoch": 1.067380802352447,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004891683661022615,
|
|
"loss": 5.4907,
|
|
"mean_token_accuracy": 0.16506600081920625,
|
|
"num_tokens": 23444185.0,
|
|
"step": 12705
|
|
},
|
|
{
|
|
"entropy": 5.812458419799805,
|
|
"epoch": 1.067800882167612,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004891591845414997,
|
|
"loss": 5.678,
|
|
"mean_token_accuracy": 0.14658654034137725,
|
|
"num_tokens": 23454100.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 5.816659593582154,
|
|
"epoch": 1.0682209619827767,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004891499991870184,
|
|
"loss": 5.5766,
|
|
"mean_token_accuracy": 0.15168848782777786,
|
|
"num_tokens": 23463415.0,
|
|
"step": 12715
|
|
},
|
|
{
|
|
"entropy": 5.723210430145263,
|
|
"epoch": 1.0686410417979415,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048914081003898,
|
|
"loss": 5.4731,
|
|
"mean_token_accuracy": 0.15874896347522735,
|
|
"num_tokens": 23471515.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 5.743414497375488,
|
|
"epoch": 1.0690611216131065,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004891316170975475,
|
|
"loss": 5.5173,
|
|
"mean_token_accuracy": 0.15784869194030762,
|
|
"num_tokens": 23481696.0,
|
|
"step": 12725
|
|
},
|
|
{
|
|
"entropy": 5.7783526420593265,
|
|
"epoch": 1.0694812014282713,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004891224203628836,
|
|
"loss": 5.4774,
|
|
"mean_token_accuracy": 0.16449615508317947,
|
|
"num_tokens": 23490714.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 5.63666844367981,
|
|
"epoch": 1.0699012812434363,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004891132198351514,
|
|
"loss": 5.4621,
|
|
"mean_token_accuracy": 0.1659099578857422,
|
|
"num_tokens": 23500368.0,
|
|
"step": 12735
|
|
},
|
|
{
|
|
"entropy": 5.526670217514038,
|
|
"epoch": 1.070321361058601,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004891040155145137,
|
|
"loss": 5.4048,
|
|
"mean_token_accuracy": 0.17042581588029862,
|
|
"num_tokens": 23508857.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 5.627542209625244,
|
|
"epoch": 1.070741440873766,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004890948074011335,
|
|
"loss": 5.3897,
|
|
"mean_token_accuracy": 0.17012525349855423,
|
|
"num_tokens": 23518128.0,
|
|
"step": 12745
|
|
},
|
|
{
|
|
"entropy": 5.748180818557739,
|
|
"epoch": 1.071161520688931,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004890855954951741,
|
|
"loss": 5.4948,
|
|
"mean_token_accuracy": 0.16303456127643584,
|
|
"num_tokens": 23527292.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 5.744745492935181,
|
|
"epoch": 1.0715816005040957,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004890763797967987,
|
|
"loss": 5.4885,
|
|
"mean_token_accuracy": 0.16271119713783264,
|
|
"num_tokens": 23535694.0,
|
|
"step": 12755
|
|
},
|
|
{
|
|
"entropy": 5.706960821151734,
|
|
"epoch": 1.0720016803192607,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004890671603061704,
|
|
"loss": 5.4966,
|
|
"mean_token_accuracy": 0.15939076095819474,
|
|
"num_tokens": 23544766.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 5.706810760498047,
|
|
"epoch": 1.0724217601344255,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004890579370234526,
|
|
"loss": 5.4554,
|
|
"mean_token_accuracy": 0.1673600748181343,
|
|
"num_tokens": 23554037.0,
|
|
"step": 12765
|
|
},
|
|
{
|
|
"entropy": 5.774952697753906,
|
|
"epoch": 1.0728418399495905,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004890487099488086,
|
|
"loss": 5.5179,
|
|
"mean_token_accuracy": 0.15788703113794328,
|
|
"num_tokens": 23562282.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 5.792991018295288,
|
|
"epoch": 1.0732619197647553,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.000489039479082402,
|
|
"loss": 5.5865,
|
|
"mean_token_accuracy": 0.15591855943202973,
|
|
"num_tokens": 23571955.0,
|
|
"step": 12775
|
|
},
|
|
{
|
|
"entropy": 5.676628351211548,
|
|
"epoch": 1.0736819995799203,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004890302444243962,
|
|
"loss": 5.4755,
|
|
"mean_token_accuracy": 0.15936234593391418,
|
|
"num_tokens": 23580996.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 5.745807313919068,
|
|
"epoch": 1.074102079395085,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004890210059749549,
|
|
"loss": 5.5674,
|
|
"mean_token_accuracy": 0.1499895855784416,
|
|
"num_tokens": 23589618.0,
|
|
"step": 12785
|
|
},
|
|
{
|
|
"entropy": 5.733888244628906,
|
|
"epoch": 1.0745221592102498,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004890117637342416,
|
|
"loss": 5.4154,
|
|
"mean_token_accuracy": 0.1605689197778702,
|
|
"num_tokens": 23599574.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 5.7341142177581785,
|
|
"epoch": 1.0749422390254149,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004890025177024202,
|
|
"loss": 5.486,
|
|
"mean_token_accuracy": 0.15659692734479905,
|
|
"num_tokens": 23609205.0,
|
|
"step": 12795
|
|
},
|
|
{
|
|
"entropy": 5.678049373626709,
|
|
"epoch": 1.0753623188405796,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004889932678796543,
|
|
"loss": 5.5044,
|
|
"mean_token_accuracy": 0.15572902113199233,
|
|
"num_tokens": 23617554.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 5.7471010208129885,
|
|
"epoch": 1.0757823986557447,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004889840142661078,
|
|
"loss": 5.5599,
|
|
"mean_token_accuracy": 0.1572861537337303,
|
|
"num_tokens": 23626757.0,
|
|
"step": 12805
|
|
},
|
|
{
|
|
"entropy": 5.770623016357422,
|
|
"epoch": 1.0762024784709094,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004889747568619447,
|
|
"loss": 5.5106,
|
|
"mean_token_accuracy": 0.1615568682551384,
|
|
"num_tokens": 23636111.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 5.72378830909729,
|
|
"epoch": 1.0766225582860744,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004889654956673291,
|
|
"loss": 5.494,
|
|
"mean_token_accuracy": 0.16236085295677186,
|
|
"num_tokens": 23644579.0,
|
|
"step": 12815
|
|
},
|
|
{
|
|
"entropy": 5.700385427474975,
|
|
"epoch": 1.0770426381012392,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004889562306824248,
|
|
"loss": 5.4095,
|
|
"mean_token_accuracy": 0.1597435638308525,
|
|
"num_tokens": 23653263.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 5.591032648086548,
|
|
"epoch": 1.077462717916404,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000488946961907396,
|
|
"loss": 5.3843,
|
|
"mean_token_accuracy": 0.1746201902627945,
|
|
"num_tokens": 23662529.0,
|
|
"step": 12825
|
|
},
|
|
{
|
|
"entropy": 5.608241891860962,
|
|
"epoch": 1.077882797731569,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004889376893424071,
|
|
"loss": 5.421,
|
|
"mean_token_accuracy": 0.1713373154401779,
|
|
"num_tokens": 23671491.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 5.640907621383667,
|
|
"epoch": 1.0783028775467338,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004889284129876221,
|
|
"loss": 5.4005,
|
|
"mean_token_accuracy": 0.15988982617855071,
|
|
"num_tokens": 23680121.0,
|
|
"step": 12835
|
|
},
|
|
{
|
|
"entropy": 5.662772226333618,
|
|
"epoch": 1.0787229573618988,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004889191328432054,
|
|
"loss": 5.4614,
|
|
"mean_token_accuracy": 0.16260750889778136,
|
|
"num_tokens": 23689008.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 5.742505121231079,
|
|
"epoch": 1.0791430371770636,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004889098489093215,
|
|
"loss": 5.5053,
|
|
"mean_token_accuracy": 0.1597042962908745,
|
|
"num_tokens": 23698551.0,
|
|
"step": 12845
|
|
},
|
|
{
|
|
"entropy": 5.8218427181243895,
|
|
"epoch": 1.0795631169922286,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004889005611861347,
|
|
"loss": 5.6635,
|
|
"mean_token_accuracy": 0.15463445335626602,
|
|
"num_tokens": 23707438.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 5.734436941146851,
|
|
"epoch": 1.0799831968073934,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004888912696738096,
|
|
"loss": 5.5045,
|
|
"mean_token_accuracy": 0.16258185505867004,
|
|
"num_tokens": 23715822.0,
|
|
"step": 12855
|
|
},
|
|
{
|
|
"entropy": 5.743537902832031,
|
|
"epoch": 1.0804032766225582,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004888819743725108,
|
|
"loss": 5.5265,
|
|
"mean_token_accuracy": 0.1599157154560089,
|
|
"num_tokens": 23725426.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 5.762011289596558,
|
|
"epoch": 1.0808233564377232,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000488872675282403,
|
|
"loss": 5.5143,
|
|
"mean_token_accuracy": 0.16166198402643203,
|
|
"num_tokens": 23735092.0,
|
|
"step": 12865
|
|
},
|
|
{
|
|
"entropy": 5.754138803482055,
|
|
"epoch": 1.081243436252888,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004888633724036509,
|
|
"loss": 5.5018,
|
|
"mean_token_accuracy": 0.16174346208572388,
|
|
"num_tokens": 23744255.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 5.657329463958741,
|
|
"epoch": 1.081663516068053,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004888540657364192,
|
|
"loss": 5.3593,
|
|
"mean_token_accuracy": 0.1702010914683342,
|
|
"num_tokens": 23752978.0,
|
|
"step": 12875
|
|
},
|
|
{
|
|
"entropy": 5.6576941967010494,
|
|
"epoch": 1.0820835958832178,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004888447552808729,
|
|
"loss": 5.4421,
|
|
"mean_token_accuracy": 0.16415699273347856,
|
|
"num_tokens": 23761051.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 5.763893032073975,
|
|
"epoch": 1.0825036756983828,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004888354410371768,
|
|
"loss": 5.5546,
|
|
"mean_token_accuracy": 0.15805445313453675,
|
|
"num_tokens": 23770818.0,
|
|
"step": 12885
|
|
},
|
|
{
|
|
"entropy": 5.810835695266723,
|
|
"epoch": 1.0829237555135476,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.000488826123005496,
|
|
"loss": 5.5714,
|
|
"mean_token_accuracy": 0.16120226234197615,
|
|
"num_tokens": 23780597.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 5.69043231010437,
|
|
"epoch": 1.0833438353287124,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 0.0004888168011859957,
|
|
"loss": 5.4083,
|
|
"mean_token_accuracy": 0.16143542230129243,
|
|
"num_tokens": 23790119.0,
|
|
"step": 12895
|
|
},
|
|
{
|
|
"entropy": 5.687187528610229,
|
|
"epoch": 1.0837639151438774,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004888074755788407,
|
|
"loss": 5.4772,
|
|
"mean_token_accuracy": 0.16725920587778093,
|
|
"num_tokens": 23798972.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 5.722570514678955,
|
|
"epoch": 1.0841839949590422,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0004887981461841963,
|
|
"loss": 5.4527,
|
|
"mean_token_accuracy": 0.17206404507160186,
|
|
"num_tokens": 23808685.0,
|
|
"step": 12905
|
|
},
|
|
{
|
|
"entropy": 5.765744590759278,
|
|
"epoch": 1.0846040747742072,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004887888130022279,
|
|
"loss": 5.4663,
|
|
"mean_token_accuracy": 0.16214465647935866,
|
|
"num_tokens": 23817721.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 5.631449890136719,
|
|
"epoch": 1.085024154589372,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004887794760331008,
|
|
"loss": 5.4193,
|
|
"mean_token_accuracy": 0.16689784675836564,
|
|
"num_tokens": 23826892.0,
|
|
"step": 12915
|
|
},
|
|
{
|
|
"entropy": 5.683791780471802,
|
|
"epoch": 1.085444234404537,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004887701352769804,
|
|
"loss": 5.3724,
|
|
"mean_token_accuracy": 0.17175290137529373,
|
|
"num_tokens": 23835717.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 5.697872066497803,
|
|
"epoch": 1.0858643142197018,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000488760790734032,
|
|
"loss": 5.472,
|
|
"mean_token_accuracy": 0.16542189866304396,
|
|
"num_tokens": 23845814.0,
|
|
"step": 12925
|
|
},
|
|
{
|
|
"entropy": 5.738125276565552,
|
|
"epoch": 1.0862843940348665,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004887514424044214,
|
|
"loss": 5.4563,
|
|
"mean_token_accuracy": 0.153540675342083,
|
|
"num_tokens": 23854779.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 5.688271474838257,
|
|
"epoch": 1.0867044738500315,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.000488742090288314,
|
|
"loss": 5.5074,
|
|
"mean_token_accuracy": 0.16052113920450212,
|
|
"num_tokens": 23863533.0,
|
|
"step": 12935
|
|
},
|
|
{
|
|
"entropy": 5.7345654487609865,
|
|
"epoch": 1.0871245536651963,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004887327343858755,
|
|
"loss": 5.5325,
|
|
"mean_token_accuracy": 0.1583286091685295,
|
|
"num_tokens": 23872725.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 5.735647916793823,
|
|
"epoch": 1.0875446334803613,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004887233746972717,
|
|
"loss": 5.5094,
|
|
"mean_token_accuracy": 0.1608467683196068,
|
|
"num_tokens": 23881799.0,
|
|
"step": 12945
|
|
},
|
|
{
|
|
"entropy": 5.736598634719849,
|
|
"epoch": 1.0879647132955261,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004887140112226684,
|
|
"loss": 5.5438,
|
|
"mean_token_accuracy": 0.15989564061164857,
|
|
"num_tokens": 23890628.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 5.667224788665772,
|
|
"epoch": 1.088384793110691,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 0.0004887046439622314,
|
|
"loss": 5.5216,
|
|
"mean_token_accuracy": 0.16750244051218033,
|
|
"num_tokens": 23899968.0,
|
|
"step": 12955
|
|
},
|
|
{
|
|
"entropy": 5.747261238098145,
|
|
"epoch": 1.088804872925856,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004886952729161267,
|
|
"loss": 5.3932,
|
|
"mean_token_accuracy": 0.16512321233749389,
|
|
"num_tokens": 23908634.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 5.779636716842651,
|
|
"epoch": 1.0892249527410207,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 0.0004886858980845202,
|
|
"loss": 5.5616,
|
|
"mean_token_accuracy": 0.15966024100780488,
|
|
"num_tokens": 23917925.0,
|
|
"step": 12965
|
|
},
|
|
{
|
|
"entropy": 5.65394434928894,
|
|
"epoch": 1.0896450325561857,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0004886765194675782,
|
|
"loss": 5.4445,
|
|
"mean_token_accuracy": 0.1655475303530693,
|
|
"num_tokens": 23927173.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 5.667041397094726,
|
|
"epoch": 1.0900651123713505,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004886671370654665,
|
|
"loss": 5.4196,
|
|
"mean_token_accuracy": 0.1653660088777542,
|
|
"num_tokens": 23936258.0,
|
|
"step": 12975
|
|
},
|
|
{
|
|
"entropy": 5.676847219467163,
|
|
"epoch": 1.0904851921865155,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004886577508783516,
|
|
"loss": 5.3862,
|
|
"mean_token_accuracy": 0.16707207411527633,
|
|
"num_tokens": 23944215.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 5.730111455917358,
|
|
"epoch": 1.0909052720016803,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 0.0004886483609063997,
|
|
"loss": 5.4505,
|
|
"mean_token_accuracy": 0.16068692207336427,
|
|
"num_tokens": 23953151.0,
|
|
"step": 12985
|
|
},
|
|
{
|
|
"entropy": 5.592217302322387,
|
|
"epoch": 1.0913253518168453,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0004886389671497769,
|
|
"loss": 5.4724,
|
|
"mean_token_accuracy": 0.16959808766841888,
|
|
"num_tokens": 23962919.0,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"entropy": 5.735597896575928,
|
|
"epoch": 1.09174543163201,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.00048862956960865,
|
|
"loss": 5.4779,
|
|
"mean_token_accuracy": 0.15886924266815186,
|
|
"num_tokens": 23971900.0,
|
|
"step": 12995
|
|
},
|
|
{
|
|
"entropy": 5.7348557472229,
|
|
"epoch": 1.0921655114471749,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004886201682831852,
|
|
"loss": 5.4471,
|
|
"mean_token_accuracy": 0.16426561921834945,
|
|
"num_tokens": 23980945.0,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"entropy": 5.678046464920044,
|
|
"epoch": 1.09258559126234,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004886107631735491,
|
|
"loss": 5.4056,
|
|
"mean_token_accuracy": 0.16405817568302156,
|
|
"num_tokens": 23990460.0,
|
|
"step": 13005
|
|
},
|
|
{
|
|
"entropy": 5.714896297454834,
|
|
"epoch": 1.0930056710775047,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004886013542799083,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.15213673710823059,
|
|
"num_tokens": 23999925.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 5.658804130554199,
|
|
"epoch": 1.0934257508926697,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004885919416024296,
|
|
"loss": 5.4217,
|
|
"mean_token_accuracy": 0.1613025948405266,
|
|
"num_tokens": 24009039.0,
|
|
"step": 13015
|
|
},
|
|
{
|
|
"entropy": 5.759115076065063,
|
|
"epoch": 1.0938458307078345,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004885825251412796,
|
|
"loss": 5.4736,
|
|
"mean_token_accuracy": 0.16182312816381456,
|
|
"num_tokens": 24017725.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 5.735840749740601,
|
|
"epoch": 1.0942659105229993,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004885731048966252,
|
|
"loss": 5.503,
|
|
"mean_token_accuracy": 0.1575954094529152,
|
|
"num_tokens": 24027158.0,
|
|
"step": 13025
|
|
},
|
|
{
|
|
"entropy": 5.6926501274108885,
|
|
"epoch": 1.0946859903381643,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004885636808686331,
|
|
"loss": 5.5293,
|
|
"mean_token_accuracy": 0.16384944021701814,
|
|
"num_tokens": 24037224.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 5.738328456878662,
|
|
"epoch": 1.095106070153329,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004885542530574705,
|
|
"loss": 5.5052,
|
|
"mean_token_accuracy": 0.1625734105706215,
|
|
"num_tokens": 24046097.0,
|
|
"step": 13035
|
|
},
|
|
{
|
|
"entropy": 5.693251371383667,
|
|
"epoch": 1.095526149968494,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004885448214633042,
|
|
"loss": 5.4044,
|
|
"mean_token_accuracy": 0.1620977535843849,
|
|
"num_tokens": 24055270.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 5.73248519897461,
|
|
"epoch": 1.0959462297836589,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004885353860863013,
|
|
"loss": 5.5641,
|
|
"mean_token_accuracy": 0.15346422791481018,
|
|
"num_tokens": 24064995.0,
|
|
"step": 13045
|
|
},
|
|
{
|
|
"entropy": 5.779075717926025,
|
|
"epoch": 1.0963663095988239,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.000488525946926629,
|
|
"loss": 5.6107,
|
|
"mean_token_accuracy": 0.15505203902721404,
|
|
"num_tokens": 24075523.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 5.737312889099121,
|
|
"epoch": 1.0967863894139886,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004885165039844545,
|
|
"loss": 5.4789,
|
|
"mean_token_accuracy": 0.16420630365610123,
|
|
"num_tokens": 24084933.0,
|
|
"step": 13055
|
|
},
|
|
{
|
|
"entropy": 5.698319673538208,
|
|
"epoch": 1.0972064692291534,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004885070572599452,
|
|
"loss": 5.503,
|
|
"mean_token_accuracy": 0.15282038301229478,
|
|
"num_tokens": 24093964.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 5.724235200881958,
|
|
"epoch": 1.0976265490443184,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004884976067532681,
|
|
"loss": 5.452,
|
|
"mean_token_accuracy": 0.15377498120069505,
|
|
"num_tokens": 24103951.0,
|
|
"step": 13065
|
|
},
|
|
{
|
|
"entropy": 5.679253768920899,
|
|
"epoch": 1.0980466288594832,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000488488152464591,
|
|
"loss": 5.5711,
|
|
"mean_token_accuracy": 0.15378451496362686,
|
|
"num_tokens": 24113392.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 5.718753099441528,
|
|
"epoch": 1.0984667086746482,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004884786943940812,
|
|
"loss": 5.4403,
|
|
"mean_token_accuracy": 0.15815389901399612,
|
|
"num_tokens": 24123165.0,
|
|
"step": 13075
|
|
},
|
|
{
|
|
"entropy": 5.713952112197876,
|
|
"epoch": 1.098886788489813,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004884692325419063,
|
|
"loss": 5.479,
|
|
"mean_token_accuracy": 0.15968940854072572,
|
|
"num_tokens": 24132176.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 5.682787561416626,
|
|
"epoch": 1.099306868304978,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004884597669082336,
|
|
"loss": 5.5387,
|
|
"mean_token_accuracy": 0.15351806879043578,
|
|
"num_tokens": 24141737.0,
|
|
"step": 13085
|
|
},
|
|
{
|
|
"entropy": 5.712856578826904,
|
|
"epoch": 1.0997269481201428,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004884502974932313,
|
|
"loss": 5.4785,
|
|
"mean_token_accuracy": 0.16513199657201766,
|
|
"num_tokens": 24150477.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 5.806832218170166,
|
|
"epoch": 1.1001470279353076,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004884408242970668,
|
|
"loss": 5.5721,
|
|
"mean_token_accuracy": 0.15941140204668044,
|
|
"num_tokens": 24158739.0,
|
|
"step": 13095
|
|
},
|
|
{
|
|
"entropy": 5.651869440078736,
|
|
"epoch": 1.1005671077504726,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004884313473199081,
|
|
"loss": 5.4125,
|
|
"mean_token_accuracy": 0.16672947108745576,
|
|
"num_tokens": 24167511.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 5.65992579460144,
|
|
"epoch": 1.1009871875656374,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004884218665619229,
|
|
"loss": 5.4252,
|
|
"mean_token_accuracy": 0.1618572935461998,
|
|
"num_tokens": 24176413.0,
|
|
"step": 13105
|
|
},
|
|
{
|
|
"entropy": 5.691173410415649,
|
|
"epoch": 1.1014072673808024,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004884123820232792,
|
|
"loss": 5.3662,
|
|
"mean_token_accuracy": 0.17088967561721802,
|
|
"num_tokens": 24185135.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 5.688163566589355,
|
|
"epoch": 1.1018273471959672,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004884028937041451,
|
|
"loss": 5.4519,
|
|
"mean_token_accuracy": 0.16832585632801056,
|
|
"num_tokens": 24193273.0,
|
|
"step": 13115
|
|
},
|
|
{
|
|
"entropy": 5.6593669891357425,
|
|
"epoch": 1.1022474270111322,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004883934016046886,
|
|
"loss": 5.5176,
|
|
"mean_token_accuracy": 0.15427347868680955,
|
|
"num_tokens": 24202509.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 5.783310222625732,
|
|
"epoch": 1.102667506826297,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000488383905725078,
|
|
"loss": 5.5096,
|
|
"mean_token_accuracy": 0.15751553177833558,
|
|
"num_tokens": 24212644.0,
|
|
"step": 13125
|
|
},
|
|
{
|
|
"entropy": 5.7312768459320065,
|
|
"epoch": 1.1030875866414618,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004883744060654811,
|
|
"loss": 5.4135,
|
|
"mean_token_accuracy": 0.16013285517692566,
|
|
"num_tokens": 24221838.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 5.651692819595337,
|
|
"epoch": 1.1035076664566268,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004883649026260667,
|
|
"loss": 5.4813,
|
|
"mean_token_accuracy": 0.16545673757791518,
|
|
"num_tokens": 24230987.0,
|
|
"step": 13135
|
|
},
|
|
{
|
|
"entropy": 5.6557066440582275,
|
|
"epoch": 1.1039277462717916,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004883553954070028,
|
|
"loss": 5.4491,
|
|
"mean_token_accuracy": 0.16192631274461747,
|
|
"num_tokens": 24240523.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 5.724129486083984,
|
|
"epoch": 1.1043478260869566,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000488345884408458,
|
|
"loss": 5.5421,
|
|
"mean_token_accuracy": 0.1672690689563751,
|
|
"num_tokens": 24249799.0,
|
|
"step": 13145
|
|
},
|
|
{
|
|
"entropy": 5.723703002929687,
|
|
"epoch": 1.1047679059021214,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004883363696306007,
|
|
"loss": 5.4621,
|
|
"mean_token_accuracy": 0.1656269609928131,
|
|
"num_tokens": 24259361.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 5.718085622787475,
|
|
"epoch": 1.1051879857172864,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004883268510735995,
|
|
"loss": 5.4368,
|
|
"mean_token_accuracy": 0.15831930935382843,
|
|
"num_tokens": 24268010.0,
|
|
"step": 13155
|
|
},
|
|
{
|
|
"entropy": 5.5756614208221436,
|
|
"epoch": 1.1056080655324512,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004883173287376229,
|
|
"loss": 5.4839,
|
|
"mean_token_accuracy": 0.1586616076529026,
|
|
"num_tokens": 24277416.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 5.7934998035430905,
|
|
"epoch": 1.106028145347616,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004883078026228397,
|
|
"loss": 5.5608,
|
|
"mean_token_accuracy": 0.16097336113452912,
|
|
"num_tokens": 24286185.0,
|
|
"step": 13165
|
|
},
|
|
{
|
|
"entropy": 5.741655588150024,
|
|
"epoch": 1.106448225162781,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004882982727294187,
|
|
"loss": 5.428,
|
|
"mean_token_accuracy": 0.1603280246257782,
|
|
"num_tokens": 24295382.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 5.676276683807373,
|
|
"epoch": 1.1068683049779457,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 0.0004882887390575284,
|
|
"loss": 5.4468,
|
|
"mean_token_accuracy": 0.1647188439965248,
|
|
"num_tokens": 24305197.0,
|
|
"step": 13175
|
|
},
|
|
{
|
|
"entropy": 5.706903171539307,
|
|
"epoch": 1.1072883847931108,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004882792016073381,
|
|
"loss": 5.541,
|
|
"mean_token_accuracy": 0.15018792897462846,
|
|
"num_tokens": 24314149.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 5.755481195449829,
|
|
"epoch": 1.1077084646082755,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048826966037901655,
|
|
"loss": 5.4681,
|
|
"mean_token_accuracy": 0.1623881921172142,
|
|
"num_tokens": 24323737.0,
|
|
"step": 13185
|
|
},
|
|
{
|
|
"entropy": 5.675417709350586,
|
|
"epoch": 1.1081285444234406,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.00048826011537273276,
|
|
"loss": 5.4406,
|
|
"mean_token_accuracy": 0.1623774915933609,
|
|
"num_tokens": 24332853.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 5.705647706985474,
|
|
"epoch": 1.1085486242386053,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004882505665886558,
|
|
"loss": 5.5693,
|
|
"mean_token_accuracy": 0.15558527559041976,
|
|
"num_tokens": 24342632.0,
|
|
"step": 13195
|
|
},
|
|
{
|
|
"entropy": 5.6826183795928955,
|
|
"epoch": 1.1089687040537701,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048824101402695493,
|
|
"loss": 5.4113,
|
|
"mean_token_accuracy": 0.16228149831295013,
|
|
"num_tokens": 24351659.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 5.612444162368774,
|
|
"epoch": 1.1093887838689351,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004882314576877993,
|
|
"loss": 5.4479,
|
|
"mean_token_accuracy": 0.1650165230035782,
|
|
"num_tokens": 24360938.0,
|
|
"step": 13205
|
|
},
|
|
{
|
|
"entropy": 5.7091968059539795,
|
|
"epoch": 1.1098088636841,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004882218975713581,
|
|
"loss": 5.5041,
|
|
"mean_token_accuracy": 0.1613766685128212,
|
|
"num_tokens": 24369603.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 5.702196216583252,
|
|
"epoch": 1.110228943499265,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004882123336778009,
|
|
"loss": 5.4355,
|
|
"mean_token_accuracy": 0.16338066160678863,
|
|
"num_tokens": 24377605.0,
|
|
"step": 13215
|
|
},
|
|
{
|
|
"entropy": 5.725568962097168,
|
|
"epoch": 1.1106490233144297,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004882027660072969,
|
|
"loss": 5.5007,
|
|
"mean_token_accuracy": 0.15481040328741075,
|
|
"num_tokens": 24386930.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 5.700316143035889,
|
|
"epoch": 1.1110691031295947,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004881931945600157,
|
|
"loss": 5.4679,
|
|
"mean_token_accuracy": 0.16834752559661864,
|
|
"num_tokens": 24396473.0,
|
|
"step": 13225
|
|
},
|
|
{
|
|
"entropy": 5.72724027633667,
|
|
"epoch": 1.1114891829447595,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004881836193361269,
|
|
"loss": 5.5465,
|
|
"mean_token_accuracy": 0.1676660493016243,
|
|
"num_tokens": 24405461.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 5.7355544090271,
|
|
"epoch": 1.1119092627599243,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004881740403358,
|
|
"loss": 5.4901,
|
|
"mean_token_accuracy": 0.16505587697029114,
|
|
"num_tokens": 24414138.0,
|
|
"step": 13235
|
|
},
|
|
{
|
|
"entropy": 5.717983341217041,
|
|
"epoch": 1.1123293425750893,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048816445755920474,
|
|
"loss": 5.5038,
|
|
"mean_token_accuracy": 0.15973408818244933,
|
|
"num_tokens": 24423386.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 5.685654735565185,
|
|
"epoch": 1.112749422390254,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004881548710065109,
|
|
"loss": 5.4944,
|
|
"mean_token_accuracy": 0.15903386771678923,
|
|
"num_tokens": 24433637.0,
|
|
"step": 13245
|
|
},
|
|
{
|
|
"entropy": 5.740741491317749,
|
|
"epoch": 1.113169502205419,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004881452806778883,
|
|
"loss": 5.5311,
|
|
"mean_token_accuracy": 0.16349861323833464,
|
|
"num_tokens": 24443677.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 5.709890747070313,
|
|
"epoch": 1.113589582020584,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048813568657350676,
|
|
"loss": 5.4317,
|
|
"mean_token_accuracy": 0.16741538047790527,
|
|
"num_tokens": 24452317.0,
|
|
"step": 13255
|
|
},
|
|
{
|
|
"entropy": 5.704727077484131,
|
|
"epoch": 1.1140096618357487,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004881260886935363,
|
|
"loss": 5.449,
|
|
"mean_token_accuracy": 0.16238080710172653,
|
|
"num_tokens": 24460626.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 5.757587671279907,
|
|
"epoch": 1.1144297416509137,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048811648703814693,
|
|
"loss": 5.546,
|
|
"mean_token_accuracy": 0.1519101120531559,
|
|
"num_tokens": 24469583.0,
|
|
"step": 13265
|
|
},
|
|
{
|
|
"entropy": 5.741657829284668,
|
|
"epoch": 1.1148498214660785,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004881068816075087,
|
|
"loss": 5.4811,
|
|
"mean_token_accuracy": 0.15867555439472197,
|
|
"num_tokens": 24478811.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 5.706976461410522,
|
|
"epoch": 1.1152699012812435,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048809727240179193,
|
|
"loss": 5.5147,
|
|
"mean_token_accuracy": 0.1607096463441849,
|
|
"num_tokens": 24487818.0,
|
|
"step": 13275
|
|
},
|
|
{
|
|
"entropy": 5.676044464111328,
|
|
"epoch": 1.1156899810964083,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004880876594211665,
|
|
"loss": 5.4882,
|
|
"mean_token_accuracy": 0.15960678607225418,
|
|
"num_tokens": 24497087.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 5.743713235855102,
|
|
"epoch": 1.1161100609115733,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00048807804266580304,
|
|
"loss": 5.4398,
|
|
"mean_token_accuracy": 0.15841995030641556,
|
|
"num_tokens": 24505347.0,
|
|
"step": 13285
|
|
},
|
|
{
|
|
"entropy": 5.774560213088989,
|
|
"epoch": 1.116530140726738,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004880684221358717,
|
|
"loss": 5.4756,
|
|
"mean_token_accuracy": 0.16267163306474686,
|
|
"num_tokens": 24514732.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 5.7229407787322994,
|
|
"epoch": 1.116950220541903,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048805879783154305,
|
|
"loss": 5.5052,
|
|
"mean_token_accuracy": 0.16089607030153275,
|
|
"num_tokens": 24523295.0,
|
|
"step": 13295
|
|
},
|
|
{
|
|
"entropy": 5.651921367645263,
|
|
"epoch": 1.1173703003570679,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048804916975298744,
|
|
"loss": 5.4123,
|
|
"mean_token_accuracy": 0.16294008493423462,
|
|
"num_tokens": 24532415.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 5.739264678955078,
|
|
"epoch": 1.1177903801722326,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004880395379003755,
|
|
"loss": 5.5434,
|
|
"mean_token_accuracy": 0.15819203555583955,
|
|
"num_tokens": 24541856.0,
|
|
"step": 13305
|
|
},
|
|
{
|
|
"entropy": 5.685423040390015,
|
|
"epoch": 1.1182104599873977,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00048802990227387797,
|
|
"loss": 5.5277,
|
|
"mean_token_accuracy": 0.1538828618824482,
|
|
"num_tokens": 24550982.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 5.771675062179566,
|
|
"epoch": 1.1186305398025624,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048802026287366525,
|
|
"loss": 5.5966,
|
|
"mean_token_accuracy": 0.1531897470355034,
|
|
"num_tokens": 24561176.0,
|
|
"step": 13315
|
|
},
|
|
{
|
|
"entropy": 5.749803829193115,
|
|
"epoch": 1.1190506196177274,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.00048801061969990834,
|
|
"loss": 5.47,
|
|
"mean_token_accuracy": 0.16135310828685762,
|
|
"num_tokens": 24570741.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 5.661540126800537,
|
|
"epoch": 1.1194706994328922,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048800097275277795,
|
|
"loss": 5.4795,
|
|
"mean_token_accuracy": 0.16684099435806274,
|
|
"num_tokens": 24580175.0,
|
|
"step": 13325
|
|
},
|
|
{
|
|
"entropy": 5.715025186538696,
|
|
"epoch": 1.119890779248057,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000487991322032445,
|
|
"loss": 5.4763,
|
|
"mean_token_accuracy": 0.16523855775594712,
|
|
"num_tokens": 24588754.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 5.864963054656982,
|
|
"epoch": 1.120310859063222,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004879816675390805,
|
|
"loss": 5.6524,
|
|
"mean_token_accuracy": 0.15361952036619186,
|
|
"num_tokens": 24599429.0,
|
|
"step": 13335
|
|
},
|
|
{
|
|
"entropy": 5.661528491973877,
|
|
"epoch": 1.1207309388783868,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00048797200927285547,
|
|
"loss": 5.3917,
|
|
"mean_token_accuracy": 0.1662903368473053,
|
|
"num_tokens": 24608767.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 5.678159713745117,
|
|
"epoch": 1.1211510186935518,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004879623472339409,
|
|
"loss": 5.5641,
|
|
"mean_token_accuracy": 0.16006904989480972,
|
|
"num_tokens": 24618232.0,
|
|
"step": 13345
|
|
},
|
|
{
|
|
"entropy": 5.752752017974854,
|
|
"epoch": 1.1215710985087166,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.000487952681422508,
|
|
"loss": 5.4368,
|
|
"mean_token_accuracy": 0.16255403459072112,
|
|
"num_tokens": 24626986.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 5.588898992538452,
|
|
"epoch": 1.1219911783238816,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000487943011838728,
|
|
"loss": 5.3223,
|
|
"mean_token_accuracy": 0.16933453232049941,
|
|
"num_tokens": 24635283.0,
|
|
"step": 13355
|
|
},
|
|
{
|
|
"entropy": 5.555433702468872,
|
|
"epoch": 1.1224112581390464,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004879333384827722,
|
|
"loss": 5.4317,
|
|
"mean_token_accuracy": 0.1646237164735794,
|
|
"num_tokens": 24644451.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 5.796985626220703,
|
|
"epoch": 1.1228313379542114,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004879236613548119,
|
|
"loss": 5.5727,
|
|
"mean_token_accuracy": 0.15768791288137435,
|
|
"num_tokens": 24654811.0,
|
|
"step": 13365
|
|
},
|
|
{
|
|
"entropy": 5.751317977905273,
|
|
"epoch": 1.1232514177693762,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004879139804550187,
|
|
"loss": 5.4907,
|
|
"mean_token_accuracy": 0.15994445979595184,
|
|
"num_tokens": 24663712.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 5.733260011672973,
|
|
"epoch": 1.123671497584541,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048790429578356387,
|
|
"loss": 5.588,
|
|
"mean_token_accuracy": 0.15311638191342353,
|
|
"num_tokens": 24672518.0,
|
|
"step": 13375
|
|
},
|
|
{
|
|
"entropy": 5.719970655441284,
|
|
"epoch": 1.124091577399706,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00048789460734061915,
|
|
"loss": 5.5207,
|
|
"mean_token_accuracy": 0.160324390232563,
|
|
"num_tokens": 24681900.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 5.720213317871094,
|
|
"epoch": 1.1245116572148708,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 0.0004878849151263561,
|
|
"loss": 5.4909,
|
|
"mean_token_accuracy": 0.16072850972414016,
|
|
"num_tokens": 24691760.0,
|
|
"step": 13385
|
|
},
|
|
{
|
|
"entropy": 5.71978440284729,
|
|
"epoch": 1.1249317370300358,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004878752191409463,
|
|
"loss": 5.4247,
|
|
"mean_token_accuracy": 0.16750899255275725,
|
|
"num_tokens": 24700742.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 5.660094261169434,
|
|
"epoch": 1.1253518168452006,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004878655193845616,
|
|
"loss": 5.5156,
|
|
"mean_token_accuracy": 0.15948394387960435,
|
|
"num_tokens": 24709329.0,
|
|
"step": 13395
|
|
},
|
|
{
|
|
"entropy": 5.693703031539917,
|
|
"epoch": 1.1257718966603654,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.00048785581585737394,
|
|
"loss": 5.6359,
|
|
"mean_token_accuracy": 0.15693159401416779,
|
|
"num_tokens": 24718475.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 5.770649480819702,
|
|
"epoch": 1.1261919764755304,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.000487846108559555,
|
|
"loss": 5.5083,
|
|
"mean_token_accuracy": 0.16890775114297868,
|
|
"num_tokens": 24727817.0,
|
|
"step": 13405
|
|
},
|
|
{
|
|
"entropy": 5.703707599639893,
|
|
"epoch": 1.1266120562906952,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00048783639749127694,
|
|
"loss": 5.4892,
|
|
"mean_token_accuracy": 0.16033429354429246,
|
|
"num_tokens": 24737057.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 5.686640310287475,
|
|
"epoch": 1.1270321361058602,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004878266826527116,
|
|
"loss": 5.5297,
|
|
"mean_token_accuracy": 0.15543637573719024,
|
|
"num_tokens": 24746016.0,
|
|
"step": 13415
|
|
},
|
|
{
|
|
"entropy": 5.779524898529052,
|
|
"epoch": 1.127452215921025,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00048781696404403126,
|
|
"loss": 5.527,
|
|
"mean_token_accuracy": 0.163545098900795,
|
|
"num_tokens": 24755978.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 5.694488048553467,
|
|
"epoch": 1.12787229573619,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048780724166540794,
|
|
"loss": 5.423,
|
|
"mean_token_accuracy": 0.1599399358034134,
|
|
"num_tokens": 24765255.0,
|
|
"step": 13425
|
|
},
|
|
{
|
|
"entropy": 5.662991142272949,
|
|
"epoch": 1.1282923755513548,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004877975155170139,
|
|
"loss": 5.4922,
|
|
"mean_token_accuracy": 0.15767267495393752,
|
|
"num_tokens": 24774339.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 5.680190658569336,
|
|
"epoch": 1.1287124553665198,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004877877855990215,
|
|
"loss": 5.4979,
|
|
"mean_token_accuracy": 0.1563847467303276,
|
|
"num_tokens": 24783236.0,
|
|
"step": 13435
|
|
},
|
|
{
|
|
"entropy": 5.642968368530274,
|
|
"epoch": 1.1291325351816845,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000487778051911603,
|
|
"loss": 5.4033,
|
|
"mean_token_accuracy": 0.1693968042731285,
|
|
"num_tokens": 24792168.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 5.761270141601562,
|
|
"epoch": 1.1295526149968493,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004877683144549308,
|
|
"loss": 5.5611,
|
|
"mean_token_accuracy": 0.16145953834056853,
|
|
"num_tokens": 24800843.0,
|
|
"step": 13445
|
|
},
|
|
{
|
|
"entropy": 5.7103941440582275,
|
|
"epoch": 1.1299726948120143,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048775857322917753,
|
|
"loss": 5.4436,
|
|
"mean_token_accuracy": 0.15832821130752564,
|
|
"num_tokens": 24810475.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 5.657360696792603,
|
|
"epoch": 1.1303927746271791,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004877488282345158,
|
|
"loss": 5.5202,
|
|
"mean_token_accuracy": 0.16295383870601654,
|
|
"num_tokens": 24820486.0,
|
|
"step": 13455
|
|
},
|
|
{
|
|
"entropy": 5.752575635910034,
|
|
"epoch": 1.1308128544423441,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000487739079471118,
|
|
"loss": 5.5749,
|
|
"mean_token_accuracy": 0.16365474909543992,
|
|
"num_tokens": 24830243.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 5.7682483196258545,
|
|
"epoch": 1.131232934257509,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.000487729326939157,
|
|
"loss": 5.4805,
|
|
"mean_token_accuracy": 0.16092797219753266,
|
|
"num_tokens": 24839090.0,
|
|
"step": 13465
|
|
},
|
|
{
|
|
"entropy": 5.685783910751343,
|
|
"epoch": 1.1316530140726737,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00048771957063880553,
|
|
"loss": 5.4632,
|
|
"mean_token_accuracy": 0.1614797055721283,
|
|
"num_tokens": 24847933.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 5.772010850906372,
|
|
"epoch": 1.1320730938878387,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004877098105702363,
|
|
"loss": 5.4886,
|
|
"mean_token_accuracy": 0.163765586912632,
|
|
"num_tokens": 24857037.0,
|
|
"step": 13475
|
|
},
|
|
{
|
|
"entropy": 5.617125749588013,
|
|
"epoch": 1.1324931737030035,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048770004673362243,
|
|
"loss": 5.3125,
|
|
"mean_token_accuracy": 0.1722439780831337,
|
|
"num_tokens": 24866042.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 5.561356925964356,
|
|
"epoch": 1.1329132535181685,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00048769027912913673,
|
|
"loss": 5.2843,
|
|
"mean_token_accuracy": 0.1734999194741249,
|
|
"num_tokens": 24873735.0,
|
|
"step": 13485
|
|
},
|
|
{
|
|
"entropy": 5.528507661819458,
|
|
"epoch": 1.1333333333333333,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004876805077569522,
|
|
"loss": 5.353,
|
|
"mean_token_accuracy": 0.16299628913402558,
|
|
"num_tokens": 24882277.0,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"entropy": 5.604131412506104,
|
|
"epoch": 1.133753413148498,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048767073261724204,
|
|
"loss": 5.4774,
|
|
"mean_token_accuracy": 0.16074343770742416,
|
|
"num_tokens": 24891354.0,
|
|
"step": 13495
|
|
},
|
|
{
|
|
"entropy": 5.686602067947388,
|
|
"epoch": 1.134173492963663,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004876609537101793,
|
|
"loss": 5.4689,
|
|
"mean_token_accuracy": 0.1579518973827362,
|
|
"num_tokens": 24899887.0,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"entropy": 5.832871198654175,
|
|
"epoch": 1.1345935727788279,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004876511710359374,
|
|
"loss": 5.4771,
|
|
"mean_token_accuracy": 0.16068532615900039,
|
|
"num_tokens": 24908616.0,
|
|
"step": 13505
|
|
},
|
|
{
|
|
"entropy": 5.792671775817871,
|
|
"epoch": 1.135013652593993,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00048764138459468935,
|
|
"loss": 5.5377,
|
|
"mean_token_accuracy": 0.16124322265386581,
|
|
"num_tokens": 24917864.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 5.755936479568481,
|
|
"epoch": 1.1354337324091577,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00048763159438660876,
|
|
"loss": 5.551,
|
|
"mean_token_accuracy": 0.1572817325592041,
|
|
"num_tokens": 24927864.0,
|
|
"step": 13515
|
|
},
|
|
{
|
|
"entropy": 5.621814107894897,
|
|
"epoch": 1.1358538122243227,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00048762180041186893,
|
|
"loss": 5.4411,
|
|
"mean_token_accuracy": 0.16689430475234984,
|
|
"num_tokens": 24937146.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 5.737927103042603,
|
|
"epoch": 1.1362738920394875,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004876120026706434,
|
|
"loss": 5.5174,
|
|
"mean_token_accuracy": 0.16046024858951569,
|
|
"num_tokens": 24945694.0,
|
|
"step": 13525
|
|
},
|
|
{
|
|
"entropy": 5.7014954566955565,
|
|
"epoch": 1.1366939718546525,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004876022011631057,
|
|
"loss": 5.4271,
|
|
"mean_token_accuracy": 0.165780770778656,
|
|
"num_tokens": 24955325.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 5.640952110290527,
|
|
"epoch": 1.1371140516698173,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004875923958894295,
|
|
"loss": 5.2981,
|
|
"mean_token_accuracy": 0.1672575891017914,
|
|
"num_tokens": 24964028.0,
|
|
"step": 13535
|
|
},
|
|
{
|
|
"entropy": 5.672315645217895,
|
|
"epoch": 1.137534131484982,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00048758258684978846,
|
|
"loss": 5.498,
|
|
"mean_token_accuracy": 0.1611057698726654,
|
|
"num_tokens": 24972923.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 5.699390411376953,
|
|
"epoch": 1.137954211300147,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00048757277404435636,
|
|
"loss": 5.3845,
|
|
"mean_token_accuracy": 0.16409458816051484,
|
|
"num_tokens": 24982156.0,
|
|
"step": 13545
|
|
},
|
|
{
|
|
"entropy": 5.678975343704224,
|
|
"epoch": 1.1383742911153119,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000487562957473307,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.16643529236316681,
|
|
"num_tokens": 24991616.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 5.650065231323242,
|
|
"epoch": 1.1387943709304769,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004875531371368144,
|
|
"loss": 5.5046,
|
|
"mean_token_accuracy": 0.1579531379044056,
|
|
"num_tokens": 25001140.0,
|
|
"step": 13555
|
|
},
|
|
{
|
|
"entropy": 5.72753210067749,
|
|
"epoch": 1.1392144507456416,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.00048754331303505236,
|
|
"loss": 5.4148,
|
|
"mean_token_accuracy": 0.16427789330482484,
|
|
"num_tokens": 25010863.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 5.72626485824585,
|
|
"epoch": 1.1396345305608064,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048753348516819496,
|
|
"loss": 5.5148,
|
|
"mean_token_accuracy": 0.15984421372413635,
|
|
"num_tokens": 25019770.0,
|
|
"step": 13565
|
|
},
|
|
{
|
|
"entropy": 5.761800861358642,
|
|
"epoch": 1.1400546103759714,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004875236535364163,
|
|
"loss": 5.5556,
|
|
"mean_token_accuracy": 0.15370625630021095,
|
|
"num_tokens": 25029900.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 5.775524997711182,
|
|
"epoch": 1.1404746901911362,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0004875138181398906,
|
|
"loss": 5.516,
|
|
"mean_token_accuracy": 0.16178728863596917,
|
|
"num_tokens": 25039428.0,
|
|
"step": 13575
|
|
},
|
|
{
|
|
"entropy": 5.739251804351807,
|
|
"epoch": 1.1408947700063012,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.000487503978978792,
|
|
"loss": 5.5084,
|
|
"mean_token_accuracy": 0.1567676842212677,
|
|
"num_tokens": 25049145.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 5.7199629783630375,
|
|
"epoch": 1.141314849821466,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048749413605329487,
|
|
"loss": 5.5387,
|
|
"mean_token_accuracy": 0.15968952625989913,
|
|
"num_tokens": 25058772.0,
|
|
"step": 13585
|
|
},
|
|
{
|
|
"entropy": 5.715544176101685,
|
|
"epoch": 1.141734929636631,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00048748428936357346,
|
|
"loss": 5.4386,
|
|
"mean_token_accuracy": 0.1636001095175743,
|
|
"num_tokens": 25067249.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 5.647507381439209,
|
|
"epoch": 1.1421550094517958,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004874744389098024,
|
|
"loss": 5.405,
|
|
"mean_token_accuracy": 0.1577477991580963,
|
|
"num_tokens": 25076893.0,
|
|
"step": 13595
|
|
},
|
|
{
|
|
"entropy": 5.634746408462524,
|
|
"epoch": 1.1425750892669608,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004874645846921559,
|
|
"loss": 5.4148,
|
|
"mean_token_accuracy": 0.16532657518982888,
|
|
"num_tokens": 25086238.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 5.669492626190186,
|
|
"epoch": 1.1429951690821256,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00048745472671080884,
|
|
"loss": 5.4414,
|
|
"mean_token_accuracy": 0.1582840844988823,
|
|
"num_tokens": 25095334.0,
|
|
"step": 13605
|
|
},
|
|
{
|
|
"entropy": 5.686340093612671,
|
|
"epoch": 1.1434152488972904,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00048744486496593565,
|
|
"loss": 5.4259,
|
|
"mean_token_accuracy": 0.1654140532016754,
|
|
"num_tokens": 25104136.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 5.6616381168365475,
|
|
"epoch": 1.1438353287124554,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000487434999457711,
|
|
"loss": 5.4213,
|
|
"mean_token_accuracy": 0.17221303135156632,
|
|
"num_tokens": 25112629.0,
|
|
"step": 13615
|
|
},
|
|
{
|
|
"entropy": 5.7070547580719,
|
|
"epoch": 1.1442554085276202,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004874251301863098,
|
|
"loss": 5.4605,
|
|
"mean_token_accuracy": 0.1610724672675133,
|
|
"num_tokens": 25121014.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 5.658392524719238,
|
|
"epoch": 1.1446754883427852,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 0.00048741525715190675,
|
|
"loss": 5.4949,
|
|
"mean_token_accuracy": 0.1595884680747986,
|
|
"num_tokens": 25130097.0,
|
|
"step": 13625
|
|
},
|
|
{
|
|
"entropy": 5.7246985912323,
|
|
"epoch": 1.14509556815795,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004874053803546769,
|
|
"loss": 5.5002,
|
|
"mean_token_accuracy": 0.16293734163045884,
|
|
"num_tokens": 25139065.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 5.713643646240234,
|
|
"epoch": 1.1455156479731148,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.000487395499794795,
|
|
"loss": 5.4796,
|
|
"mean_token_accuracy": 0.1665970802307129,
|
|
"num_tokens": 25148852.0,
|
|
"step": 13635
|
|
},
|
|
{
|
|
"entropy": 5.620668411254883,
|
|
"epoch": 1.1459357277882798,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004873856154724362,
|
|
"loss": 5.3741,
|
|
"mean_token_accuracy": 0.17443220168352128,
|
|
"num_tokens": 25157580.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 5.670327091217041,
|
|
"epoch": 1.1463558076034446,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004873757273877756,
|
|
"loss": 5.4831,
|
|
"mean_token_accuracy": 0.1579154871404171,
|
|
"num_tokens": 25166243.0,
|
|
"step": 13645
|
|
},
|
|
{
|
|
"entropy": 5.720182752609253,
|
|
"epoch": 1.1467758874186096,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048736583554098836,
|
|
"loss": 5.49,
|
|
"mean_token_accuracy": 0.16273559033870696,
|
|
"num_tokens": 25174674.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 5.649949932098389,
|
|
"epoch": 1.1471959672337744,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00048735593993224973,
|
|
"loss": 5.4028,
|
|
"mean_token_accuracy": 0.1695830523967743,
|
|
"num_tokens": 25183892.0,
|
|
"step": 13655
|
|
},
|
|
{
|
|
"entropy": 5.677073192596436,
|
|
"epoch": 1.1476160470489394,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048734604056173495,
|
|
"loss": 5.4464,
|
|
"mean_token_accuracy": 0.1643107756972313,
|
|
"num_tokens": 25192731.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 5.715389537811279,
|
|
"epoch": 1.1480361268641042,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00048733613742961933,
|
|
"loss": 5.5484,
|
|
"mean_token_accuracy": 0.16420064717531205,
|
|
"num_tokens": 25201280.0,
|
|
"step": 13665
|
|
},
|
|
{
|
|
"entropy": 5.67736177444458,
|
|
"epoch": 1.1484562066792692,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00048732623053607846,
|
|
"loss": 5.4255,
|
|
"mean_token_accuracy": 0.1610700160264969,
|
|
"num_tokens": 25209929.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 5.672457456588745,
|
|
"epoch": 1.148876286494434,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004873163198812877,
|
|
"loss": 5.3544,
|
|
"mean_token_accuracy": 0.16796983331441878,
|
|
"num_tokens": 25218583.0,
|
|
"step": 13675
|
|
},
|
|
{
|
|
"entropy": 5.768982076644898,
|
|
"epoch": 1.1492963663095987,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004873064054654227,
|
|
"loss": 5.5805,
|
|
"mean_token_accuracy": 0.15605029240250587,
|
|
"num_tokens": 25228949.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 5.741779899597168,
|
|
"epoch": 1.1497164461247638,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00048729648728865904,
|
|
"loss": 5.4092,
|
|
"mean_token_accuracy": 0.17617493420839309,
|
|
"num_tokens": 25238603.0,
|
|
"step": 13685
|
|
},
|
|
{
|
|
"entropy": 5.68451452255249,
|
|
"epoch": 1.1501365259399285,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048728656535117237,
|
|
"loss": 5.5239,
|
|
"mean_token_accuracy": 0.15241808593273162,
|
|
"num_tokens": 25248265.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 5.671699285507202,
|
|
"epoch": 1.1505566057550936,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004872766396531386,
|
|
"loss": 5.5062,
|
|
"mean_token_accuracy": 0.16589785665273665,
|
|
"num_tokens": 25258195.0,
|
|
"step": 13695
|
|
},
|
|
{
|
|
"entropy": 5.758512020111084,
|
|
"epoch": 1.1509766855702583,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00048726671019473335,
|
|
"loss": 5.4622,
|
|
"mean_token_accuracy": 0.16697300374507903,
|
|
"num_tokens": 25267886.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 5.720304870605469,
|
|
"epoch": 1.1513967653854231,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048725677697613267,
|
|
"loss": 5.5039,
|
|
"mean_token_accuracy": 0.16215680837631224,
|
|
"num_tokens": 25277304.0,
|
|
"step": 13705
|
|
},
|
|
{
|
|
"entropy": 5.701442766189575,
|
|
"epoch": 1.1518168452005881,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004872468399975125,
|
|
"loss": 5.5047,
|
|
"mean_token_accuracy": 0.15424503684043883,
|
|
"num_tokens": 25286771.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 5.780902290344239,
|
|
"epoch": 1.152236925015753,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048723689925904884,
|
|
"loss": 5.5656,
|
|
"mean_token_accuracy": 0.15876710936427116,
|
|
"num_tokens": 25296018.0,
|
|
"step": 13715
|
|
},
|
|
{
|
|
"entropy": 5.713040781021118,
|
|
"epoch": 1.152657004830918,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004872269547609179,
|
|
"loss": 5.5103,
|
|
"mean_token_accuracy": 0.1646754786372185,
|
|
"num_tokens": 25305737.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 5.65469765663147,
|
|
"epoch": 1.1530770846460827,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004872170065032956,
|
|
"loss": 5.3432,
|
|
"mean_token_accuracy": 0.1650144189596176,
|
|
"num_tokens": 25314625.0,
|
|
"step": 13725
|
|
},
|
|
{
|
|
"entropy": 5.688196754455566,
|
|
"epoch": 1.1534971644612477,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004872070544863584,
|
|
"loss": 5.4849,
|
|
"mean_token_accuracy": 0.15882542431354524,
|
|
"num_tokens": 25323453.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 5.685961675643921,
|
|
"epoch": 1.1539172442764125,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004871970987102824,
|
|
"loss": 5.4906,
|
|
"mean_token_accuracy": 0.166608627140522,
|
|
"num_tokens": 25333236.0,
|
|
"step": 13735
|
|
},
|
|
{
|
|
"entropy": 5.751754331588745,
|
|
"epoch": 1.1543373240915775,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004871871391752442,
|
|
"loss": 5.3968,
|
|
"mean_token_accuracy": 0.16037501096725465,
|
|
"num_tokens": 25341993.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 5.743503475189209,
|
|
"epoch": 1.1547574039067423,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00048717717588141993,
|
|
"loss": 5.4382,
|
|
"mean_token_accuracy": 0.16419214904308319,
|
|
"num_tokens": 25350695.0,
|
|
"step": 13745
|
|
},
|
|
{
|
|
"entropy": 5.695055913925171,
|
|
"epoch": 1.155177483721907,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004871672088289863,
|
|
"loss": 5.4726,
|
|
"mean_token_accuracy": 0.1616984099149704,
|
|
"num_tokens": 25359044.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 5.670234966278076,
|
|
"epoch": 1.155597563537072,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048715723801811986,
|
|
"loss": 5.4911,
|
|
"mean_token_accuracy": 0.16160673201084136,
|
|
"num_tokens": 25367959.0,
|
|
"step": 13755
|
|
},
|
|
{
|
|
"entropy": 5.719758939743042,
|
|
"epoch": 1.156017643352237,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048714726344899716,
|
|
"loss": 5.51,
|
|
"mean_token_accuracy": 0.16625330299139024,
|
|
"num_tokens": 25376968.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 5.6630126953125,
|
|
"epoch": 1.156437723167402,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004871372851217949,
|
|
"loss": 5.3763,
|
|
"mean_token_accuracy": 0.16886330991983414,
|
|
"num_tokens": 25385381.0,
|
|
"step": 13765
|
|
},
|
|
{
|
|
"entropy": 5.691815996170044,
|
|
"epoch": 1.1568578029825667,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004871273030366899,
|
|
"loss": 5.4938,
|
|
"mean_token_accuracy": 0.15851637423038484,
|
|
"num_tokens": 25394647.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 5.671438503265381,
|
|
"epoch": 1.1572778827977315,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004871173171938589,
|
|
"loss": 5.4387,
|
|
"mean_token_accuracy": 0.1735491305589676,
|
|
"num_tokens": 25403973.0,
|
|
"step": 13775
|
|
},
|
|
{
|
|
"entropy": 5.642987537384033,
|
|
"epoch": 1.1576979626128965,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004871073275934789,
|
|
"loss": 5.4258,
|
|
"mean_token_accuracy": 0.1666042447090149,
|
|
"num_tokens": 25412319.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 5.623088264465332,
|
|
"epoch": 1.1581180424280613,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048709733423572685,
|
|
"loss": 5.4618,
|
|
"mean_token_accuracy": 0.16146773099899292,
|
|
"num_tokens": 25420558.0,
|
|
"step": 13785
|
|
},
|
|
{
|
|
"entropy": 5.629000854492188,
|
|
"epoch": 1.1585381222432263,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048708733712077973,
|
|
"loss": 5.4071,
|
|
"mean_token_accuracy": 0.16649366915225983,
|
|
"num_tokens": 25429258.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 5.719772052764893,
|
|
"epoch": 1.158958202058391,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004870773362488146,
|
|
"loss": 5.3748,
|
|
"mean_token_accuracy": 0.1697326421737671,
|
|
"num_tokens": 25438005.0,
|
|
"step": 13795
|
|
},
|
|
{
|
|
"entropy": 5.681618309020996,
|
|
"epoch": 1.159378281873556,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004870673316200087,
|
|
"loss": 5.4003,
|
|
"mean_token_accuracy": 0.16728533059358597,
|
|
"num_tokens": 25447120.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 5.646628332138062,
|
|
"epoch": 1.1597983616887209,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004870573232345392,
|
|
"loss": 5.3916,
|
|
"mean_token_accuracy": 0.16811733990907668,
|
|
"num_tokens": 25456216.0,
|
|
"step": 13805
|
|
},
|
|
{
|
|
"entropy": 5.839818906784058,
|
|
"epoch": 1.1602184415038856,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004870473110925834,
|
|
"loss": 5.6768,
|
|
"mean_token_accuracy": 0.15327301174402236,
|
|
"num_tokens": 25466456.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 5.657715559005737,
|
|
"epoch": 1.1606385213190507,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004870372951943187,
|
|
"loss": 5.3212,
|
|
"mean_token_accuracy": 0.1731086015701294,
|
|
"num_tokens": 25475217.0,
|
|
"step": 13815
|
|
},
|
|
{
|
|
"entropy": 5.764273929595947,
|
|
"epoch": 1.1610586011342154,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048702727553992243,
|
|
"loss": 5.6252,
|
|
"mean_token_accuracy": 0.15146582424640656,
|
|
"num_tokens": 25484617.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 5.661474609375,
|
|
"epoch": 1.1614786809493804,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.00048701725212957223,
|
|
"loss": 5.4056,
|
|
"mean_token_accuracy": 0.17106336653232573,
|
|
"num_tokens": 25493936.0,
|
|
"step": 13825
|
|
},
|
|
{
|
|
"entropy": 5.615126895904541,
|
|
"epoch": 1.1618987607645452,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004870072249634455,
|
|
"loss": 5.3846,
|
|
"mean_token_accuracy": 0.16981288492679597,
|
|
"num_tokens": 25502306.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 5.586185503005981,
|
|
"epoch": 1.1623188405797102,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048699719404172006,
|
|
"loss": 5.4546,
|
|
"mean_token_accuracy": 0.1651104733347893,
|
|
"num_tokens": 25511247.0,
|
|
"step": 13835
|
|
},
|
|
{
|
|
"entropy": 5.713759469985962,
|
|
"epoch": 1.162738920394875,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048698715936457344,
|
|
"loss": 5.5012,
|
|
"mean_token_accuracy": 0.15939352810382842,
|
|
"num_tokens": 25520482.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 5.726053237915039,
|
|
"epoch": 1.1631590002100398,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048697712093218336,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.1690056636929512,
|
|
"num_tokens": 25529854.0,
|
|
"step": 13845
|
|
},
|
|
{
|
|
"entropy": 5.624899101257324,
|
|
"epoch": 1.1635790800252048,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004869670787447279,
|
|
"loss": 5.3395,
|
|
"mean_token_accuracy": 0.16676997542381286,
|
|
"num_tokens": 25538251.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 5.618051338195801,
|
|
"epoch": 1.1639991598403696,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004869570328023846,
|
|
"loss": 5.4133,
|
|
"mean_token_accuracy": 0.16560969799757003,
|
|
"num_tokens": 25546889.0,
|
|
"step": 13855
|
|
},
|
|
{
|
|
"entropy": 5.655919551849365,
|
|
"epoch": 1.1644192396555346,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00048694698310533177,
|
|
"loss": 5.48,
|
|
"mean_token_accuracy": 0.16459716558456422,
|
|
"num_tokens": 25557040.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 5.7215770244598385,
|
|
"epoch": 1.1648393194706994,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004869369296537472,
|
|
"loss": 5.6387,
|
|
"mean_token_accuracy": 0.1548250749707222,
|
|
"num_tokens": 25565798.0,
|
|
"step": 13865
|
|
},
|
|
{
|
|
"entropy": 5.826737976074218,
|
|
"epoch": 1.1652593992858642,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004869268724478091,
|
|
"loss": 5.4626,
|
|
"mean_token_accuracy": 0.1656502142548561,
|
|
"num_tokens": 25575039.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 5.779808759689331,
|
|
"epoch": 1.1656794791010292,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048691681148769545,
|
|
"loss": 5.4698,
|
|
"mean_token_accuracy": 0.16209751814603807,
|
|
"num_tokens": 25584635.0,
|
|
"step": 13875
|
|
},
|
|
{
|
|
"entropy": 5.579784250259399,
|
|
"epoch": 1.166099558916194,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004869067467735847,
|
|
"loss": 5.4154,
|
|
"mean_token_accuracy": 0.16798000484704972,
|
|
"num_tokens": 25593736.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 5.613956546783447,
|
|
"epoch": 1.166519638731359,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0004868966783056551,
|
|
"loss": 5.3718,
|
|
"mean_token_accuracy": 0.17804049104452133,
|
|
"num_tokens": 25602685.0,
|
|
"step": 13885
|
|
},
|
|
{
|
|
"entropy": 5.671496915817261,
|
|
"epoch": 1.1669397185465238,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048688660608408484,
|
|
"loss": 5.4521,
|
|
"mean_token_accuracy": 0.16123623102903367,
|
|
"num_tokens": 25610690.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 5.615883159637451,
|
|
"epoch": 1.1673597983616888,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048687653010905254,
|
|
"loss": 5.3419,
|
|
"mean_token_accuracy": 0.16897291988134383,
|
|
"num_tokens": 25619805.0,
|
|
"step": 13895
|
|
},
|
|
{
|
|
"entropy": 5.767966794967651,
|
|
"epoch": 1.1677798781768536,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048686645038073664,
|
|
"loss": 5.5659,
|
|
"mean_token_accuracy": 0.15139710083603858,
|
|
"num_tokens": 25629447.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 5.700986623764038,
|
|
"epoch": 1.1681999579920186,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048685636689931554,
|
|
"loss": 5.4057,
|
|
"mean_token_accuracy": 0.16528156250715256,
|
|
"num_tokens": 25638619.0,
|
|
"step": 13905
|
|
},
|
|
{
|
|
"entropy": 5.720313978195191,
|
|
"epoch": 1.1686200378071834,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048684627966496803,
|
|
"loss": 5.4855,
|
|
"mean_token_accuracy": 0.16764382421970367,
|
|
"num_tokens": 25648255.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 5.695196580886841,
|
|
"epoch": 1.1690401176223482,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048683618867787284,
|
|
"loss": 5.494,
|
|
"mean_token_accuracy": 0.15946254581212999,
|
|
"num_tokens": 25657881.0,
|
|
"step": 13915
|
|
},
|
|
{
|
|
"entropy": 5.7503081321716305,
|
|
"epoch": 1.1694601974375132,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004868260939382086,
|
|
"loss": 5.545,
|
|
"mean_token_accuracy": 0.16139545887708664,
|
|
"num_tokens": 25666773.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 5.727688646316528,
|
|
"epoch": 1.169880277252678,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004868159954461542,
|
|
"loss": 5.4278,
|
|
"mean_token_accuracy": 0.16332051604986192,
|
|
"num_tokens": 25675152.0,
|
|
"step": 13925
|
|
},
|
|
{
|
|
"entropy": 5.8233521461486815,
|
|
"epoch": 1.170300357067843,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00048680589320188847,
|
|
"loss": 5.563,
|
|
"mean_token_accuracy": 0.15545087233185767,
|
|
"num_tokens": 25684962.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 5.67341160774231,
|
|
"epoch": 1.1707204368830078,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004867957872055904,
|
|
"loss": 5.4358,
|
|
"mean_token_accuracy": 0.166546930372715,
|
|
"num_tokens": 25693782.0,
|
|
"step": 13935
|
|
},
|
|
{
|
|
"entropy": 5.649929618835449,
|
|
"epoch": 1.1711405166981725,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00048678567745743905,
|
|
"loss": 5.4121,
|
|
"mean_token_accuracy": 0.16831570118665695,
|
|
"num_tokens": 25703081.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 5.6604838371276855,
|
|
"epoch": 1.1715605965133375,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004867755639576135,
|
|
"loss": 5.4141,
|
|
"mean_token_accuracy": 0.17139442414045333,
|
|
"num_tokens": 25711628.0,
|
|
"step": 13945
|
|
},
|
|
{
|
|
"entropy": 5.6378460884094235,
|
|
"epoch": 1.1719806763285023,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004867654467062928,
|
|
"loss": 5.4509,
|
|
"mean_token_accuracy": 0.16958941370248795,
|
|
"num_tokens": 25720676.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 5.623021745681763,
|
|
"epoch": 1.1724007561436673,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00048675532570365633,
|
|
"loss": 5.418,
|
|
"mean_token_accuracy": 0.16936941295862198,
|
|
"num_tokens": 25729920.0,
|
|
"step": 13955
|
|
},
|
|
{
|
|
"entropy": 5.644413042068481,
|
|
"epoch": 1.1728208359588321,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048674520094988327,
|
|
"loss": 5.4047,
|
|
"mean_token_accuracy": 0.1689576655626297,
|
|
"num_tokens": 25739745.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 5.673465824127197,
|
|
"epoch": 1.1732409157739971,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048673507244515303,
|
|
"loss": 5.421,
|
|
"mean_token_accuracy": 0.16571830958127975,
|
|
"num_tokens": 25748636.0,
|
|
"step": 13965
|
|
},
|
|
{
|
|
"entropy": 5.774284315109253,
|
|
"epoch": 1.173660995589162,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000486724940189645,
|
|
"loss": 5.5623,
|
|
"mean_token_accuracy": 0.15929994434118272,
|
|
"num_tokens": 25758393.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 5.73808479309082,
|
|
"epoch": 1.174081075404327,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004867148041835386,
|
|
"loss": 5.5378,
|
|
"mean_token_accuracy": 0.15335596948862076,
|
|
"num_tokens": 25768520.0,
|
|
"step": 13975
|
|
},
|
|
{
|
|
"entropy": 5.613088512420655,
|
|
"epoch": 1.1745011552194917,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004867046644270136,
|
|
"loss": 5.3398,
|
|
"mean_token_accuracy": 0.17122806012630462,
|
|
"num_tokens": 25777168.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 5.7978432178497314,
|
|
"epoch": 1.1749212350346565,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004866945209202494,
|
|
"loss": 5.6517,
|
|
"mean_token_accuracy": 0.14711768478155135,
|
|
"num_tokens": 25787042.0,
|
|
"step": 13985
|
|
},
|
|
{
|
|
"entropy": 5.7582615375518795,
|
|
"epoch": 1.1753413148498215,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004866843736634258,
|
|
"loss": 5.5287,
|
|
"mean_token_accuracy": 0.16342772543430328,
|
|
"num_tokens": 25796784.0,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"entropy": 5.7990720748901365,
|
|
"epoch": 1.1757613946649863,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004866742226567225,
|
|
"loss": 5.561,
|
|
"mean_token_accuracy": 0.1599314257502556,
|
|
"num_tokens": 25806285.0,
|
|
"step": 13995
|
|
},
|
|
{
|
|
"entropy": 5.723859405517578,
|
|
"epoch": 1.1761814744801513,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.00048666406790031936,
|
|
"loss": 5.4036,
|
|
"mean_token_accuracy": 0.16107962131500245,
|
|
"num_tokens": 25814889.0,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"entropy": 5.662409734725952,
|
|
"epoch": 1.176601554295316,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004866539093943962,
|
|
"loss": 5.4512,
|
|
"mean_token_accuracy": 0.16538093835115433,
|
|
"num_tokens": 25824551.0,
|
|
"step": 14005
|
|
},
|
|
{
|
|
"entropy": 5.746952390670776,
|
|
"epoch": 1.1770216341104809,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00048664374713913304,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.15985865890979767,
|
|
"num_tokens": 25834482.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 5.755951976776123,
|
|
"epoch": 1.177441713925646,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004866335811347099,
|
|
"loss": 5.5288,
|
|
"mean_token_accuracy": 0.1602414257824421,
|
|
"num_tokens": 25843274.0,
|
|
"step": 14015
|
|
},
|
|
{
|
|
"entropy": 5.80555944442749,
|
|
"epoch": 1.1778617937408107,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048662341138130683,
|
|
"loss": 5.521,
|
|
"mean_token_accuracy": 0.15339552462100983,
|
|
"num_tokens": 25852482.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 5.730782413482666,
|
|
"epoch": 1.1782818735559757,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048661323787910405,
|
|
"loss": 5.4969,
|
|
"mean_token_accuracy": 0.1554713472723961,
|
|
"num_tokens": 25862657.0,
|
|
"step": 14025
|
|
},
|
|
{
|
|
"entropy": 5.663182163238526,
|
|
"epoch": 1.1787019533711405,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004866030606282817,
|
|
"loss": 5.4568,
|
|
"mean_token_accuracy": 0.16776310056447982,
|
|
"num_tokens": 25871492.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 5.734621810913086,
|
|
"epoch": 1.1791220331863055,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00048659287962902006,
|
|
"loss": 5.4536,
|
|
"mean_token_accuracy": 0.1627289742231369,
|
|
"num_tokens": 25880979.0,
|
|
"step": 14035
|
|
},
|
|
{
|
|
"entropy": 5.732399988174438,
|
|
"epoch": 1.1795421130014703,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00048658269488149945,
|
|
"loss": 5.4554,
|
|
"mean_token_accuracy": 0.16046008914709092,
|
|
"num_tokens": 25891060.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 5.822850942611694,
|
|
"epoch": 1.1799621928166353,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0004865725063859005,
|
|
"loss": 5.5659,
|
|
"mean_token_accuracy": 0.16293970942497255,
|
|
"num_tokens": 25900421.0,
|
|
"step": 14045
|
|
},
|
|
{
|
|
"entropy": 5.73273401260376,
|
|
"epoch": 1.1803822726318,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048656231414240345,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.15972733795642852,
|
|
"num_tokens": 25909614.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 5.668324518203735,
|
|
"epoch": 1.1808023524469649,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.000486552118151189,
|
|
"loss": 5.4895,
|
|
"mean_token_accuracy": 0.15764440298080445,
|
|
"num_tokens": 25919324.0,
|
|
"step": 14055
|
|
},
|
|
{
|
|
"entropy": 5.670943117141723,
|
|
"epoch": 1.1812224322621299,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00048654191841243763,
|
|
"loss": 5.4993,
|
|
"mean_token_accuracy": 0.1652704119682312,
|
|
"num_tokens": 25928818.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 5.75603985786438,
|
|
"epoch": 1.1816425120772946,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004865317149263301,
|
|
"loss": 5.5482,
|
|
"mean_token_accuracy": 0.16319168210029603,
|
|
"num_tokens": 25938148.0,
|
|
"step": 14065
|
|
},
|
|
{
|
|
"entropy": 5.6569633960723875,
|
|
"epoch": 1.1820625918924597,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004865215076930473,
|
|
"loss": 5.4529,
|
|
"mean_token_accuracy": 0.16367049515247345,
|
|
"num_tokens": 25947210.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 5.660248327255249,
|
|
"epoch": 1.1824826717076244,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0004865112967127697,
|
|
"loss": 5.4428,
|
|
"mean_token_accuracy": 0.16496210247278215,
|
|
"num_tokens": 25955949.0,
|
|
"step": 14075
|
|
},
|
|
{
|
|
"entropy": 5.648013925552368,
|
|
"epoch": 1.1829027515227892,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004865010819856786,
|
|
"loss": 5.3959,
|
|
"mean_token_accuracy": 0.16307084411382675,
|
|
"num_tokens": 25964193.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 5.673745965957641,
|
|
"epoch": 1.1833228313379542,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004864908635119546,
|
|
"loss": 5.4612,
|
|
"mean_token_accuracy": 0.1630059838294983,
|
|
"num_tokens": 25973141.0,
|
|
"step": 14085
|
|
},
|
|
{
|
|
"entropy": 5.725007057189941,
|
|
"epoch": 1.183742911153119,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004864806412917788,
|
|
"loss": 5.5398,
|
|
"mean_token_accuracy": 0.158825521171093,
|
|
"num_tokens": 25982650.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 5.773545980453491,
|
|
"epoch": 1.184162990968284,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004864704153253325,
|
|
"loss": 5.5371,
|
|
"mean_token_accuracy": 0.1549429714679718,
|
|
"num_tokens": 25992096.0,
|
|
"step": 14095
|
|
},
|
|
{
|
|
"entropy": 5.772162914276123,
|
|
"epoch": 1.1845830707834488,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 0.00048646018561279665,
|
|
"loss": 5.5104,
|
|
"mean_token_accuracy": 0.16160587966442108,
|
|
"num_tokens": 26002063.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 5.648436164855957,
|
|
"epoch": 1.1850031505986138,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.00048644995215435245,
|
|
"loss": 5.3414,
|
|
"mean_token_accuracy": 0.1703270673751831,
|
|
"num_tokens": 26010716.0,
|
|
"step": 14105
|
|
},
|
|
{
|
|
"entropy": 5.677743911743164,
|
|
"epoch": 1.1854232304137786,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004864397149501812,
|
|
"loss": 5.4265,
|
|
"mean_token_accuracy": 0.16840701997280122,
|
|
"num_tokens": 26019136.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 5.679789972305298,
|
|
"epoch": 1.1858433102289434,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048642947400046434,
|
|
"loss": 5.4571,
|
|
"mean_token_accuracy": 0.17166182398796082,
|
|
"num_tokens": 26028029.0,
|
|
"step": 14115
|
|
},
|
|
{
|
|
"entropy": 5.77405161857605,
|
|
"epoch": 1.1862633900441084,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048641922930538325,
|
|
"loss": 5.6101,
|
|
"mean_token_accuracy": 0.15164516270160674,
|
|
"num_tokens": 26038025.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 5.7644017219543455,
|
|
"epoch": 1.1866834698592732,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004864089808651193,
|
|
"loss": 5.5754,
|
|
"mean_token_accuracy": 0.14774202257394792,
|
|
"num_tokens": 26048427.0,
|
|
"step": 14125
|
|
},
|
|
{
|
|
"entropy": 5.729209041595459,
|
|
"epoch": 1.1871035496744382,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004863987286798541,
|
|
"loss": 5.3801,
|
|
"mean_token_accuracy": 0.16284161061048508,
|
|
"num_tokens": 26057682.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 5.64456000328064,
|
|
"epoch": 1.187523629489603,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004863884727497693,
|
|
"loss": 5.4509,
|
|
"mean_token_accuracy": 0.1594451993703842,
|
|
"num_tokens": 26066562.0,
|
|
"step": 14135
|
|
},
|
|
{
|
|
"entropy": 5.6487713813781735,
|
|
"epoch": 1.187943709304768,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004863782130750466,
|
|
"loss": 5.3779,
|
|
"mean_token_accuracy": 0.16446612328290938,
|
|
"num_tokens": 26075633.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 5.723405551910401,
|
|
"epoch": 1.1883637891199328,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00048636794965586764,
|
|
"loss": 5.5428,
|
|
"mean_token_accuracy": 0.1594787582755089,
|
|
"num_tokens": 26085160.0,
|
|
"step": 14145
|
|
},
|
|
{
|
|
"entropy": 5.711528730392456,
|
|
"epoch": 1.1887838689350976,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00048635768249241434,
|
|
"loss": 5.4197,
|
|
"mean_token_accuracy": 0.16347247660160064,
|
|
"num_tokens": 26094157.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 5.7905010223388675,
|
|
"epoch": 1.1892039487502626,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004863474115848685,
|
|
"loss": 5.5487,
|
|
"mean_token_accuracy": 0.16446382999420167,
|
|
"num_tokens": 26104459.0,
|
|
"step": 14155
|
|
},
|
|
{
|
|
"entropy": 5.677060556411743,
|
|
"epoch": 1.1896240285654274,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00048633713693341214,
|
|
"loss": 5.4867,
|
|
"mean_token_accuracy": 0.16067123413085938,
|
|
"num_tokens": 26114468.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 5.663212585449219,
|
|
"epoch": 1.1900441083805924,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048632685853822714,
|
|
"loss": 5.4469,
|
|
"mean_token_accuracy": 0.1624838277697563,
|
|
"num_tokens": 26123408.0,
|
|
"step": 14165
|
|
},
|
|
{
|
|
"entropy": 5.629336786270142,
|
|
"epoch": 1.1904641881957572,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004863165763994957,
|
|
"loss": 5.4641,
|
|
"mean_token_accuracy": 0.15543654710054397,
|
|
"num_tokens": 26132692.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 5.753988409042359,
|
|
"epoch": 1.190884268010922,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004863062905173999,
|
|
"loss": 5.6279,
|
|
"mean_token_accuracy": 0.15481553226709366,
|
|
"num_tokens": 26142259.0,
|
|
"step": 14175
|
|
},
|
|
{
|
|
"entropy": 5.779358720779419,
|
|
"epoch": 1.191304347826087,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000486296000892122,
|
|
"loss": 5.4794,
|
|
"mean_token_accuracy": 0.16091232895851135,
|
|
"num_tokens": 26151782.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 5.650760316848755,
|
|
"epoch": 1.1917244276412517,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00048628570752384424,
|
|
"loss": 5.3234,
|
|
"mean_token_accuracy": 0.16556781977415086,
|
|
"num_tokens": 26160449.0,
|
|
"step": 14185
|
|
},
|
|
{
|
|
"entropy": 5.700650358200074,
|
|
"epoch": 1.1921445074564168,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00048627541041274897,
|
|
"loss": 5.5649,
|
|
"mean_token_accuracy": 0.1567431628704071,
|
|
"num_tokens": 26169764.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 5.703988265991211,
|
|
"epoch": 1.1925645872715815,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00048626510955901854,
|
|
"loss": 5.4088,
|
|
"mean_token_accuracy": 0.15994867235422133,
|
|
"num_tokens": 26178759.0,
|
|
"step": 14195
|
|
},
|
|
{
|
|
"entropy": 5.739314889907837,
|
|
"epoch": 1.1929846670867466,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004862548049628356,
|
|
"loss": 5.5509,
|
|
"mean_token_accuracy": 0.1646982505917549,
|
|
"num_tokens": 26187904.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 5.734980583190918,
|
|
"epoch": 1.1934047469019113,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004862444966243824,
|
|
"loss": 5.4643,
|
|
"mean_token_accuracy": 0.1669871285557747,
|
|
"num_tokens": 26196563.0,
|
|
"step": 14205
|
|
},
|
|
{
|
|
"entropy": 5.778663492202758,
|
|
"epoch": 1.1938248267170763,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004862341845438419,
|
|
"loss": 5.4847,
|
|
"mean_token_accuracy": 0.16169409304857255,
|
|
"num_tokens": 26206573.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 5.661051893234253,
|
|
"epoch": 1.1942449065322411,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00048622386872139645,
|
|
"loss": 5.3909,
|
|
"mean_token_accuracy": 0.16438209414482116,
|
|
"num_tokens": 26215308.0,
|
|
"step": 14215
|
|
},
|
|
{
|
|
"entropy": 5.587487888336182,
|
|
"epoch": 1.194664986347406,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.000486213549157229,
|
|
"loss": 5.4567,
|
|
"mean_token_accuracy": 0.1640901729464531,
|
|
"num_tokens": 26224379.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 5.664547252655029,
|
|
"epoch": 1.195085066162571,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004862032258515222,
|
|
"loss": 5.4358,
|
|
"mean_token_accuracy": 0.1679796889424324,
|
|
"num_tokens": 26233620.0,
|
|
"step": 14225
|
|
},
|
|
{
|
|
"entropy": 5.725254678726197,
|
|
"epoch": 1.1955051459777357,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004861928988044592,
|
|
"loss": 5.5138,
|
|
"mean_token_accuracy": 0.15623046904802323,
|
|
"num_tokens": 26242556.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 5.718895196914673,
|
|
"epoch": 1.1959252257929007,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0004861825680162226,
|
|
"loss": 5.4946,
|
|
"mean_token_accuracy": 0.16485830694437026,
|
|
"num_tokens": 26251561.0,
|
|
"step": 14235
|
|
},
|
|
{
|
|
"entropy": 5.664663934707642,
|
|
"epoch": 1.1963453056080655,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00048617223348699546,
|
|
"loss": 5.4329,
|
|
"mean_token_accuracy": 0.16026019304990768,
|
|
"num_tokens": 26261115.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 5.770184707641602,
|
|
"epoch": 1.1967653854232303,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0004861618952169611,
|
|
"loss": 5.591,
|
|
"mean_token_accuracy": 0.1603381484746933,
|
|
"num_tokens": 26271165.0,
|
|
"step": 14245
|
|
},
|
|
{
|
|
"entropy": 5.695276260375977,
|
|
"epoch": 1.1971854652383953,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004861515532063025,
|
|
"loss": 5.5429,
|
|
"mean_token_accuracy": 0.16051559895277023,
|
|
"num_tokens": 26280822.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 5.69549150466919,
|
|
"epoch": 1.19760554505356,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00048614120745520275,
|
|
"loss": 5.4191,
|
|
"mean_token_accuracy": 0.16700200736522675,
|
|
"num_tokens": 26288747.0,
|
|
"step": 14255
|
|
},
|
|
{
|
|
"entropy": 5.7050079822540285,
|
|
"epoch": 1.198025624868725,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048613085796384524,
|
|
"loss": 5.4945,
|
|
"mean_token_accuracy": 0.15817514955997466,
|
|
"num_tokens": 26298387.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 5.639023733139038,
|
|
"epoch": 1.19844570468389,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00048612050473241335,
|
|
"loss": 5.3966,
|
|
"mean_token_accuracy": 0.16590498983860016,
|
|
"num_tokens": 26307016.0,
|
|
"step": 14265
|
|
},
|
|
{
|
|
"entropy": 5.690613889694214,
|
|
"epoch": 1.198865784499055,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0004861101477610905,
|
|
"loss": 5.5035,
|
|
"mean_token_accuracy": 0.16300584375858307,
|
|
"num_tokens": 26316296.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 5.692527532577515,
|
|
"epoch": 1.1992858643142197,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00048609978705006,
|
|
"loss": 5.4837,
|
|
"mean_token_accuracy": 0.1594039648771286,
|
|
"num_tokens": 26325525.0,
|
|
"step": 14275
|
|
},
|
|
{
|
|
"entropy": 5.666857576370239,
|
|
"epoch": 1.1997059441293847,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004860894225995055,
|
|
"loss": 5.377,
|
|
"mean_token_accuracy": 0.16849509179592131,
|
|
"num_tokens": 26334195.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 5.684696054458618,
|
|
"epoch": 1.2001260239445495,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00048607905440961054,
|
|
"loss": 5.512,
|
|
"mean_token_accuracy": 0.16250620037317276,
|
|
"num_tokens": 26343933.0,
|
|
"step": 14285
|
|
},
|
|
{
|
|
"entropy": 5.738911294937134,
|
|
"epoch": 1.2005461037597143,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048606868248055887,
|
|
"loss": 5.4441,
|
|
"mean_token_accuracy": 0.16386907249689103,
|
|
"num_tokens": 26353455.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 5.790994453430176,
|
|
"epoch": 1.2009661835748793,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004860583068125341,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.16346363723278046,
|
|
"num_tokens": 26362662.0,
|
|
"step": 14295
|
|
},
|
|
{
|
|
"entropy": 5.692120361328125,
|
|
"epoch": 1.201386263390044,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004860479274057202,
|
|
"loss": 5.4509,
|
|
"mean_token_accuracy": 0.1605956733226776,
|
|
"num_tokens": 26371536.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 5.720314931869507,
|
|
"epoch": 1.201806343205209,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00048603754426030087,
|
|
"loss": 5.5496,
|
|
"mean_token_accuracy": 0.1566978722810745,
|
|
"num_tokens": 26381925.0,
|
|
"step": 14305
|
|
},
|
|
{
|
|
"entropy": 5.662918376922607,
|
|
"epoch": 1.2022264230203739,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00048602715737646016,
|
|
"loss": 5.4158,
|
|
"mean_token_accuracy": 0.16778195053339004,
|
|
"num_tokens": 26391111.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 5.84964919090271,
|
|
"epoch": 1.2026465028355386,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00048601676675438197,
|
|
"loss": 5.5865,
|
|
"mean_token_accuracy": 0.1477577805519104,
|
|
"num_tokens": 26401667.0,
|
|
"step": 14315
|
|
},
|
|
{
|
|
"entropy": 5.686867713928223,
|
|
"epoch": 1.2030665826507037,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.00048600637239425045,
|
|
"loss": 5.3949,
|
|
"mean_token_accuracy": 0.1717774584889412,
|
|
"num_tokens": 26411261.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 5.671998596191406,
|
|
"epoch": 1.2034866624658684,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048599597429624966,
|
|
"loss": 5.5392,
|
|
"mean_token_accuracy": 0.15727996453642845,
|
|
"num_tokens": 26419808.0,
|
|
"step": 14325
|
|
},
|
|
{
|
|
"entropy": 5.687448644638062,
|
|
"epoch": 1.2039067422810334,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00048598557246056385,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.16296297758817674,
|
|
"num_tokens": 26429160.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 5.683249378204346,
|
|
"epoch": 1.2043268220961982,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048597516688737727,
|
|
"loss": 5.4074,
|
|
"mean_token_accuracy": 0.16682589650154114,
|
|
"num_tokens": 26437675.0,
|
|
"step": 14335
|
|
},
|
|
{
|
|
"entropy": 5.6975466251373295,
|
|
"epoch": 1.2047469019113632,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00048596475757687425,
|
|
"loss": 5.4681,
|
|
"mean_token_accuracy": 0.16042584478855132,
|
|
"num_tokens": 26446317.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 5.708725118637085,
|
|
"epoch": 1.205166981726528,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00048595434452923915,
|
|
"loss": 5.5139,
|
|
"mean_token_accuracy": 0.16216087639331817,
|
|
"num_tokens": 26456183.0,
|
|
"step": 14345
|
|
},
|
|
{
|
|
"entropy": 5.6849305629730225,
|
|
"epoch": 1.205587061541693,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00048594392774465656,
|
|
"loss": 5.4568,
|
|
"mean_token_accuracy": 0.15838514566421508,
|
|
"num_tokens": 26466324.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 5.690262508392334,
|
|
"epoch": 1.2060071413568578,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00048593350722331074,
|
|
"loss": 5.4705,
|
|
"mean_token_accuracy": 0.1616607829928398,
|
|
"num_tokens": 26475560.0,
|
|
"step": 14355
|
|
},
|
|
{
|
|
"entropy": 5.686220169067383,
|
|
"epoch": 1.2064272211720226,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048592308296538654,
|
|
"loss": 5.4449,
|
|
"mean_token_accuracy": 0.16128322407603263,
|
|
"num_tokens": 26484955.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 5.690824508666992,
|
|
"epoch": 1.2068473009871876,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004859126549710686,
|
|
"loss": 5.4025,
|
|
"mean_token_accuracy": 0.17448743879795076,
|
|
"num_tokens": 26494306.0,
|
|
"step": 14365
|
|
},
|
|
{
|
|
"entropy": 5.605901575088501,
|
|
"epoch": 1.2072673808023524,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00048590222324054153,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.16911747306585312,
|
|
"num_tokens": 26503871.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 5.741916131973267,
|
|
"epoch": 1.2076874606175174,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004858917877739901,
|
|
"loss": 5.5106,
|
|
"mean_token_accuracy": 0.16466034948825836,
|
|
"num_tokens": 26511929.0,
|
|
"step": 14375
|
|
},
|
|
{
|
|
"entropy": 5.765912389755249,
|
|
"epoch": 1.2081075404326822,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004858813485715994,
|
|
"loss": 5.5129,
|
|
"mean_token_accuracy": 0.15352574586868287,
|
|
"num_tokens": 26520469.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 5.6565714359283445,
|
|
"epoch": 1.208527620247847,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004858709056335541,
|
|
"loss": 5.4803,
|
|
"mean_token_accuracy": 0.16136947721242906,
|
|
"num_tokens": 26530102.0,
|
|
"step": 14385
|
|
},
|
|
{
|
|
"entropy": 5.6539154052734375,
|
|
"epoch": 1.208947700063012,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00048586045896003926,
|
|
"loss": 5.4784,
|
|
"mean_token_accuracy": 0.15783216953277587,
|
|
"num_tokens": 26538705.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 5.786140489578247,
|
|
"epoch": 1.2093677798781768,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004858500085512401,
|
|
"loss": 5.5837,
|
|
"mean_token_accuracy": 0.15772880017757415,
|
|
"num_tokens": 26548315.0,
|
|
"step": 14395
|
|
},
|
|
{
|
|
"entropy": 5.7165955066680905,
|
|
"epoch": 1.2097878596933418,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00048583955440734144,
|
|
"loss": 5.4101,
|
|
"mean_token_accuracy": 0.1629326745867729,
|
|
"num_tokens": 26556412.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 5.70180230140686,
|
|
"epoch": 1.2102079395085066,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048582909652852873,
|
|
"loss": 5.5744,
|
|
"mean_token_accuracy": 0.16115047186613082,
|
|
"num_tokens": 26566146.0,
|
|
"step": 14405
|
|
},
|
|
{
|
|
"entropy": 5.715734386444092,
|
|
"epoch": 1.2106280193236716,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004858186349149871,
|
|
"loss": 5.4691,
|
|
"mean_token_accuracy": 0.16265431568026542,
|
|
"num_tokens": 26576019.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 5.612696838378906,
|
|
"epoch": 1.2110480991388364,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.000485808169566902,
|
|
"loss": 5.3309,
|
|
"mean_token_accuracy": 0.1696453645825386,
|
|
"num_tokens": 26585461.0,
|
|
"step": 14415
|
|
},
|
|
{
|
|
"entropy": 5.62654824256897,
|
|
"epoch": 1.2114681789540014,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048579770048445863,
|
|
"loss": 5.3726,
|
|
"mean_token_accuracy": 0.18201425969600676,
|
|
"num_tokens": 26594021.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 5.753058910369873,
|
|
"epoch": 1.2118882587691662,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00048578722766784253,
|
|
"loss": 5.5086,
|
|
"mean_token_accuracy": 0.16204283237457276,
|
|
"num_tokens": 26602712.0,
|
|
"step": 14425
|
|
},
|
|
{
|
|
"entropy": 5.593479490280151,
|
|
"epoch": 1.212308338584331,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00048577675111723925,
|
|
"loss": 5.2025,
|
|
"mean_token_accuracy": 0.18278668075799942,
|
|
"num_tokens": 26610970.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 5.615044832229614,
|
|
"epoch": 1.212728418399496,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048576627083283435,
|
|
"loss": 5.4954,
|
|
"mean_token_accuracy": 0.16862737089395524,
|
|
"num_tokens": 26619840.0,
|
|
"step": 14435
|
|
},
|
|
{
|
|
"entropy": 5.663373374938965,
|
|
"epoch": 1.2131484982146608,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004857557868148136,
|
|
"loss": 5.4002,
|
|
"mean_token_accuracy": 0.16440703421831132,
|
|
"num_tokens": 26629271.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 5.672978448867798,
|
|
"epoch": 1.2135685780298258,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004857452990633625,
|
|
"loss": 5.4333,
|
|
"mean_token_accuracy": 0.16087207645177842,
|
|
"num_tokens": 26638610.0,
|
|
"step": 14445
|
|
},
|
|
{
|
|
"entropy": 5.792498302459717,
|
|
"epoch": 1.2139886578449905,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048573480757866695,
|
|
"loss": 5.5919,
|
|
"mean_token_accuracy": 0.15683359503746033,
|
|
"num_tokens": 26648504.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 5.720464372634888,
|
|
"epoch": 1.2144087376601553,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00048572431236091284,
|
|
"loss": 5.4654,
|
|
"mean_token_accuracy": 0.16139672845602035,
|
|
"num_tokens": 26658084.0,
|
|
"step": 14455
|
|
},
|
|
{
|
|
"entropy": 5.712548398971558,
|
|
"epoch": 1.2148288174753203,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00048571381341028604,
|
|
"loss": 5.5517,
|
|
"mean_token_accuracy": 0.16073913276195526,
|
|
"num_tokens": 26666933.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 5.752342224121094,
|
|
"epoch": 1.2152488972904851,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004857033107269725,
|
|
"loss": 5.4182,
|
|
"mean_token_accuracy": 0.16725114732980728,
|
|
"num_tokens": 26675049.0,
|
|
"step": 14465
|
|
},
|
|
{
|
|
"entropy": 5.664717102050782,
|
|
"epoch": 1.2156689771056501,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048569280431115823,
|
|
"loss": 5.4942,
|
|
"mean_token_accuracy": 0.16280431896448136,
|
|
"num_tokens": 26684223.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 5.666110849380493,
|
|
"epoch": 1.216089056920815,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004856822941630296,
|
|
"loss": 5.4388,
|
|
"mean_token_accuracy": 0.15747048407793046,
|
|
"num_tokens": 26693605.0,
|
|
"step": 14475
|
|
},
|
|
{
|
|
"entropy": 5.7499290943145756,
|
|
"epoch": 1.2165091367359797,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00048567178028277255,
|
|
"loss": 5.5114,
|
|
"mean_token_accuracy": 0.16667446196079255,
|
|
"num_tokens": 26702829.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 5.765132427215576,
|
|
"epoch": 1.2169292165511447,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004856612626705733,
|
|
"loss": 5.5496,
|
|
"mean_token_accuracy": 0.15713531970977784,
|
|
"num_tokens": 26712466.0,
|
|
"step": 14485
|
|
},
|
|
{
|
|
"entropy": 5.745383930206299,
|
|
"epoch": 1.2173492963663095,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004856507413266183,
|
|
"loss": 5.4247,
|
|
"mean_token_accuracy": 0.16737874001264572,
|
|
"num_tokens": 26721730.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 5.637966871261597,
|
|
"epoch": 1.2177693761814745,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000485640216251094,
|
|
"loss": 5.5088,
|
|
"mean_token_accuracy": 0.16009110063314438,
|
|
"num_tokens": 26731017.0,
|
|
"step": 14495
|
|
},
|
|
{
|
|
"entropy": 5.674624824523926,
|
|
"epoch": 1.2181894559966393,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.00048562968744418665,
|
|
"loss": 5.4761,
|
|
"mean_token_accuracy": 0.16008124649524688,
|
|
"num_tokens": 26739588.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 5.764046764373779,
|
|
"epoch": 1.2186095358118043,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004856191549060828,
|
|
"loss": 5.6018,
|
|
"mean_token_accuracy": 0.15619692504405974,
|
|
"num_tokens": 26748889.0,
|
|
"step": 14505
|
|
},
|
|
{
|
|
"entropy": 5.754044675827027,
|
|
"epoch": 1.219029615626969,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048560861863696913,
|
|
"loss": 5.5297,
|
|
"mean_token_accuracy": 0.15980444252490997,
|
|
"num_tokens": 26757979.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 5.707068204879761,
|
|
"epoch": 1.219449695442134,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004855980786370322,
|
|
"loss": 5.4485,
|
|
"mean_token_accuracy": 0.16127097010612487,
|
|
"num_tokens": 26767225.0,
|
|
"step": 14515
|
|
},
|
|
{
|
|
"entropy": 5.6698870182037355,
|
|
"epoch": 1.219869775257299,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004855875349064588,
|
|
"loss": 5.3966,
|
|
"mean_token_accuracy": 0.16548994332551956,
|
|
"num_tokens": 26776289.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 5.744715166091919,
|
|
"epoch": 1.2202898550724637,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004855769874454356,
|
|
"loss": 5.5192,
|
|
"mean_token_accuracy": 0.16024302393198014,
|
|
"num_tokens": 26785631.0,
|
|
"step": 14525
|
|
},
|
|
{
|
|
"entropy": 5.691872644424438,
|
|
"epoch": 1.2207099348876287,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004855664362541495,
|
|
"loss": 5.5232,
|
|
"mean_token_accuracy": 0.16038859486579896,
|
|
"num_tokens": 26795285.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 5.651829099655151,
|
|
"epoch": 1.2211300147027935,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048555588133278744,
|
|
"loss": 5.4307,
|
|
"mean_token_accuracy": 0.16211945861577987,
|
|
"num_tokens": 26804584.0,
|
|
"step": 14535
|
|
},
|
|
{
|
|
"entropy": 5.604468536376953,
|
|
"epoch": 1.2215500945179585,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004855453226815363,
|
|
"loss": 5.3061,
|
|
"mean_token_accuracy": 0.17006382644176482,
|
|
"num_tokens": 26814354.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 5.626403427124023,
|
|
"epoch": 1.2219701743331233,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00048553476030058326,
|
|
"loss": 5.3466,
|
|
"mean_token_accuracy": 0.17612583935260773,
|
|
"num_tokens": 26824274.0,
|
|
"step": 14545
|
|
},
|
|
{
|
|
"entropy": 5.616889381408692,
|
|
"epoch": 1.222390254148288,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048552419419011536,
|
|
"loss": 5.4738,
|
|
"mean_token_accuracy": 0.16051012128591538,
|
|
"num_tokens": 26833155.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 5.667688083648682,
|
|
"epoch": 1.222810333963453,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004855136243503196,
|
|
"loss": 5.3997,
|
|
"mean_token_accuracy": 0.1646553486585617,
|
|
"num_tokens": 26842545.0,
|
|
"step": 14555
|
|
},
|
|
{
|
|
"entropy": 5.739150142669677,
|
|
"epoch": 1.2232304137786179,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048550305078138363,
|
|
"loss": 5.481,
|
|
"mean_token_accuracy": 0.16481468081474304,
|
|
"num_tokens": 26851772.0,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"entropy": 5.648625612258911,
|
|
"epoch": 1.2236504935937829,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048549247348349435,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.16550036519765854,
|
|
"num_tokens": 26860884.0,
|
|
"step": 14565
|
|
},
|
|
{
|
|
"entropy": 5.679945564270019,
|
|
"epoch": 1.2240705734089476,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00048548189245683934,
|
|
"loss": 5.5126,
|
|
"mean_token_accuracy": 0.1663243889808655,
|
|
"num_tokens": 26869435.0,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"entropy": 5.681559896469116,
|
|
"epoch": 1.2244906532241127,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048547130770160596,
|
|
"loss": 5.4131,
|
|
"mean_token_accuracy": 0.16150881946086884,
|
|
"num_tokens": 26878852.0,
|
|
"step": 14575
|
|
},
|
|
{
|
|
"entropy": 5.70316162109375,
|
|
"epoch": 1.2249107330392774,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004854607192179817,
|
|
"loss": 5.3864,
|
|
"mean_token_accuracy": 0.1695043832063675,
|
|
"num_tokens": 26887532.0,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"entropy": 5.844434452056885,
|
|
"epoch": 1.2253308128544425,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004854501270061543,
|
|
"loss": 5.6029,
|
|
"mean_token_accuracy": 0.15792314410209657,
|
|
"num_tokens": 26897459.0,
|
|
"step": 14585
|
|
},
|
|
{
|
|
"entropy": 5.618150424957276,
|
|
"epoch": 1.2257508926696072,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048543953106631115,
|
|
"loss": 5.3795,
|
|
"mean_token_accuracy": 0.16793021261692048,
|
|
"num_tokens": 26907156.0,
|
|
"step": 14590
|
|
},
|
|
{
|
|
"entropy": 5.732923221588135,
|
|
"epoch": 1.226170972484772,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004854289313986401,
|
|
"loss": 5.4648,
|
|
"mean_token_accuracy": 0.16741324663162233,
|
|
"num_tokens": 26915764.0,
|
|
"step": 14595
|
|
},
|
|
{
|
|
"entropy": 5.644811153411865,
|
|
"epoch": 1.226591052299937,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004854183280033289,
|
|
"loss": 5.3429,
|
|
"mean_token_accuracy": 0.16403224915266038,
|
|
"num_tokens": 26924166.0,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"entropy": 5.6976734638214115,
|
|
"epoch": 1.2270111321151018,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004854077208805654,
|
|
"loss": 5.5704,
|
|
"mean_token_accuracy": 0.1540565922856331,
|
|
"num_tokens": 26933546.0,
|
|
"step": 14605
|
|
},
|
|
{
|
|
"entropy": 5.7353489875793455,
|
|
"epoch": 1.2274312119302668,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004853971100305374,
|
|
"loss": 5.4901,
|
|
"mean_token_accuracy": 0.1645752012729645,
|
|
"num_tokens": 26943213.0,
|
|
"step": 14610
|
|
},
|
|
{
|
|
"entropy": 5.752119350433349,
|
|
"epoch": 1.2278512917454316,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000485386495453433,
|
|
"loss": 5.4702,
|
|
"mean_token_accuracy": 0.16524574309587478,
|
|
"num_tokens": 26952968.0,
|
|
"step": 14615
|
|
},
|
|
{
|
|
"entropy": 5.690602731704712,
|
|
"epoch": 1.2282713715605964,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00048537587714944007,
|
|
"loss": 5.431,
|
|
"mean_token_accuracy": 0.16387941986322402,
|
|
"num_tokens": 26962230.0,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"entropy": 5.637970733642578,
|
|
"epoch": 1.2286914513757614,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004853652551187469,
|
|
"loss": 5.5035,
|
|
"mean_token_accuracy": 0.16774664968252181,
|
|
"num_tokens": 26970985.0,
|
|
"step": 14625
|
|
},
|
|
{
|
|
"entropy": 5.707252836227417,
|
|
"epoch": 1.2291115311909262,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048535462936154147,
|
|
"loss": 5.5344,
|
|
"mean_token_accuracy": 0.16012766510248183,
|
|
"num_tokens": 26981138.0,
|
|
"step": 14630
|
|
},
|
|
{
|
|
"entropy": 5.622266340255737,
|
|
"epoch": 1.2295316110060912,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004853439998780122,
|
|
"loss": 5.3687,
|
|
"mean_token_accuracy": 0.17002979367971421,
|
|
"num_tokens": 26990158.0,
|
|
"step": 14635
|
|
},
|
|
{
|
|
"entropy": 5.6507940769195555,
|
|
"epoch": 1.229951690821256,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004853333666683472,
|
|
"loss": 5.5224,
|
|
"mean_token_accuracy": 0.15614334493875504,
|
|
"num_tokens": 26998889.0,
|
|
"step": 14640
|
|
},
|
|
{
|
|
"entropy": 5.708015727996826,
|
|
"epoch": 1.230371770636421,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00048532272973273496,
|
|
"loss": 5.4656,
|
|
"mean_token_accuracy": 0.16113510280847548,
|
|
"num_tokens": 27008912.0,
|
|
"step": 14645
|
|
},
|
|
{
|
|
"entropy": 5.671196317672729,
|
|
"epoch": 1.2307918504515858,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00048531208907136384,
|
|
"loss": 5.3541,
|
|
"mean_token_accuracy": 0.17473920732736586,
|
|
"num_tokens": 27017573.0,
|
|
"step": 14650
|
|
},
|
|
{
|
|
"entropy": 5.658402824401856,
|
|
"epoch": 1.2312119302667508,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048530144468442236,
|
|
"loss": 5.4297,
|
|
"mean_token_accuracy": 0.1590592809021473,
|
|
"num_tokens": 27027205.0,
|
|
"step": 14655
|
|
},
|
|
{
|
|
"entropy": 5.66589732170105,
|
|
"epoch": 1.2316320100819156,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048529079657209906,
|
|
"loss": 5.3827,
|
|
"mean_token_accuracy": 0.16773709654808044,
|
|
"num_tokens": 27035882.0,
|
|
"step": 14660
|
|
},
|
|
{
|
|
"entropy": 5.628454732894897,
|
|
"epoch": 1.2320520898970804,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004852801447345826,
|
|
"loss": 5.4555,
|
|
"mean_token_accuracy": 0.17012043595314025,
|
|
"num_tokens": 27044761.0,
|
|
"step": 14665
|
|
},
|
|
{
|
|
"entropy": 5.6688700199127195,
|
|
"epoch": 1.2324721697122454,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004852694891720617,
|
|
"loss": 5.4815,
|
|
"mean_token_accuracy": 0.16467399448156356,
|
|
"num_tokens": 27054149.0,
|
|
"step": 14670
|
|
},
|
|
{
|
|
"entropy": 5.725511741638184,
|
|
"epoch": 1.2328922495274102,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000485258829884725,
|
|
"loss": 5.524,
|
|
"mean_token_accuracy": 0.1634502664208412,
|
|
"num_tokens": 27063145.0,
|
|
"step": 14675
|
|
},
|
|
{
|
|
"entropy": 5.7596677303314205,
|
|
"epoch": 1.2333123293425752,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004852481668727614,
|
|
"loss": 5.4697,
|
|
"mean_token_accuracy": 0.16408599615097047,
|
|
"num_tokens": 27072378.0,
|
|
"step": 14680
|
|
},
|
|
{
|
|
"entropy": 5.588124799728393,
|
|
"epoch": 1.23373240915774,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048523750013635986,
|
|
"loss": 5.354,
|
|
"mean_token_accuracy": 0.16549673229455947,
|
|
"num_tokens": 27082241.0,
|
|
"step": 14685
|
|
},
|
|
{
|
|
"entropy": 5.605792379379272,
|
|
"epoch": 1.2341524889729047,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004852268296757092,
|
|
"loss": 5.3762,
|
|
"mean_token_accuracy": 0.16784797310829164,
|
|
"num_tokens": 27091488.0,
|
|
"step": 14690
|
|
},
|
|
{
|
|
"entropy": 5.743075704574585,
|
|
"epoch": 1.2345725687880698,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004852161554909985,
|
|
"loss": 5.4272,
|
|
"mean_token_accuracy": 0.16824524402618407,
|
|
"num_tokens": 27100378.0,
|
|
"step": 14695
|
|
},
|
|
{
|
|
"entropy": 5.69188551902771,
|
|
"epoch": 1.2349926486032345,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048520547758241686,
|
|
"loss": 5.4522,
|
|
"mean_token_accuracy": 0.16235414147377014,
|
|
"num_tokens": 27110341.0,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"entropy": 5.656498527526855,
|
|
"epoch": 1.2354127284183996,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00048519479595015343,
|
|
"loss": 5.3965,
|
|
"mean_token_accuracy": 0.1622692197561264,
|
|
"num_tokens": 27119381.0,
|
|
"step": 14705
|
|
},
|
|
{
|
|
"entropy": 5.605996942520141,
|
|
"epoch": 1.2358328082335643,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00048518411059439746,
|
|
"loss": 5.4951,
|
|
"mean_token_accuracy": 0.1566877394914627,
|
|
"num_tokens": 27129167.0,
|
|
"step": 14710
|
|
},
|
|
{
|
|
"entropy": 5.697007560729981,
|
|
"epoch": 1.2362528880487293,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048517342151533813,
|
|
"loss": 5.5005,
|
|
"mean_token_accuracy": 0.1557912290096283,
|
|
"num_tokens": 27138479.0,
|
|
"step": 14715
|
|
},
|
|
{
|
|
"entropy": 5.697368383407593,
|
|
"epoch": 1.2366729678638941,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004851627287131649,
|
|
"loss": 5.3838,
|
|
"mean_token_accuracy": 0.16886205822229386,
|
|
"num_tokens": 27147197.0,
|
|
"step": 14720
|
|
},
|
|
{
|
|
"entropy": 5.643680572509766,
|
|
"epoch": 1.2370930476790591,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004851520321880672,
|
|
"loss": 5.4126,
|
|
"mean_token_accuracy": 0.1719201013445854,
|
|
"num_tokens": 27155854.0,
|
|
"step": 14725
|
|
},
|
|
{
|
|
"entropy": 5.657077169418335,
|
|
"epoch": 1.237513127494224,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004851413319402344,
|
|
"loss": 5.3862,
|
|
"mean_token_accuracy": 0.1578731968998909,
|
|
"num_tokens": 27165069.0,
|
|
"step": 14730
|
|
},
|
|
{
|
|
"entropy": 5.684050750732422,
|
|
"epoch": 1.2379332073093887,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004851306279698561,
|
|
"loss": 5.4352,
|
|
"mean_token_accuracy": 0.16021962463855743,
|
|
"num_tokens": 27174070.0,
|
|
"step": 14735
|
|
},
|
|
{
|
|
"entropy": 5.788384103775025,
|
|
"epoch": 1.2383532871245537,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004851199202771219,
|
|
"loss": 5.5038,
|
|
"mean_token_accuracy": 0.1639639750123024,
|
|
"num_tokens": 27182903.0,
|
|
"step": 14740
|
|
},
|
|
{
|
|
"entropy": 5.693592119216919,
|
|
"epoch": 1.2387733669397185,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004851092088622216,
|
|
"loss": 5.4264,
|
|
"mean_token_accuracy": 0.17083500623703002,
|
|
"num_tokens": 27192747.0,
|
|
"step": 14745
|
|
},
|
|
{
|
|
"entropy": 5.670225000381469,
|
|
"epoch": 1.2391934467548835,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004850984937253448,
|
|
"loss": 5.4402,
|
|
"mean_token_accuracy": 0.1658121481537819,
|
|
"num_tokens": 27201657.0,
|
|
"step": 14750
|
|
},
|
|
{
|
|
"entropy": 5.693979692459107,
|
|
"epoch": 1.2396135265700483,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004850877748666814,
|
|
"loss": 5.4621,
|
|
"mean_token_accuracy": 0.16480949372053147,
|
|
"num_tokens": 27211794.0,
|
|
"step": 14755
|
|
},
|
|
{
|
|
"entropy": 5.638466024398804,
|
|
"epoch": 1.240033606385213,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00048507705228642117,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.1595284804701805,
|
|
"num_tokens": 27221852.0,
|
|
"step": 14760
|
|
},
|
|
{
|
|
"entropy": 5.654482078552246,
|
|
"epoch": 1.240453686200378,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004850663259847542,
|
|
"loss": 5.4612,
|
|
"mean_token_accuracy": 0.158142551779747,
|
|
"num_tokens": 27231558.0,
|
|
"step": 14765
|
|
},
|
|
{
|
|
"entropy": 5.628722333908081,
|
|
"epoch": 1.240873766015543,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00048505559596187037,
|
|
"loss": 5.451,
|
|
"mean_token_accuracy": 0.16363227218389512,
|
|
"num_tokens": 27241053.0,
|
|
"step": 14770
|
|
},
|
|
{
|
|
"entropy": 5.614446783065796,
|
|
"epoch": 1.241293845830708,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004850448622179599,
|
|
"loss": 5.3357,
|
|
"mean_token_accuracy": 0.1671755015850067,
|
|
"num_tokens": 27249770.0,
|
|
"step": 14775
|
|
},
|
|
{
|
|
"entropy": 5.800767087936402,
|
|
"epoch": 1.2417139256458727,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004850341247532128,
|
|
"loss": 5.5805,
|
|
"mean_token_accuracy": 0.15848884508013725,
|
|
"num_tokens": 27258883.0,
|
|
"step": 14780
|
|
},
|
|
{
|
|
"entropy": 5.751977014541626,
|
|
"epoch": 1.2421340054610377,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004850233835678194,
|
|
"loss": 5.4846,
|
|
"mean_token_accuracy": 0.1624804139137268,
|
|
"num_tokens": 27268056.0,
|
|
"step": 14785
|
|
},
|
|
{
|
|
"entropy": 5.669937515258789,
|
|
"epoch": 1.2425540852762025,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004850126386619699,
|
|
"loss": 5.3487,
|
|
"mean_token_accuracy": 0.17517259567975998,
|
|
"num_tokens": 27276965.0,
|
|
"step": 14790
|
|
},
|
|
{
|
|
"entropy": 5.600133562088013,
|
|
"epoch": 1.2429741650913673,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004850018900358545,
|
|
"loss": 5.4149,
|
|
"mean_token_accuracy": 0.16797211319208144,
|
|
"num_tokens": 27286173.0,
|
|
"step": 14795
|
|
},
|
|
{
|
|
"entropy": 5.646801853179932,
|
|
"epoch": 1.2433942449065323,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00048499113768966386,
|
|
"loss": 5.4173,
|
|
"mean_token_accuracy": 0.16762335151433944,
|
|
"num_tokens": 27294863.0,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"entropy": 5.730639934539795,
|
|
"epoch": 1.243814324721697,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004849803816235884,
|
|
"loss": 5.4551,
|
|
"mean_token_accuracy": 0.16181258857250214,
|
|
"num_tokens": 27304427.0,
|
|
"step": 14805
|
|
},
|
|
{
|
|
"entropy": 5.7499453067779545,
|
|
"epoch": 1.244234404536862,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004849696218378185,
|
|
"loss": 5.53,
|
|
"mean_token_accuracy": 0.16161169856786728,
|
|
"num_tokens": 27313716.0,
|
|
"step": 14810
|
|
},
|
|
{
|
|
"entropy": 5.7411253452301025,
|
|
"epoch": 1.2446544843520269,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004849588583325449,
|
|
"loss": 5.4179,
|
|
"mean_token_accuracy": 0.17681172788143157,
|
|
"num_tokens": 27322342.0,
|
|
"step": 14815
|
|
},
|
|
{
|
|
"entropy": 5.742122983932495,
|
|
"epoch": 1.2450745641671919,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004849480911079583,
|
|
"loss": 5.4983,
|
|
"mean_token_accuracy": 0.15292923152446747,
|
|
"num_tokens": 27331892.0,
|
|
"step": 14820
|
|
},
|
|
{
|
|
"entropy": 5.687739038467408,
|
|
"epoch": 1.2454946439823567,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004849373201642493,
|
|
"loss": 5.4674,
|
|
"mean_token_accuracy": 0.15925178527832032,
|
|
"num_tokens": 27340428.0,
|
|
"step": 14825
|
|
},
|
|
{
|
|
"entropy": 5.6958386421203615,
|
|
"epoch": 1.2459147237975214,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004849265455016088,
|
|
"loss": 5.4664,
|
|
"mean_token_accuracy": 0.16365174651145936,
|
|
"num_tokens": 27349224.0,
|
|
"step": 14830
|
|
},
|
|
{
|
|
"entropy": 5.661598014831543,
|
|
"epoch": 1.2463348036126864,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004849157671202277,
|
|
"loss": 5.4434,
|
|
"mean_token_accuracy": 0.16567779928445817,
|
|
"num_tokens": 27357480.0,
|
|
"step": 14835
|
|
},
|
|
{
|
|
"entropy": 5.658696794509888,
|
|
"epoch": 1.2467548834278512,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004849049850202968,
|
|
"loss": 5.3717,
|
|
"mean_token_accuracy": 0.17218401432037353,
|
|
"num_tokens": 27366732.0,
|
|
"step": 14840
|
|
},
|
|
{
|
|
"entropy": 5.671054315567017,
|
|
"epoch": 1.2471749632430162,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004848941992020072,
|
|
"loss": 5.4774,
|
|
"mean_token_accuracy": 0.15841912627220153,
|
|
"num_tokens": 27375834.0,
|
|
"step": 14845
|
|
},
|
|
{
|
|
"entropy": 5.730887794494629,
|
|
"epoch": 1.247595043058181,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004848834096655499,
|
|
"loss": 5.4563,
|
|
"mean_token_accuracy": 0.16432572156190872,
|
|
"num_tokens": 27385311.0,
|
|
"step": 14850
|
|
},
|
|
{
|
|
"entropy": 5.700474452972412,
|
|
"epoch": 1.2480151228733458,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048487261641111607,
|
|
"loss": 5.5133,
|
|
"mean_token_accuracy": 0.16188574731349945,
|
|
"num_tokens": 27394587.0,
|
|
"step": 14855
|
|
},
|
|
{
|
|
"entropy": 5.581315422058106,
|
|
"epoch": 1.2484352026885108,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000484861819438897,
|
|
"loss": 5.3722,
|
|
"mean_token_accuracy": 0.1629566103219986,
|
|
"num_tokens": 27403316.0,
|
|
"step": 14860
|
|
},
|
|
{
|
|
"entropy": 5.674688768386841,
|
|
"epoch": 1.2488552825036756,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004848510187490838,
|
|
"loss": 5.4211,
|
|
"mean_token_accuracy": 0.16881508529186248,
|
|
"num_tokens": 27412709.0,
|
|
"step": 14865
|
|
},
|
|
{
|
|
"entropy": 5.717575883865356,
|
|
"epoch": 1.2492753623188406,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004848402143418679,
|
|
"loss": 5.4867,
|
|
"mean_token_accuracy": 0.16073511987924577,
|
|
"num_tokens": 27422004.0,
|
|
"step": 14870
|
|
},
|
|
{
|
|
"entropy": 5.667223167419434,
|
|
"epoch": 1.2496954421340054,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048482940621744053,
|
|
"loss": 5.5146,
|
|
"mean_token_accuracy": 0.16103297472000122,
|
|
"num_tokens": 27431931.0,
|
|
"step": 14875
|
|
},
|
|
{
|
|
"entropy": 5.64241132736206,
|
|
"epoch": 1.2501155219491704,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004848185943759934,
|
|
"loss": 5.3291,
|
|
"mean_token_accuracy": 0.17295840233564377,
|
|
"num_tokens": 27441527.0,
|
|
"step": 14880
|
|
},
|
|
{
|
|
"entropy": 5.751472759246826,
|
|
"epoch": 1.2505356017643352,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048480777881771786,
|
|
"loss": 5.488,
|
|
"mean_token_accuracy": 0.16338546127080916,
|
|
"num_tokens": 27449964.0,
|
|
"step": 14885
|
|
},
|
|
{
|
|
"entropy": 5.653960943222046,
|
|
"epoch": 1.2509556815795002,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004847969595428056,
|
|
"loss": 5.4769,
|
|
"mean_token_accuracy": 0.16023507416248323,
|
|
"num_tokens": 27459044.0,
|
|
"step": 14890
|
|
},
|
|
{
|
|
"entropy": 5.632353162765503,
|
|
"epoch": 1.251375761394665,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00048478613655144817,
|
|
"loss": 5.4677,
|
|
"mean_token_accuracy": 0.16684045344591142,
|
|
"num_tokens": 27467644.0,
|
|
"step": 14895
|
|
},
|
|
{
|
|
"entropy": 5.754183292388916,
|
|
"epoch": 1.2517958412098298,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004847753098438374,
|
|
"loss": 5.4969,
|
|
"mean_token_accuracy": 0.15503143072128295,
|
|
"num_tokens": 27476899.0,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"entropy": 5.713054418563843,
|
|
"epoch": 1.2522159210249948,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000484764479420165,
|
|
"loss": 5.3986,
|
|
"mean_token_accuracy": 0.16840293928980826,
|
|
"num_tokens": 27485167.0,
|
|
"step": 14905
|
|
},
|
|
{
|
|
"entropy": 5.67601432800293,
|
|
"epoch": 1.2526360008401596,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048475364528062287,
|
|
"loss": 5.4366,
|
|
"mean_token_accuracy": 0.15893664807081223,
|
|
"num_tokens": 27493986.0,
|
|
"step": 14910
|
|
},
|
|
{
|
|
"entropy": 5.717255640029907,
|
|
"epoch": 1.2530560806553246,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004847428074254029,
|
|
"loss": 5.481,
|
|
"mean_token_accuracy": 0.1676044538617134,
|
|
"num_tokens": 27503896.0,
|
|
"step": 14915
|
|
},
|
|
{
|
|
"entropy": 5.700136041641235,
|
|
"epoch": 1.2534761604704894,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048473196585469713,
|
|
"loss": 5.4409,
|
|
"mean_token_accuracy": 0.16624458730220795,
|
|
"num_tokens": 27513485.0,
|
|
"step": 14920
|
|
},
|
|
{
|
|
"entropy": 5.725602149963379,
|
|
"epoch": 1.2538962402856542,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048472112056869763,
|
|
"loss": 5.5032,
|
|
"mean_token_accuracy": 0.15849509388208388,
|
|
"num_tokens": 27523164.0,
|
|
"step": 14925
|
|
},
|
|
{
|
|
"entropy": 5.7331983089447025,
|
|
"epoch": 1.2543163201008192,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004847102715675964,
|
|
"loss": 5.4388,
|
|
"mean_token_accuracy": 0.16513479351997376,
|
|
"num_tokens": 27531387.0,
|
|
"step": 14930
|
|
},
|
|
{
|
|
"entropy": 5.6596925258636475,
|
|
"epoch": 1.254736399915984,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004846994188515857,
|
|
"loss": 5.4488,
|
|
"mean_token_accuracy": 0.16895988285541536,
|
|
"num_tokens": 27541754.0,
|
|
"step": 14935
|
|
},
|
|
{
|
|
"entropy": 5.79337100982666,
|
|
"epoch": 1.255156479731149,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004846885624208578,
|
|
"loss": 5.5214,
|
|
"mean_token_accuracy": 0.158653724193573,
|
|
"num_tokens": 27551458.0,
|
|
"step": 14940
|
|
},
|
|
{
|
|
"entropy": 5.685010766983032,
|
|
"epoch": 1.2555765595463138,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.000484677702275605,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.16842745393514633,
|
|
"num_tokens": 27560797.0,
|
|
"step": 14945
|
|
},
|
|
{
|
|
"entropy": 5.695211362838745,
|
|
"epoch": 1.2559966393614788,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048466683841601963,
|
|
"loss": 5.4206,
|
|
"mean_token_accuracy": 0.16701247841119765,
|
|
"num_tokens": 27570166.0,
|
|
"step": 14950
|
|
},
|
|
{
|
|
"entropy": 5.662879896163941,
|
|
"epoch": 1.2564167191766435,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048465597084229416,
|
|
"loss": 5.3411,
|
|
"mean_token_accuracy": 0.16752343326807023,
|
|
"num_tokens": 27579411.0,
|
|
"step": 14955
|
|
},
|
|
{
|
|
"entropy": 5.737317419052124,
|
|
"epoch": 1.2568367989918086,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004846450995546212,
|
|
"loss": 5.5894,
|
|
"mean_token_accuracy": 0.15929221510887145,
|
|
"num_tokens": 27589124.0,
|
|
"step": 14960
|
|
},
|
|
{
|
|
"entropy": 5.76739387512207,
|
|
"epoch": 1.2572568788069733,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004846342245531932,
|
|
"loss": 5.5526,
|
|
"mean_token_accuracy": 0.15253591239452363,
|
|
"num_tokens": 27598664.0,
|
|
"step": 14965
|
|
},
|
|
{
|
|
"entropy": 5.792992496490479,
|
|
"epoch": 1.2576769586221381,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004846233458382029,
|
|
"loss": 5.4779,
|
|
"mean_token_accuracy": 0.16482626497745514,
|
|
"num_tokens": 27607189.0,
|
|
"step": 14970
|
|
},
|
|
{
|
|
"entropy": 5.758588409423828,
|
|
"epoch": 1.2580970384373031,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048461246340984293,
|
|
"loss": 5.5099,
|
|
"mean_token_accuracy": 0.16399455666542054,
|
|
"num_tokens": 27616415.0,
|
|
"step": 14975
|
|
},
|
|
{
|
|
"entropy": 5.67619571685791,
|
|
"epoch": 1.258517118252468,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004846015772683061,
|
|
"loss": 5.4745,
|
|
"mean_token_accuracy": 0.1670221731066704,
|
|
"num_tokens": 27624492.0,
|
|
"step": 14980
|
|
},
|
|
{
|
|
"entropy": 5.610988140106201,
|
|
"epoch": 1.258937198067633,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00048459068741378526,
|
|
"loss": 5.3731,
|
|
"mean_token_accuracy": 0.16672062426805495,
|
|
"num_tokens": 27634243.0,
|
|
"step": 14985
|
|
},
|
|
{
|
|
"entropy": 5.695155811309815,
|
|
"epoch": 1.2593572778827977,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004845797938464734,
|
|
"loss": 5.4803,
|
|
"mean_token_accuracy": 0.16463592499494553,
|
|
"num_tokens": 27642887.0,
|
|
"step": 14990
|
|
},
|
|
{
|
|
"entropy": 5.7585619449615475,
|
|
"epoch": 1.2597773576979625,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004845688965665633,
|
|
"loss": 5.4946,
|
|
"mean_token_accuracy": 0.1642697721719742,
|
|
"num_tokens": 27652524.0,
|
|
"step": 14995
|
|
},
|
|
{
|
|
"entropy": 5.68261866569519,
|
|
"epoch": 1.2601974375131275,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048455799557424814,
|
|
"loss": 5.3471,
|
|
"mean_token_accuracy": 0.17591068595647813,
|
|
"num_tokens": 27661306.0,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 1.2601974375131275,
|
|
"eval_entropy": 5.542287499050695,
|
|
"eval_loss": 5.52593994140625,
|
|
"eval_mean_token_accuracy": 0.16979930738796262,
|
|
"eval_num_tokens": 27661306.0,
|
|
"eval_runtime": 27.4053,
|
|
"eval_samples_per_second": 1363.46,
|
|
"eval_steps_per_second": 170.442,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"entropy": 5.719019222259521,
|
|
"epoch": 1.2606175173282923,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004845470908697209,
|
|
"loss": 5.5345,
|
|
"mean_token_accuracy": 0.1672997236251831,
|
|
"num_tokens": 27671728.0,
|
|
"step": 15005
|
|
},
|
|
{
|
|
"entropy": 5.660177707672119,
|
|
"epoch": 1.2610375971434573,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000484536182453175,
|
|
"loss": 5.3345,
|
|
"mean_token_accuracy": 0.16970676183700562,
|
|
"num_tokens": 27680740.0,
|
|
"step": 15010
|
|
},
|
|
{
|
|
"entropy": 5.690030097961426,
|
|
"epoch": 1.261457676958622,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004845252703248035,
|
|
"loss": 5.4072,
|
|
"mean_token_accuracy": 0.16504298150539398,
|
|
"num_tokens": 27689865.0,
|
|
"step": 15015
|
|
},
|
|
{
|
|
"entropy": 5.6956014156341555,
|
|
"epoch": 1.2618777567737869,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004845143544847997,
|
|
"loss": 5.4473,
|
|
"mean_token_accuracy": 0.1682340383529663,
|
|
"num_tokens": 27700366.0,
|
|
"step": 15020
|
|
},
|
|
{
|
|
"entropy": 5.698393678665161,
|
|
"epoch": 1.262297836588952,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048450343493335697,
|
|
"loss": 5.3561,
|
|
"mean_token_accuracy": 0.17051917016506196,
|
|
"num_tokens": 27708893.0,
|
|
"step": 15025
|
|
},
|
|
{
|
|
"entropy": 5.611342048645019,
|
|
"epoch": 1.262717916404117,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004844925116706688,
|
|
"loss": 5.3771,
|
|
"mean_token_accuracy": 0.16306255012750626,
|
|
"num_tokens": 27717494.0,
|
|
"step": 15030
|
|
},
|
|
{
|
|
"entropy": 5.57361912727356,
|
|
"epoch": 1.2631379962192817,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.00048448158469692866,
|
|
"loss": 5.3038,
|
|
"mean_token_accuracy": 0.18097079247236253,
|
|
"num_tokens": 27726487.0,
|
|
"step": 15035
|
|
},
|
|
{
|
|
"entropy": 5.786226844787597,
|
|
"epoch": 1.2635580760344465,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004844706540123301,
|
|
"loss": 5.5463,
|
|
"mean_token_accuracy": 0.15970377177000045,
|
|
"num_tokens": 27736602.0,
|
|
"step": 15040
|
|
},
|
|
{
|
|
"entropy": 5.89350733757019,
|
|
"epoch": 1.2639781558496115,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048445971961706675,
|
|
"loss": 5.5419,
|
|
"mean_token_accuracy": 0.15724890679121017,
|
|
"num_tokens": 27746322.0,
|
|
"step": 15045
|
|
},
|
|
{
|
|
"entropy": 5.636753940582276,
|
|
"epoch": 1.2643982356647763,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004844487815113323,
|
|
"loss": 5.3895,
|
|
"mean_token_accuracy": 0.1694614127278328,
|
|
"num_tokens": 27754941.0,
|
|
"step": 15050
|
|
},
|
|
{
|
|
"entropy": 5.603873300552368,
|
|
"epoch": 1.2648183154799413,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004844378396953206,
|
|
"loss": 5.4706,
|
|
"mean_token_accuracy": 0.16238831877708435,
|
|
"num_tokens": 27763941.0,
|
|
"step": 15055
|
|
},
|
|
{
|
|
"entropy": 5.733206653594971,
|
|
"epoch": 1.265238395295106,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048442689416922536,
|
|
"loss": 5.4854,
|
|
"mean_token_accuracy": 0.16823527961969376,
|
|
"num_tokens": 27773087.0,
|
|
"step": 15060
|
|
},
|
|
{
|
|
"entropy": 5.640398788452148,
|
|
"epoch": 1.2656584751102709,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048441594493324057,
|
|
"loss": 5.3039,
|
|
"mean_token_accuracy": 0.17487951517105102,
|
|
"num_tokens": 27782648.0,
|
|
"step": 15065
|
|
},
|
|
{
|
|
"entropy": 5.66456823348999,
|
|
"epoch": 1.2660785549254359,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048440499198756015,
|
|
"loss": 5.5098,
|
|
"mean_token_accuracy": 0.16223005801439286,
|
|
"num_tokens": 27791567.0,
|
|
"step": 15070
|
|
},
|
|
{
|
|
"entropy": 5.695383977890015,
|
|
"epoch": 1.2664986347406006,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048439403533237816,
|
|
"loss": 5.499,
|
|
"mean_token_accuracy": 0.1588960826396942,
|
|
"num_tokens": 27801397.0,
|
|
"step": 15075
|
|
},
|
|
{
|
|
"entropy": 5.790954875946045,
|
|
"epoch": 1.2669187145557657,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004843830749678886,
|
|
"loss": 5.5147,
|
|
"mean_token_accuracy": 0.16107721030712127,
|
|
"num_tokens": 27810831.0,
|
|
"step": 15080
|
|
},
|
|
{
|
|
"entropy": 5.717430448532104,
|
|
"epoch": 1.2673387943709304,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004843721108942856,
|
|
"loss": 5.4237,
|
|
"mean_token_accuracy": 0.16757311969995498,
|
|
"num_tokens": 27819591.0,
|
|
"step": 15085
|
|
},
|
|
{
|
|
"entropy": 5.6086828231811525,
|
|
"epoch": 1.2677588741860952,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004843611431117636,
|
|
"loss": 5.4138,
|
|
"mean_token_accuracy": 0.1716834545135498,
|
|
"num_tokens": 27828614.0,
|
|
"step": 15090
|
|
},
|
|
{
|
|
"entropy": 5.673300123214721,
|
|
"epoch": 1.2681789540012602,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004843501716205167,
|
|
"loss": 5.4511,
|
|
"mean_token_accuracy": 0.165350541472435,
|
|
"num_tokens": 27837549.0,
|
|
"step": 15095
|
|
},
|
|
{
|
|
"entropy": 5.737055730819702,
|
|
"epoch": 1.2685990338164252,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004843391964207393,
|
|
"loss": 5.4743,
|
|
"mean_token_accuracy": 0.15991066843271257,
|
|
"num_tokens": 27846678.0,
|
|
"step": 15100
|
|
},
|
|
{
|
|
"entropy": 5.789986085891724,
|
|
"epoch": 1.26901911363159,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004843282175126258,
|
|
"loss": 5.4962,
|
|
"mean_token_accuracy": 0.1644158586859703,
|
|
"num_tokens": 27855734.0,
|
|
"step": 15105
|
|
},
|
|
{
|
|
"entropy": 5.703271150588989,
|
|
"epoch": 1.2694391934467548,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.00048431723489637086,
|
|
"loss": 5.4225,
|
|
"mean_token_accuracy": 0.16743371933698653,
|
|
"num_tokens": 27865111.0,
|
|
"step": 15110
|
|
},
|
|
{
|
|
"entropy": 5.7195985317230225,
|
|
"epoch": 1.2698592732619198,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00048430624857216876,
|
|
"loss": 5.4393,
|
|
"mean_token_accuracy": 0.1662244826555252,
|
|
"num_tokens": 27874495.0,
|
|
"step": 15115
|
|
},
|
|
{
|
|
"entropy": 5.6339555263519285,
|
|
"epoch": 1.2702793530770846,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004842952585402143,
|
|
"loss": 5.4758,
|
|
"mean_token_accuracy": 0.16450706571340562,
|
|
"num_tokens": 27884531.0,
|
|
"step": 15120
|
|
},
|
|
{
|
|
"entropy": 5.596436595916748,
|
|
"epoch": 1.2706994328922496,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.000484284264800702,
|
|
"loss": 5.3613,
|
|
"mean_token_accuracy": 0.17341870963573455,
|
|
"num_tokens": 27893463.0,
|
|
"step": 15125
|
|
},
|
|
{
|
|
"entropy": 5.757380199432373,
|
|
"epoch": 1.2711195127074144,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00048427326735382687,
|
|
"loss": 5.4724,
|
|
"mean_token_accuracy": 0.16172740906476973,
|
|
"num_tokens": 27903015.0,
|
|
"step": 15130
|
|
},
|
|
{
|
|
"entropy": 5.742963027954102,
|
|
"epoch": 1.2715395925225792,
|
|
"grad_norm": 9.8125,
|
|
"learning_rate": 0.0004842622661997834,
|
|
"loss": 5.4552,
|
|
"mean_token_accuracy": 0.16410297602415086,
|
|
"num_tokens": 27912207.0,
|
|
"step": 15135
|
|
},
|
|
{
|
|
"entropy": 5.6874500751495365,
|
|
"epoch": 1.2719596723377442,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004842512613387668,
|
|
"loss": 5.4679,
|
|
"mean_token_accuracy": 0.1574219599366188,
|
|
"num_tokens": 27921566.0,
|
|
"step": 15140
|
|
},
|
|
{
|
|
"entropy": 5.663531732559204,
|
|
"epoch": 1.272379752152909,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004842402527709718,
|
|
"loss": 5.4061,
|
|
"mean_token_accuracy": 0.16983576118946075,
|
|
"num_tokens": 27930633.0,
|
|
"step": 15145
|
|
},
|
|
{
|
|
"entropy": 5.78377251625061,
|
|
"epoch": 1.272799831968074,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004842292404965934,
|
|
"loss": 5.5197,
|
|
"mean_token_accuracy": 0.1595507562160492,
|
|
"num_tokens": 27939887.0,
|
|
"step": 15150
|
|
},
|
|
{
|
|
"entropy": 5.767408180236816,
|
|
"epoch": 1.2732199117832388,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004842182245158268,
|
|
"loss": 5.5257,
|
|
"mean_token_accuracy": 0.16959029585123062,
|
|
"num_tokens": 27949090.0,
|
|
"step": 15155
|
|
},
|
|
{
|
|
"entropy": 5.610546350479126,
|
|
"epoch": 1.2736399915984036,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048420720482886715,
|
|
"loss": 5.3312,
|
|
"mean_token_accuracy": 0.1733013227581978,
|
|
"num_tokens": 27958141.0,
|
|
"step": 15160
|
|
},
|
|
{
|
|
"entropy": 5.63969464302063,
|
|
"epoch": 1.2740600714135686,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004841961814359095,
|
|
"loss": 5.4047,
|
|
"mean_token_accuracy": 0.16643078476190568,
|
|
"num_tokens": 27967780.0,
|
|
"step": 15165
|
|
},
|
|
{
|
|
"entropy": 5.69786319732666,
|
|
"epoch": 1.2744801512287336,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048418515433714917,
|
|
"loss": 5.489,
|
|
"mean_token_accuracy": 0.16522752195596696,
|
|
"num_tokens": 27976243.0,
|
|
"step": 15170
|
|
},
|
|
{
|
|
"entropy": 5.6997581958770756,
|
|
"epoch": 1.2749002310438984,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004841741235327817,
|
|
"loss": 5.3579,
|
|
"mean_token_accuracy": 0.17067465782165528,
|
|
"num_tokens": 27985874.0,
|
|
"step": 15175
|
|
},
|
|
{
|
|
"entropy": 5.806114244461059,
|
|
"epoch": 1.2753203108590632,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048416308902300215,
|
|
"loss": 5.5921,
|
|
"mean_token_accuracy": 0.15702388137578965,
|
|
"num_tokens": 27995111.0,
|
|
"step": 15180
|
|
},
|
|
{
|
|
"entropy": 5.689389657974243,
|
|
"epoch": 1.2757403906742282,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004841520508080063,
|
|
"loss": 5.4127,
|
|
"mean_token_accuracy": 0.1689732179045677,
|
|
"num_tokens": 28003948.0,
|
|
"step": 15185
|
|
},
|
|
{
|
|
"entropy": 5.6548957347869875,
|
|
"epoch": 1.276160470489393,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.00048414100888798957,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.16478729695081712,
|
|
"num_tokens": 28012941.0,
|
|
"step": 15190
|
|
},
|
|
{
|
|
"entropy": 5.601344108581543,
|
|
"epoch": 1.276580550304558,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 0.0004841299632631475,
|
|
"loss": 5.41,
|
|
"mean_token_accuracy": 0.1636947825551033,
|
|
"num_tokens": 28022195.0,
|
|
"step": 15195
|
|
},
|
|
{
|
|
"entropy": 5.65929913520813,
|
|
"epoch": 1.2770006301197228,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004841189139336759,
|
|
"loss": 5.3589,
|
|
"mean_token_accuracy": 0.16983367949724198,
|
|
"num_tokens": 28031446.0,
|
|
"step": 15200
|
|
},
|
|
{
|
|
"entropy": 5.688397693634033,
|
|
"epoch": 1.2774207099348875,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004841078608997703,
|
|
"loss": 5.3801,
|
|
"mean_token_accuracy": 0.17025842219591142,
|
|
"num_tokens": 28040906.0,
|
|
"step": 15205
|
|
},
|
|
{
|
|
"entropy": 5.676456069946289,
|
|
"epoch": 1.2778407897500526,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004840968041616267,
|
|
"loss": 5.3894,
|
|
"mean_token_accuracy": 0.1704905390739441,
|
|
"num_tokens": 28049848.0,
|
|
"step": 15210
|
|
},
|
|
{
|
|
"entropy": 5.67938723564148,
|
|
"epoch": 1.2782608695652173,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048408574371944094,
|
|
"loss": 5.3732,
|
|
"mean_token_accuracy": 0.16771376579999925,
|
|
"num_tokens": 28058276.0,
|
|
"step": 15215
|
|
},
|
|
{
|
|
"entropy": 5.688129663467407,
|
|
"epoch": 1.2786809493803823,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004840746795734088,
|
|
"loss": 5.5029,
|
|
"mean_token_accuracy": 0.1592990979552269,
|
|
"num_tokens": 28068185.0,
|
|
"step": 15220
|
|
},
|
|
{
|
|
"entropy": 5.77323579788208,
|
|
"epoch": 1.2791010291955471,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004840636117237264,
|
|
"loss": 5.5346,
|
|
"mean_token_accuracy": 0.16309675723314285,
|
|
"num_tokens": 28077532.0,
|
|
"step": 15225
|
|
},
|
|
{
|
|
"entropy": 5.695499229431152,
|
|
"epoch": 1.279521109010712,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004840525401705897,
|
|
"loss": 5.3962,
|
|
"mean_token_accuracy": 0.16487024575471879,
|
|
"num_tokens": 28087593.0,
|
|
"step": 15230
|
|
},
|
|
{
|
|
"entropy": 5.651865243911743,
|
|
"epoch": 1.279941188825877,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.00048404146491419503,
|
|
"loss": 5.3617,
|
|
"mean_token_accuracy": 0.17026301175355912,
|
|
"num_tokens": 28096256.0,
|
|
"step": 15235
|
|
},
|
|
{
|
|
"entropy": 5.682730484008789,
|
|
"epoch": 1.2803612686410417,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 0.00048403038595473837,
|
|
"loss": 5.3999,
|
|
"mean_token_accuracy": 0.1683255612850189,
|
|
"num_tokens": 28105048.0,
|
|
"step": 15240
|
|
},
|
|
{
|
|
"entropy": 5.698611879348755,
|
|
"epoch": 1.2807813484562067,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.000484019303292416,
|
|
"loss": 5.4677,
|
|
"mean_token_accuracy": 0.15729653239250183,
|
|
"num_tokens": 28114330.0,
|
|
"step": 15245
|
|
},
|
|
{
|
|
"entropy": 5.666230535507202,
|
|
"epoch": 1.2812014282713715,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048400821692742434,
|
|
"loss": 5.3826,
|
|
"mean_token_accuracy": 0.17221412509679795,
|
|
"num_tokens": 28123147.0,
|
|
"step": 15250
|
|
},
|
|
{
|
|
"entropy": 5.731086874008179,
|
|
"epoch": 1.2816215080865365,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048399712685995983,
|
|
"loss": 5.519,
|
|
"mean_token_accuracy": 0.16596773117780686,
|
|
"num_tokens": 28132477.0,
|
|
"step": 15255
|
|
},
|
|
{
|
|
"entropy": 5.683180570602417,
|
|
"epoch": 1.2820415879017013,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048398603309021877,
|
|
"loss": 5.5007,
|
|
"mean_token_accuracy": 0.16307283490896224,
|
|
"num_tokens": 28141350.0,
|
|
"step": 15260
|
|
},
|
|
{
|
|
"entropy": 5.718101358413696,
|
|
"epoch": 1.2824616677168663,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004839749356183978,
|
|
"loss": 5.4452,
|
|
"mean_token_accuracy": 0.16625609248876572,
|
|
"num_tokens": 28149522.0,
|
|
"step": 15265
|
|
},
|
|
{
|
|
"entropy": 5.71740870475769,
|
|
"epoch": 1.282881747532031,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004839638344446933,
|
|
"loss": 5.5484,
|
|
"mean_token_accuracy": 0.16156259179115295,
|
|
"num_tokens": 28159646.0,
|
|
"step": 15270
|
|
},
|
|
{
|
|
"entropy": 5.810041522979736,
|
|
"epoch": 1.283301827347196,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004839527295693023,
|
|
"loss": 5.4631,
|
|
"mean_token_accuracy": 0.1712553933262825,
|
|
"num_tokens": 28168408.0,
|
|
"step": 15275
|
|
},
|
|
{
|
|
"entropy": 5.740299415588379,
|
|
"epoch": 1.283721907162361,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0004839416209924211,
|
|
"loss": 5.4659,
|
|
"mean_token_accuracy": 0.16082556098699569,
|
|
"num_tokens": 28177744.0,
|
|
"step": 15280
|
|
},
|
|
{
|
|
"entropy": 5.74624080657959,
|
|
"epoch": 1.2841419869775257,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00048393050871424676,
|
|
"loss": 5.5276,
|
|
"mean_token_accuracy": 0.16067055016756057,
|
|
"num_tokens": 28186811.0,
|
|
"step": 15285
|
|
},
|
|
{
|
|
"entropy": 5.6819815158844,
|
|
"epoch": 1.2845620667926907,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.000483919392734976,
|
|
"loss": 5.5012,
|
|
"mean_token_accuracy": 0.15652224719524382,
|
|
"num_tokens": 28197052.0,
|
|
"step": 15290
|
|
},
|
|
{
|
|
"entropy": 5.707629013061523,
|
|
"epoch": 1.2849821466078555,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004839082730548058,
|
|
"loss": 5.3546,
|
|
"mean_token_accuracy": 0.1764655143022537,
|
|
"num_tokens": 28206000.0,
|
|
"step": 15295
|
|
},
|
|
{
|
|
"entropy": 5.692590618133545,
|
|
"epoch": 1.2854022264230203,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004838971496739331,
|
|
"loss": 5.3416,
|
|
"mean_token_accuracy": 0.16673224717378615,
|
|
"num_tokens": 28214679.0,
|
|
"step": 15300
|
|
},
|
|
{
|
|
"entropy": 5.616611909866333,
|
|
"epoch": 1.2858223062381853,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000483886022592555,
|
|
"loss": 5.4572,
|
|
"mean_token_accuracy": 0.16383219435811042,
|
|
"num_tokens": 28223890.0,
|
|
"step": 15305
|
|
},
|
|
{
|
|
"entropy": 5.671573495864868,
|
|
"epoch": 1.28624238605335,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004838748918108685,
|
|
"loss": 5.3889,
|
|
"mean_token_accuracy": 0.16743310987949372,
|
|
"num_tokens": 28232422.0,
|
|
"step": 15310
|
|
},
|
|
{
|
|
"entropy": 5.661684656143189,
|
|
"epoch": 1.286662465868515,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.00048386375732907083,
|
|
"loss": 5.4321,
|
|
"mean_token_accuracy": 0.1664291650056839,
|
|
"num_tokens": 28242079.0,
|
|
"step": 15315
|
|
},
|
|
{
|
|
"entropy": 5.772406101226807,
|
|
"epoch": 1.2870825456836799,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00048385261914735936,
|
|
"loss": 5.626,
|
|
"mean_token_accuracy": 0.1569541186094284,
|
|
"num_tokens": 28252510.0,
|
|
"step": 15320
|
|
},
|
|
{
|
|
"entropy": 5.816063642501831,
|
|
"epoch": 1.2875026254988446,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.00048384147726593125,
|
|
"loss": 5.5211,
|
|
"mean_token_accuracy": 0.1613934814929962,
|
|
"num_tokens": 28261348.0,
|
|
"step": 15325
|
|
},
|
|
{
|
|
"entropy": 5.7399543762207035,
|
|
"epoch": 1.2879227053140097,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004838303316849839,
|
|
"loss": 5.4373,
|
|
"mean_token_accuracy": 0.15664124339818955,
|
|
"num_tokens": 28270739.0,
|
|
"step": 15330
|
|
},
|
|
{
|
|
"entropy": 5.7096014499664305,
|
|
"epoch": 1.2883427851291747,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048381918240471473,
|
|
"loss": 5.4913,
|
|
"mean_token_accuracy": 0.15497729554772377,
|
|
"num_tokens": 28279370.0,
|
|
"step": 15335
|
|
},
|
|
{
|
|
"entropy": 5.726278638839721,
|
|
"epoch": 1.2887628649443394,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048380802942532124,
|
|
"loss": 5.411,
|
|
"mean_token_accuracy": 0.1654820501804352,
|
|
"num_tokens": 28287955.0,
|
|
"step": 15340
|
|
},
|
|
{
|
|
"entropy": 5.604457712173462,
|
|
"epoch": 1.2891829447595042,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048379687274700107,
|
|
"loss": 5.3613,
|
|
"mean_token_accuracy": 0.17298102527856826,
|
|
"num_tokens": 28296832.0,
|
|
"step": 15345
|
|
},
|
|
{
|
|
"entropy": 5.598322010040283,
|
|
"epoch": 1.2896030245746692,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048378571236995185,
|
|
"loss": 5.3944,
|
|
"mean_token_accuracy": 0.166165030002594,
|
|
"num_tokens": 28305778.0,
|
|
"step": 15350
|
|
},
|
|
{
|
|
"entropy": 5.761275959014893,
|
|
"epoch": 1.290023104389834,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00048377454829437124,
|
|
"loss": 5.4484,
|
|
"mean_token_accuracy": 0.1619205430150032,
|
|
"num_tokens": 28314615.0,
|
|
"step": 15355
|
|
},
|
|
{
|
|
"entropy": 5.827945566177368,
|
|
"epoch": 1.290443184204999,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004837633805204569,
|
|
"loss": 5.5111,
|
|
"mean_token_accuracy": 0.16340176910161971,
|
|
"num_tokens": 28324478.0,
|
|
"step": 15360
|
|
},
|
|
{
|
|
"entropy": 5.753641033172608,
|
|
"epoch": 1.2908632640201638,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004837522090484069,
|
|
"loss": 5.4739,
|
|
"mean_token_accuracy": 0.16428422480821608,
|
|
"num_tokens": 28333532.0,
|
|
"step": 15365
|
|
},
|
|
{
|
|
"entropy": 5.720655488967895,
|
|
"epoch": 1.2912833438353286,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00048374103387841894,
|
|
"loss": 5.4456,
|
|
"mean_token_accuracy": 0.15933494865894318,
|
|
"num_tokens": 28343723.0,
|
|
"step": 15370
|
|
},
|
|
{
|
|
"entropy": 5.728183746337891,
|
|
"epoch": 1.2917034236504936,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048372985501069106,
|
|
"loss": 5.4241,
|
|
"mean_token_accuracy": 0.1650676444172859,
|
|
"num_tokens": 28351992.0,
|
|
"step": 15375
|
|
},
|
|
{
|
|
"entropy": 5.65154390335083,
|
|
"epoch": 1.2921235034656584,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004837186724454213,
|
|
"loss": 5.4075,
|
|
"mean_token_accuracy": 0.16652555614709855,
|
|
"num_tokens": 28361141.0,
|
|
"step": 15380
|
|
},
|
|
{
|
|
"entropy": 5.664861392974854,
|
|
"epoch": 1.2925435832808234,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004837074861828077,
|
|
"loss": 5.3951,
|
|
"mean_token_accuracy": 0.16747472435235977,
|
|
"num_tokens": 28370339.0,
|
|
"step": 15385
|
|
},
|
|
{
|
|
"entropy": 5.725724220275879,
|
|
"epoch": 1.2929636630959882,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004836962962230485,
|
|
"loss": 5.5142,
|
|
"mean_token_accuracy": 0.16315443962812423,
|
|
"num_tokens": 28379242.0,
|
|
"step": 15390
|
|
},
|
|
{
|
|
"entropy": 5.659032392501831,
|
|
"epoch": 1.293383742911153,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004836851025663418,
|
|
"loss": 5.4054,
|
|
"mean_token_accuracy": 0.1692844420671463,
|
|
"num_tokens": 28388864.0,
|
|
"step": 15395
|
|
},
|
|
{
|
|
"entropy": 5.7302182674407955,
|
|
"epoch": 1.293803822726318,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.000483673905212886,
|
|
"loss": 5.5045,
|
|
"mean_token_accuracy": 0.16604892164468765,
|
|
"num_tokens": 28398000.0,
|
|
"step": 15400
|
|
},
|
|
{
|
|
"entropy": 5.645801734924317,
|
|
"epoch": 1.294223902541483,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004836627041628794,
|
|
"loss": 5.4445,
|
|
"mean_token_accuracy": 0.1687624305486679,
|
|
"num_tokens": 28407652.0,
|
|
"step": 15405
|
|
},
|
|
{
|
|
"entropy": 5.7521144390106205,
|
|
"epoch": 1.2946439823566478,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004836514994165205,
|
|
"loss": 5.4993,
|
|
"mean_token_accuracy": 0.16120134592056273,
|
|
"num_tokens": 28417694.0,
|
|
"step": 15410
|
|
},
|
|
{
|
|
"entropy": 5.694954919815063,
|
|
"epoch": 1.2950640621718126,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048364029097400777,
|
|
"loss": 5.442,
|
|
"mean_token_accuracy": 0.16629258692264556,
|
|
"num_tokens": 28426928.0,
|
|
"step": 15415
|
|
},
|
|
{
|
|
"entropy": 5.664297342300415,
|
|
"epoch": 1.2954841419869776,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048362907883553956,
|
|
"loss": 5.4714,
|
|
"mean_token_accuracy": 0.15762439966201783,
|
|
"num_tokens": 28436176.0,
|
|
"step": 15420
|
|
},
|
|
{
|
|
"entropy": 5.728027105331421,
|
|
"epoch": 1.2959042218021424,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00048361786300131477,
|
|
"loss": 5.5363,
|
|
"mean_token_accuracy": 0.15678158700466155,
|
|
"num_tokens": 28445277.0,
|
|
"step": 15425
|
|
},
|
|
{
|
|
"entropy": 5.784550476074219,
|
|
"epoch": 1.2963243016173074,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004836066434715319,
|
|
"loss": 5.4399,
|
|
"mean_token_accuracy": 0.16050161719322203,
|
|
"num_tokens": 28453959.0,
|
|
"step": 15430
|
|
},
|
|
{
|
|
"entropy": 5.718553638458252,
|
|
"epoch": 1.2967443814324722,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004835954202463898,
|
|
"loss": 5.5243,
|
|
"mean_token_accuracy": 0.16090073585510253,
|
|
"num_tokens": 28463780.0,
|
|
"step": 15435
|
|
},
|
|
{
|
|
"entropy": 5.64632830619812,
|
|
"epoch": 1.297164461247637,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004835841933260872,
|
|
"loss": 5.3784,
|
|
"mean_token_accuracy": 0.16484325826168061,
|
|
"num_tokens": 28473299.0,
|
|
"step": 15440
|
|
},
|
|
{
|
|
"entropy": 5.666690301895142,
|
|
"epoch": 1.297584541062802,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.00048357296271082305,
|
|
"loss": 5.4216,
|
|
"mean_token_accuracy": 0.16306840777397155,
|
|
"num_tokens": 28481859.0,
|
|
"step": 15445
|
|
},
|
|
{
|
|
"entropy": 5.80743989944458,
|
|
"epoch": 1.2980046208779668,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00048356172840079625,
|
|
"loss": 5.4795,
|
|
"mean_token_accuracy": 0.16350326538085938,
|
|
"num_tokens": 28491034.0,
|
|
"step": 15450
|
|
},
|
|
{
|
|
"entropy": 5.697645139694214,
|
|
"epoch": 1.2984247006931318,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004835504903962058,
|
|
"loss": 5.3839,
|
|
"mean_token_accuracy": 0.16248102933168412,
|
|
"num_tokens": 28499829.0,
|
|
"step": 15455
|
|
},
|
|
{
|
|
"entropy": 5.623191022872925,
|
|
"epoch": 1.2988447805082965,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048353924869725084,
|
|
"loss": 5.3937,
|
|
"mean_token_accuracy": 0.1705133929848671,
|
|
"num_tokens": 28508188.0,
|
|
"step": 15460
|
|
},
|
|
{
|
|
"entropy": 5.609925365447998,
|
|
"epoch": 1.2992648603234613,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004835280033041305,
|
|
"loss": 5.2948,
|
|
"mean_token_accuracy": 0.16951121538877487,
|
|
"num_tokens": 28516509.0,
|
|
"step": 15465
|
|
},
|
|
{
|
|
"entropy": 5.652699041366577,
|
|
"epoch": 1.2996849401386263,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004835167542170439,
|
|
"loss": 5.5169,
|
|
"mean_token_accuracy": 0.16390926837921144,
|
|
"num_tokens": 28526457.0,
|
|
"step": 15470
|
|
},
|
|
{
|
|
"entropy": 5.70890064239502,
|
|
"epoch": 1.3001050199537914,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004835055014361904,
|
|
"loss": 5.461,
|
|
"mean_token_accuracy": 0.16140211522579193,
|
|
"num_tokens": 28536149.0,
|
|
"step": 15475
|
|
},
|
|
{
|
|
"entropy": 5.776080131530762,
|
|
"epoch": 1.3005250997689561,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048349424496176924,
|
|
"loss": 5.5146,
|
|
"mean_token_accuracy": 0.16204932928085328,
|
|
"num_tokens": 28545486.0,
|
|
"step": 15480
|
|
},
|
|
{
|
|
"entropy": 5.693456315994263,
|
|
"epoch": 1.300945179584121,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00048348298479397996,
|
|
"loss": 5.4013,
|
|
"mean_token_accuracy": 0.1665617987513542,
|
|
"num_tokens": 28554555.0,
|
|
"step": 15485
|
|
},
|
|
{
|
|
"entropy": 5.563140153884888,
|
|
"epoch": 1.301365259399286,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048347172093302196,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.17032357305288315,
|
|
"num_tokens": 28563387.0,
|
|
"step": 15490
|
|
},
|
|
{
|
|
"entropy": 5.654443550109863,
|
|
"epoch": 1.3017853392144507,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00048346045337909475,
|
|
"loss": 5.4198,
|
|
"mean_token_accuracy": 0.16440292894840242,
|
|
"num_tokens": 28573437.0,
|
|
"step": 15495
|
|
},
|
|
{
|
|
"entropy": 5.641400241851807,
|
|
"epoch": 1.3022054190296157,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.000483449182132398,
|
|
"loss": 5.3656,
|
|
"mean_token_accuracy": 0.17342451214790344,
|
|
"num_tokens": 28583362.0,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"entropy": 5.808328342437744,
|
|
"epoch": 1.3026254988447805,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.00048343790719313124,
|
|
"loss": 5.553,
|
|
"mean_token_accuracy": 0.15858516097068787,
|
|
"num_tokens": 28593201.0,
|
|
"step": 15505
|
|
},
|
|
{
|
|
"entropy": 5.6986161231994625,
|
|
"epoch": 1.3030455786599453,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00048342662856149427,
|
|
"loss": 5.452,
|
|
"mean_token_accuracy": 0.15802465230226517,
|
|
"num_tokens": 28602486.0,
|
|
"step": 15510
|
|
},
|
|
{
|
|
"entropy": 5.641084289550781,
|
|
"epoch": 1.3034656584751103,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000483415346237687,
|
|
"loss": 5.4635,
|
|
"mean_token_accuracy": 0.163986237347126,
|
|
"num_tokens": 28611643.0,
|
|
"step": 15515
|
|
},
|
|
{
|
|
"entropy": 5.741965579986572,
|
|
"epoch": 1.303885738290275,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004834040602219091,
|
|
"loss": 5.511,
|
|
"mean_token_accuracy": 0.16517338454723357,
|
|
"num_tokens": 28620545.0,
|
|
"step": 15520
|
|
},
|
|
{
|
|
"entropy": 5.687145090103149,
|
|
"epoch": 1.30430581810544,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048339277051436067,
|
|
"loss": 5.4423,
|
|
"mean_token_accuracy": 0.16573746055364608,
|
|
"num_tokens": 28630024.0,
|
|
"step": 15525
|
|
},
|
|
{
|
|
"entropy": 5.800404119491577,
|
|
"epoch": 1.304725897920605,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004833814771152415,
|
|
"loss": 5.4982,
|
|
"mean_token_accuracy": 0.1673808366060257,
|
|
"num_tokens": 28638995.0,
|
|
"step": 15530
|
|
},
|
|
{
|
|
"entropy": 5.6978675365448,
|
|
"epoch": 1.3051459777357697,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048337018002475184,
|
|
"loss": 5.4483,
|
|
"mean_token_accuracy": 0.1675184115767479,
|
|
"num_tokens": 28647833.0,
|
|
"step": 15535
|
|
},
|
|
{
|
|
"entropy": 5.632976531982422,
|
|
"epoch": 1.3055660575509347,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004833588792430917,
|
|
"loss": 5.3562,
|
|
"mean_token_accuracy": 0.16957587152719497,
|
|
"num_tokens": 28657441.0,
|
|
"step": 15540
|
|
},
|
|
{
|
|
"entropy": 5.710914278030396,
|
|
"epoch": 1.3059861373660997,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004833475747704614,
|
|
"loss": 5.4746,
|
|
"mean_token_accuracy": 0.16293687522411346,
|
|
"num_tokens": 28666666.0,
|
|
"step": 15545
|
|
},
|
|
{
|
|
"entropy": 5.711972379684449,
|
|
"epoch": 1.3064062171812645,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.000483336266607061,
|
|
"loss": 5.4684,
|
|
"mean_token_accuracy": 0.16195246577262878,
|
|
"num_tokens": 28676770.0,
|
|
"step": 15550
|
|
},
|
|
{
|
|
"entropy": 5.71502652168274,
|
|
"epoch": 1.3068262969964293,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048332495475309097,
|
|
"loss": 5.3882,
|
|
"mean_token_accuracy": 0.16904159635305405,
|
|
"num_tokens": 28685610.0,
|
|
"step": 15555
|
|
},
|
|
{
|
|
"entropy": 5.733300971984863,
|
|
"epoch": 1.3072463768115943,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048331363920875155,
|
|
"loss": 5.4835,
|
|
"mean_token_accuracy": 0.1614070475101471,
|
|
"num_tokens": 28695082.0,
|
|
"step": 15560
|
|
},
|
|
{
|
|
"entropy": 5.6674620628356935,
|
|
"epoch": 1.307666456626759,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048330231997424335,
|
|
"loss": 5.3919,
|
|
"mean_token_accuracy": 0.1674228772521019,
|
|
"num_tokens": 28704006.0,
|
|
"step": 15565
|
|
},
|
|
{
|
|
"entropy": 5.664810228347778,
|
|
"epoch": 1.308086536441924,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004832909970497668,
|
|
"loss": 5.4412,
|
|
"mean_token_accuracy": 0.16440101712942123,
|
|
"num_tokens": 28713665.0,
|
|
"step": 15570
|
|
},
|
|
{
|
|
"entropy": 5.687829685211182,
|
|
"epoch": 1.3085066162570889,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048327967043552245,
|
|
"loss": 5.3995,
|
|
"mean_token_accuracy": 0.16435023695230483,
|
|
"num_tokens": 28722920.0,
|
|
"step": 15575
|
|
},
|
|
{
|
|
"entropy": 5.689635181427002,
|
|
"epoch": 1.3089266960722536,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048326834013171107,
|
|
"loss": 5.348,
|
|
"mean_token_accuracy": 0.1712331637740135,
|
|
"num_tokens": 28731689.0,
|
|
"step": 15580
|
|
},
|
|
{
|
|
"entropy": 5.734625387191772,
|
|
"epoch": 1.3093467758874187,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004832570061385332,
|
|
"loss": 5.4711,
|
|
"mean_token_accuracy": 0.17253154814243316,
|
|
"num_tokens": 28741308.0,
|
|
"step": 15585
|
|
},
|
|
{
|
|
"entropy": 5.603468322753907,
|
|
"epoch": 1.3097668557025834,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004832456684561898,
|
|
"loss": 5.4311,
|
|
"mean_token_accuracy": 0.16657552123069763,
|
|
"num_tokens": 28750190.0,
|
|
"step": 15590
|
|
},
|
|
{
|
|
"entropy": 5.622490262985229,
|
|
"epoch": 1.3101869355177485,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004832343270848815,
|
|
"loss": 5.5019,
|
|
"mean_token_accuracy": 0.16145084649324418,
|
|
"num_tokens": 28759588.0,
|
|
"step": 15595
|
|
},
|
|
{
|
|
"entropy": 5.707578086853028,
|
|
"epoch": 1.3106070153329132,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00048322298202480935,
|
|
"loss": 5.5023,
|
|
"mean_token_accuracy": 0.162407810986042,
|
|
"num_tokens": 28768800.0,
|
|
"step": 15600
|
|
},
|
|
{
|
|
"entropy": 5.782344579696655,
|
|
"epoch": 1.311027095148078,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.00048321163327617433,
|
|
"loss": 5.4337,
|
|
"mean_token_accuracy": 0.16309218406677245,
|
|
"num_tokens": 28778108.0,
|
|
"step": 15605
|
|
},
|
|
{
|
|
"entropy": 5.753531789779663,
|
|
"epoch": 1.311447174963243,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004832002808391775,
|
|
"loss": 5.428,
|
|
"mean_token_accuracy": 0.16352954655885696,
|
|
"num_tokens": 28787202.0,
|
|
"step": 15610
|
|
},
|
|
{
|
|
"entropy": 5.679688262939453,
|
|
"epoch": 1.3118672547784078,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004831889247140198,
|
|
"loss": 5.4529,
|
|
"mean_token_accuracy": 0.16261952072381974,
|
|
"num_tokens": 28797482.0,
|
|
"step": 15615
|
|
},
|
|
{
|
|
"entropy": 5.587442255020141,
|
|
"epoch": 1.3122873345935728,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00048317756490090253,
|
|
"loss": 5.3885,
|
|
"mean_token_accuracy": 0.16671659797430038,
|
|
"num_tokens": 28805872.0,
|
|
"step": 15620
|
|
},
|
|
{
|
|
"entropy": 5.645391368865967,
|
|
"epoch": 1.3127074144087376,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00048316620140002685,
|
|
"loss": 5.5111,
|
|
"mean_token_accuracy": 0.15997616499662398,
|
|
"num_tokens": 28814836.0,
|
|
"step": 15625
|
|
},
|
|
{
|
|
"entropy": 5.78643798828125,
|
|
"epoch": 1.3131274942239024,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004831548342115942,
|
|
"loss": 5.4727,
|
|
"mean_token_accuracy": 0.1584260269999504,
|
|
"num_tokens": 28824727.0,
|
|
"step": 15630
|
|
},
|
|
{
|
|
"entropy": 5.820865345001221,
|
|
"epoch": 1.3135475740390674,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048314346333580576,
|
|
"loss": 5.5875,
|
|
"mean_token_accuracy": 0.1578096106648445,
|
|
"num_tokens": 28833848.0,
|
|
"step": 15635
|
|
},
|
|
{
|
|
"entropy": 5.667257070541382,
|
|
"epoch": 1.3139676538542324,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004831320887728631,
|
|
"loss": 5.3397,
|
|
"mean_token_accuracy": 0.16978776156902314,
|
|
"num_tokens": 28842198.0,
|
|
"step": 15640
|
|
},
|
|
{
|
|
"entropy": 5.667364835739136,
|
|
"epoch": 1.3143877336693972,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004831207105229676,
|
|
"loss": 5.4355,
|
|
"mean_token_accuracy": 0.16604958921670915,
|
|
"num_tokens": 28851804.0,
|
|
"step": 15645
|
|
},
|
|
{
|
|
"entropy": 5.605535078048706,
|
|
"epoch": 1.314807813484562,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048310932858632087,
|
|
"loss": 5.3583,
|
|
"mean_token_accuracy": 0.16956010460853577,
|
|
"num_tokens": 28860181.0,
|
|
"step": 15650
|
|
},
|
|
{
|
|
"entropy": 5.634918832778931,
|
|
"epoch": 1.315227893299727,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00048309794296312467,
|
|
"loss": 5.4172,
|
|
"mean_token_accuracy": 0.17280941605567932,
|
|
"num_tokens": 28869945.0,
|
|
"step": 15655
|
|
},
|
|
{
|
|
"entropy": 5.699268817901611,
|
|
"epoch": 1.3156479731148918,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048308655365358053,
|
|
"loss": 5.4639,
|
|
"mean_token_accuracy": 0.1694648638367653,
|
|
"num_tokens": 28880343.0,
|
|
"step": 15660
|
|
},
|
|
{
|
|
"entropy": 5.794540119171143,
|
|
"epoch": 1.3160680529300568,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00048307516065789017,
|
|
"loss": 5.5316,
|
|
"mean_token_accuracy": 0.15753707140684128,
|
|
"num_tokens": 28889441.0,
|
|
"step": 15665
|
|
},
|
|
{
|
|
"entropy": 5.740979290008545,
|
|
"epoch": 1.3164881327452216,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048306376397625546,
|
|
"loss": 5.4851,
|
|
"mean_token_accuracy": 0.15848094820976258,
|
|
"num_tokens": 28898154.0,
|
|
"step": 15670
|
|
},
|
|
{
|
|
"entropy": 5.736214065551758,
|
|
"epoch": 1.3169082125603864,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.00048305236360887834,
|
|
"loss": 5.4881,
|
|
"mean_token_accuracy": 0.16091601997613908,
|
|
"num_tokens": 28908359.0,
|
|
"step": 15675
|
|
},
|
|
{
|
|
"entropy": 5.694441890716552,
|
|
"epoch": 1.3173282923755514,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048304095955596074,
|
|
"loss": 5.4821,
|
|
"mean_token_accuracy": 0.16323225647211076,
|
|
"num_tokens": 28918416.0,
|
|
"step": 15680
|
|
},
|
|
{
|
|
"entropy": 5.743959140777588,
|
|
"epoch": 1.3177483721907162,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004830295518177047,
|
|
"loss": 5.3966,
|
|
"mean_token_accuracy": 0.17162241786718369,
|
|
"num_tokens": 28927412.0,
|
|
"step": 15685
|
|
},
|
|
{
|
|
"entropy": 5.679540491104126,
|
|
"epoch": 1.3181684520058812,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.00048301814039431227,
|
|
"loss": 5.4299,
|
|
"mean_token_accuracy": 0.1644519239664078,
|
|
"num_tokens": 28936106.0,
|
|
"step": 15690
|
|
},
|
|
{
|
|
"entropy": 5.6732524871826175,
|
|
"epoch": 1.318588531821046,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.00048300672528598553,
|
|
"loss": 5.4675,
|
|
"mean_token_accuracy": 0.16666047424077987,
|
|
"num_tokens": 28945197.0,
|
|
"step": 15695
|
|
},
|
|
{
|
|
"entropy": 5.782284116744995,
|
|
"epoch": 1.3190086116362107,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0004829953064929268,
|
|
"loss": 5.5033,
|
|
"mean_token_accuracy": 0.15363497659564018,
|
|
"num_tokens": 28954278.0,
|
|
"step": 15700
|
|
},
|
|
{
|
|
"entropy": 5.822621250152588,
|
|
"epoch": 1.3194286914513758,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004829838840153383,
|
|
"loss": 5.55,
|
|
"mean_token_accuracy": 0.16536147743463517,
|
|
"num_tokens": 28963101.0,
|
|
"step": 15705
|
|
},
|
|
{
|
|
"entropy": 5.619999361038208,
|
|
"epoch": 1.3198487712665408,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004829724578534224,
|
|
"loss": 5.4466,
|
|
"mean_token_accuracy": 0.16242460757493973,
|
|
"num_tokens": 28972063.0,
|
|
"step": 15710
|
|
},
|
|
{
|
|
"entropy": 5.685150098800659,
|
|
"epoch": 1.3202688510817056,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048296102800738153,
|
|
"loss": 5.4051,
|
|
"mean_token_accuracy": 0.1662852793931961,
|
|
"num_tokens": 28981617.0,
|
|
"step": 15715
|
|
},
|
|
{
|
|
"entropy": 5.745265245437622,
|
|
"epoch": 1.3206889308968703,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048294959447741807,
|
|
"loss": 5.3931,
|
|
"mean_token_accuracy": 0.16527727246284485,
|
|
"num_tokens": 28989442.0,
|
|
"step": 15720
|
|
},
|
|
{
|
|
"entropy": 5.664169025421143,
|
|
"epoch": 1.3211090107120353,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048293815726373467,
|
|
"loss": 5.404,
|
|
"mean_token_accuracy": 0.17082785815000534,
|
|
"num_tokens": 28999104.0,
|
|
"step": 15725
|
|
},
|
|
{
|
|
"entropy": 5.650988054275513,
|
|
"epoch": 1.3215290905272001,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048292671636653386,
|
|
"loss": 5.4456,
|
|
"mean_token_accuracy": 0.16266124546527863,
|
|
"num_tokens": 29008645.0,
|
|
"step": 15730
|
|
},
|
|
{
|
|
"entropy": 5.707271909713745,
|
|
"epoch": 1.3219491703423651,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004829152717860184,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.16636938005685806,
|
|
"num_tokens": 29018655.0,
|
|
"step": 15735
|
|
},
|
|
{
|
|
"entropy": 5.7679918766021725,
|
|
"epoch": 1.32236925015753,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048290382352239087,
|
|
"loss": 5.4385,
|
|
"mean_token_accuracy": 0.1688806027173996,
|
|
"num_tokens": 29027109.0,
|
|
"step": 15740
|
|
},
|
|
{
|
|
"entropy": 5.653837728500366,
|
|
"epoch": 1.3227893299726947,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048289237157585424,
|
|
"loss": 5.2712,
|
|
"mean_token_accuracy": 0.17749694585800171,
|
|
"num_tokens": 29035535.0,
|
|
"step": 15745
|
|
},
|
|
{
|
|
"entropy": 5.622943782806397,
|
|
"epoch": 1.3232094097878597,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004828809159466112,
|
|
"loss": 5.4429,
|
|
"mean_token_accuracy": 0.1581158846616745,
|
|
"num_tokens": 29044723.0,
|
|
"step": 15750
|
|
},
|
|
{
|
|
"entropy": 5.718198776245117,
|
|
"epoch": 1.3236294896030245,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0004828694566348648,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.1552947849035263,
|
|
"num_tokens": 29053636.0,
|
|
"step": 15755
|
|
},
|
|
{
|
|
"entropy": 5.790498828887939,
|
|
"epoch": 1.3240495694181895,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048285799364081806,
|
|
"loss": 5.4813,
|
|
"mean_token_accuracy": 0.16202333718538284,
|
|
"num_tokens": 29062940.0,
|
|
"step": 15760
|
|
},
|
|
{
|
|
"entropy": 5.721147918701172,
|
|
"epoch": 1.3244696492333543,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048284652696467404,
|
|
"loss": 5.4026,
|
|
"mean_token_accuracy": 0.1688874751329422,
|
|
"num_tokens": 29072159.0,
|
|
"step": 15765
|
|
},
|
|
{
|
|
"entropy": 5.75450963973999,
|
|
"epoch": 1.324889729048519,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00048283505660663575,
|
|
"loss": 5.4791,
|
|
"mean_token_accuracy": 0.16751828640699387,
|
|
"num_tokens": 29081544.0,
|
|
"step": 15770
|
|
},
|
|
{
|
|
"entropy": 5.638855648040772,
|
|
"epoch": 1.325309808863684,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004828235825669064,
|
|
"loss": 5.4346,
|
|
"mean_token_accuracy": 0.16318106204271315,
|
|
"num_tokens": 29090710.0,
|
|
"step": 15775
|
|
},
|
|
{
|
|
"entropy": 5.688300275802613,
|
|
"epoch": 1.325729888678849,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048281210484568937,
|
|
"loss": 5.4415,
|
|
"mean_token_accuracy": 0.16632406264543534,
|
|
"num_tokens": 29098988.0,
|
|
"step": 15780
|
|
},
|
|
{
|
|
"entropy": 5.665548658370971,
|
|
"epoch": 1.326149968494014,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048280062344318794,
|
|
"loss": 5.4862,
|
|
"mean_token_accuracy": 0.15649251490831376,
|
|
"num_tokens": 29108926.0,
|
|
"step": 15785
|
|
},
|
|
{
|
|
"entropy": 5.740646505355835,
|
|
"epoch": 1.3265700483091787,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004827891383596054,
|
|
"loss": 5.4148,
|
|
"mean_token_accuracy": 0.1614031285047531,
|
|
"num_tokens": 29118065.0,
|
|
"step": 15790
|
|
},
|
|
{
|
|
"entropy": 5.7241943359375,
|
|
"epoch": 1.3269901281243437,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048277764959514524,
|
|
"loss": 5.3762,
|
|
"mean_token_accuracy": 0.1652180477976799,
|
|
"num_tokens": 29127030.0,
|
|
"step": 15795
|
|
},
|
|
{
|
|
"entropy": 5.748840999603272,
|
|
"epoch": 1.3274102079395085,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004827661571500111,
|
|
"loss": 5.5058,
|
|
"mean_token_accuracy": 0.16153218150138854,
|
|
"num_tokens": 29137200.0,
|
|
"step": 15800
|
|
},
|
|
{
|
|
"entropy": 5.741848373413086,
|
|
"epoch": 1.3278302877546735,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048275466102440644,
|
|
"loss": 5.4825,
|
|
"mean_token_accuracy": 0.16485897302627564,
|
|
"num_tokens": 29147029.0,
|
|
"step": 15805
|
|
},
|
|
{
|
|
"entropy": 5.631581258773804,
|
|
"epoch": 1.3282503675698383,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00048274316121853494,
|
|
"loss": 5.3711,
|
|
"mean_token_accuracy": 0.1663237363100052,
|
|
"num_tokens": 29155675.0,
|
|
"step": 15810
|
|
},
|
|
{
|
|
"entropy": 5.749010705947876,
|
|
"epoch": 1.328670447385003,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.00048273165773260023,
|
|
"loss": 5.4356,
|
|
"mean_token_accuracy": 0.1655052199959755,
|
|
"num_tokens": 29164730.0,
|
|
"step": 15815
|
|
},
|
|
{
|
|
"entropy": 5.701095962524414,
|
|
"epoch": 1.329090527200168,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004827201505668063,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.1656198024749756,
|
|
"num_tokens": 29173074.0,
|
|
"step": 15820
|
|
},
|
|
{
|
|
"entropy": 5.7562737464904785,
|
|
"epoch": 1.3295106070153329,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004827086397213568,
|
|
"loss": 5.5478,
|
|
"mean_token_accuracy": 0.16311392933130264,
|
|
"num_tokens": 29182175.0,
|
|
"step": 15825
|
|
},
|
|
{
|
|
"entropy": 5.8277308464050295,
|
|
"epoch": 1.3299306868304979,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004826971251964557,
|
|
"loss": 5.7415,
|
|
"mean_token_accuracy": 0.1557246647775173,
|
|
"num_tokens": 29192910.0,
|
|
"step": 15830
|
|
},
|
|
{
|
|
"entropy": 5.710680675506592,
|
|
"epoch": 1.3303507666456627,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000482685606992307,
|
|
"loss": 5.387,
|
|
"mean_token_accuracy": 0.169048510491848,
|
|
"num_tokens": 29201969.0,
|
|
"step": 15835
|
|
},
|
|
{
|
|
"entropy": 5.7679280757904055,
|
|
"epoch": 1.3307708464608274,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00048267408510911463,
|
|
"loss": 5.5448,
|
|
"mean_token_accuracy": 0.16056760400533676,
|
|
"num_tokens": 29210475.0,
|
|
"step": 15840
|
|
},
|
|
{
|
|
"entropy": 5.648775243759156,
|
|
"epoch": 1.3311909262759924,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004826625595470829,
|
|
"loss": 5.4135,
|
|
"mean_token_accuracy": 0.16637052744626998,
|
|
"num_tokens": 29222586.0,
|
|
"step": 15845
|
|
},
|
|
{
|
|
"entropy": 5.65334529876709,
|
|
"epoch": 1.3316110060911575,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00048265103030641575,
|
|
"loss": 5.4624,
|
|
"mean_token_accuracy": 0.161483795940876,
|
|
"num_tokens": 29231503.0,
|
|
"step": 15850
|
|
},
|
|
{
|
|
"entropy": 5.648448801040649,
|
|
"epoch": 1.3320310859063222,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004826394973873176,
|
|
"loss": 5.4313,
|
|
"mean_token_accuracy": 0.1599622756242752,
|
|
"num_tokens": 29241534.0,
|
|
"step": 15855
|
|
},
|
|
{
|
|
"entropy": 5.723581027984619,
|
|
"epoch": 1.332451165721487,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00048262796078999266,
|
|
"loss": 5.4497,
|
|
"mean_token_accuracy": 0.16642218083143234,
|
|
"num_tokens": 29250381.0,
|
|
"step": 15860
|
|
},
|
|
{
|
|
"entropy": 5.72342619895935,
|
|
"epoch": 1.332871245536652,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004826164205146453,
|
|
"loss": 5.5105,
|
|
"mean_token_accuracy": 0.15796453654766082,
|
|
"num_tokens": 29259205.0,
|
|
"step": 15865
|
|
},
|
|
{
|
|
"entropy": 5.566991662979126,
|
|
"epoch": 1.3332913253518168,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00048260487656147995,
|
|
"loss": 5.411,
|
|
"mean_token_accuracy": 0.16878511905670165,
|
|
"num_tokens": 29267723.0,
|
|
"step": 15870
|
|
},
|
|
{
|
|
"entropy": 5.663629627227783,
|
|
"epoch": 1.3337114051669818,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048259332893070106,
|
|
"loss": 5.4105,
|
|
"mean_token_accuracy": 0.16867344379425048,
|
|
"num_tokens": 29277102.0,
|
|
"step": 15875
|
|
},
|
|
{
|
|
"entropy": 5.685384702682495,
|
|
"epoch": 1.3341314849821466,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004825817776225133,
|
|
"loss": 5.3994,
|
|
"mean_token_accuracy": 0.16746718436479568,
|
|
"num_tokens": 29286484.0,
|
|
"step": 15880
|
|
},
|
|
{
|
|
"entropy": 5.673250675201416,
|
|
"epoch": 1.3345515647973114,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048257022263712123,
|
|
"loss": 5.4876,
|
|
"mean_token_accuracy": 0.17098401337862015,
|
|
"num_tokens": 29296528.0,
|
|
"step": 15885
|
|
},
|
|
{
|
|
"entropy": 5.589338874816894,
|
|
"epoch": 1.3349716446124764,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00048255866397472954,
|
|
"loss": 5.3384,
|
|
"mean_token_accuracy": 0.17186694368720054,
|
|
"num_tokens": 29305283.0,
|
|
"step": 15890
|
|
},
|
|
{
|
|
"entropy": 5.703013134002686,
|
|
"epoch": 1.3353917244276412,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.000482547101635543,
|
|
"loss": 5.3432,
|
|
"mean_token_accuracy": 0.1679681733250618,
|
|
"num_tokens": 29315088.0,
|
|
"step": 15895
|
|
},
|
|
{
|
|
"entropy": 5.703509330749512,
|
|
"epoch": 1.3358118042428062,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048253553561976645,
|
|
"loss": 5.3596,
|
|
"mean_token_accuracy": 0.16931547373533248,
|
|
"num_tokens": 29323793.0,
|
|
"step": 15900
|
|
},
|
|
{
|
|
"entropy": 5.644972229003907,
|
|
"epoch": 1.336231884057971,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004825239659276047,
|
|
"loss": 5.4415,
|
|
"mean_token_accuracy": 0.1619830012321472,
|
|
"num_tokens": 29334015.0,
|
|
"step": 15905
|
|
},
|
|
{
|
|
"entropy": 5.736817216873169,
|
|
"epoch": 1.3366519638731358,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004825123925592628,
|
|
"loss": 5.5419,
|
|
"mean_token_accuracy": 0.15840226113796235,
|
|
"num_tokens": 29343221.0,
|
|
"step": 15910
|
|
},
|
|
{
|
|
"entropy": 5.711045169830323,
|
|
"epoch": 1.3370720436883008,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048250081551494574,
|
|
"loss": 5.3858,
|
|
"mean_token_accuracy": 0.16694712340831758,
|
|
"num_tokens": 29352261.0,
|
|
"step": 15915
|
|
},
|
|
{
|
|
"entropy": 5.677080345153809,
|
|
"epoch": 1.3374921235034656,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004824892347948586,
|
|
"loss": 5.4929,
|
|
"mean_token_accuracy": 0.16138059496879578,
|
|
"num_tokens": 29362138.0,
|
|
"step": 15920
|
|
},
|
|
{
|
|
"entropy": 5.6527352809906,
|
|
"epoch": 1.3379122033186306,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004824776503992064,
|
|
"loss": 5.3898,
|
|
"mean_token_accuracy": 0.1713466763496399,
|
|
"num_tokens": 29371234.0,
|
|
"step": 15925
|
|
},
|
|
{
|
|
"entropy": 5.635444116592407,
|
|
"epoch": 1.3383322831337954,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004824660623281945,
|
|
"loss": 5.4473,
|
|
"mean_token_accuracy": 0.16970054805278778,
|
|
"num_tokens": 29380371.0,
|
|
"step": 15930
|
|
},
|
|
{
|
|
"entropy": 5.773072195053101,
|
|
"epoch": 1.3387523629489604,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00048245447058202815,
|
|
"loss": 5.5592,
|
|
"mean_token_accuracy": 0.1614100843667984,
|
|
"num_tokens": 29389230.0,
|
|
"step": 15935
|
|
},
|
|
{
|
|
"entropy": 5.7593803882598875,
|
|
"epoch": 1.3391724427641252,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004824428751609126,
|
|
"loss": 5.4466,
|
|
"mean_token_accuracy": 0.16970301866531373,
|
|
"num_tokens": 29398753.0,
|
|
"step": 15940
|
|
},
|
|
{
|
|
"entropy": 5.7071356773376465,
|
|
"epoch": 1.3395925225792902,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00048243127606505343,
|
|
"loss": 5.4092,
|
|
"mean_token_accuracy": 0.16827066540718078,
|
|
"num_tokens": 29407487.0,
|
|
"step": 15945
|
|
},
|
|
{
|
|
"entropy": 5.572418594360352,
|
|
"epoch": 1.340012602394455,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000482419673294656,
|
|
"loss": 5.4018,
|
|
"mean_token_accuracy": 0.16651310175657272,
|
|
"num_tokens": 29416140.0,
|
|
"step": 15950
|
|
},
|
|
{
|
|
"entropy": 5.64957218170166,
|
|
"epoch": 1.3404326822096198,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004824080668499259,
|
|
"loss": 5.4397,
|
|
"mean_token_accuracy": 0.1690505862236023,
|
|
"num_tokens": 29424763.0,
|
|
"step": 15955
|
|
},
|
|
{
|
|
"entropy": 5.800030183792114,
|
|
"epoch": 1.3408527620247848,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048239645673106855,
|
|
"loss": 5.4385,
|
|
"mean_token_accuracy": 0.16088547855615615,
|
|
"num_tokens": 29434589.0,
|
|
"step": 15960
|
|
},
|
|
{
|
|
"entropy": 5.71432843208313,
|
|
"epoch": 1.3412728418399495,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048238484293828995,
|
|
"loss": 5.4479,
|
|
"mean_token_accuracy": 0.16145109385252,
|
|
"num_tokens": 29443549.0,
|
|
"step": 15965
|
|
},
|
|
{
|
|
"entropy": 5.6876280307769775,
|
|
"epoch": 1.3416929216551146,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004823732254717955,
|
|
"loss": 5.4565,
|
|
"mean_token_accuracy": 0.16495574414730071,
|
|
"num_tokens": 29452457.0,
|
|
"step": 15970
|
|
},
|
|
{
|
|
"entropy": 5.612264728546142,
|
|
"epoch": 1.3421130014702793,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004823616043317912,
|
|
"loss": 5.4241,
|
|
"mean_token_accuracy": 0.16470324099063874,
|
|
"num_tokens": 29461238.0,
|
|
"step": 15975
|
|
},
|
|
{
|
|
"entropy": 5.720156478881836,
|
|
"epoch": 1.3425330812854441,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00048234997951848284,
|
|
"loss": 5.4857,
|
|
"mean_token_accuracy": 0.15919755399227142,
|
|
"num_tokens": 29471170.0,
|
|
"step": 15980
|
|
},
|
|
{
|
|
"entropy": 5.813641786575317,
|
|
"epoch": 1.3429531611006091,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004823383510320764,
|
|
"loss": 5.5245,
|
|
"mean_token_accuracy": 0.155257136374712,
|
|
"num_tokens": 29481017.0,
|
|
"step": 15985
|
|
},
|
|
{
|
|
"entropy": 5.799026155471802,
|
|
"epoch": 1.343373240915774,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048232671887277786,
|
|
"loss": 5.457,
|
|
"mean_token_accuracy": 0.1612869530916214,
|
|
"num_tokens": 29489809.0,
|
|
"step": 15990
|
|
},
|
|
{
|
|
"entropy": 5.6579841613769535,
|
|
"epoch": 1.343793320730939,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048231508304079313,
|
|
"loss": 5.4711,
|
|
"mean_token_accuracy": 0.16473791301250457,
|
|
"num_tokens": 29499499.0,
|
|
"step": 15995
|
|
},
|
|
{
|
|
"entropy": 5.745253086090088,
|
|
"epoch": 1.3442134005461037,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048230344353632855,
|
|
"loss": 5.4375,
|
|
"mean_token_accuracy": 0.16314539089798927,
|
|
"num_tokens": 29508526.0,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"entropy": 5.741327238082886,
|
|
"epoch": 1.3446334803612685,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004822918003595902,
|
|
"loss": 5.3692,
|
|
"mean_token_accuracy": 0.1664547398686409,
|
|
"num_tokens": 29517516.0,
|
|
"step": 16005
|
|
},
|
|
{
|
|
"entropy": 5.649990653991699,
|
|
"epoch": 1.3450535601764335,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004822801535107843,
|
|
"loss": 5.4562,
|
|
"mean_token_accuracy": 0.16211575120687485,
|
|
"num_tokens": 29526949.0,
|
|
"step": 16010
|
|
},
|
|
{
|
|
"entropy": 5.62546067237854,
|
|
"epoch": 1.3454736399915985,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004822685029901173,
|
|
"loss": 5.3694,
|
|
"mean_token_accuracy": 0.16785492449998857,
|
|
"num_tokens": 29536696.0,
|
|
"step": 16015
|
|
},
|
|
{
|
|
"entropy": 5.697886323928833,
|
|
"epoch": 1.3458937198067633,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004822568487977954,
|
|
"loss": 5.4598,
|
|
"mean_token_accuracy": 0.1707649677991867,
|
|
"num_tokens": 29545672.0,
|
|
"step": 16020
|
|
},
|
|
{
|
|
"entropy": 5.72620997428894,
|
|
"epoch": 1.346313799621928,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048224519093402517,
|
|
"loss": 5.4987,
|
|
"mean_token_accuracy": 0.16094502359628676,
|
|
"num_tokens": 29554888.0,
|
|
"step": 16025
|
|
},
|
|
{
|
|
"entropy": 5.706309843063354,
|
|
"epoch": 1.346733879437093,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00048223352939901317,
|
|
"loss": 5.4213,
|
|
"mean_token_accuracy": 0.1683374136686325,
|
|
"num_tokens": 29564798.0,
|
|
"step": 16030
|
|
},
|
|
{
|
|
"entropy": 5.692904901504517,
|
|
"epoch": 1.347153959252258,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004822218641929658,
|
|
"loss": 5.4523,
|
|
"mean_token_accuracy": 0.16932614743709565,
|
|
"num_tokens": 29574802.0,
|
|
"step": 16035
|
|
},
|
|
{
|
|
"entropy": 5.79500937461853,
|
|
"epoch": 1.347574039067423,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004822101953160899,
|
|
"loss": 5.4429,
|
|
"mean_token_accuracy": 0.16303310692310333,
|
|
"num_tokens": 29583056.0,
|
|
"step": 16040
|
|
},
|
|
{
|
|
"entropy": 5.704788446426392,
|
|
"epoch": 1.3479941188825877,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000482198522768592,
|
|
"loss": 5.4188,
|
|
"mean_token_accuracy": 0.1648677781224251,
|
|
"num_tokens": 29591935.0,
|
|
"step": 16045
|
|
},
|
|
{
|
|
"entropy": 5.581204128265381,
|
|
"epoch": 1.3484141986977525,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00048218684655067907,
|
|
"loss": 5.3587,
|
|
"mean_token_accuracy": 0.16874558329582215,
|
|
"num_tokens": 29600812.0,
|
|
"step": 16050
|
|
},
|
|
{
|
|
"entropy": 5.74789342880249,
|
|
"epoch": 1.3488342785129175,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004821751666625577,
|
|
"loss": 5.4803,
|
|
"mean_token_accuracy": 0.16880127936601638,
|
|
"num_tokens": 29610735.0,
|
|
"step": 16055
|
|
},
|
|
{
|
|
"entropy": 5.74139404296875,
|
|
"epoch": 1.3492543583280823,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00048216348310443506,
|
|
"loss": 5.4079,
|
|
"mean_token_accuracy": 0.1595388814806938,
|
|
"num_tokens": 29620295.0,
|
|
"step": 16060
|
|
},
|
|
{
|
|
"entropy": 5.614044618606568,
|
|
"epoch": 1.3496744381432473,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048215179587651795,
|
|
"loss": 5.278,
|
|
"mean_token_accuracy": 0.17504663914442062,
|
|
"num_tokens": 29628214.0,
|
|
"step": 16065
|
|
},
|
|
{
|
|
"entropy": 5.613619422912597,
|
|
"epoch": 1.350094517958412,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004821401049790134,
|
|
"loss": 5.407,
|
|
"mean_token_accuracy": 0.17384071946144103,
|
|
"num_tokens": 29636598.0,
|
|
"step": 16070
|
|
},
|
|
{
|
|
"entropy": 5.722601461410522,
|
|
"epoch": 1.3505145977735769,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004821284104121286,
|
|
"loss": 5.3986,
|
|
"mean_token_accuracy": 0.16711462736129762,
|
|
"num_tokens": 29646052.0,
|
|
"step": 16075
|
|
},
|
|
{
|
|
"entropy": 5.650021648406982,
|
|
"epoch": 1.3509346775887419,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048211671217607066,
|
|
"loss": 5.4292,
|
|
"mean_token_accuracy": 0.1578374594449997,
|
|
"num_tokens": 29655310.0,
|
|
"step": 16080
|
|
},
|
|
{
|
|
"entropy": 5.695374917984009,
|
|
"epoch": 1.3513547574039069,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004821050102710468,
|
|
"loss": 5.4328,
|
|
"mean_token_accuracy": 0.16689082086086274,
|
|
"num_tokens": 29664020.0,
|
|
"step": 16085
|
|
},
|
|
{
|
|
"entropy": 5.67445330619812,
|
|
"epoch": 1.3517748372190717,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00048209330469726433,
|
|
"loss": 5.4928,
|
|
"mean_token_accuracy": 0.16078488826751708,
|
|
"num_tokens": 29672416.0,
|
|
"step": 16090
|
|
},
|
|
{
|
|
"entropy": 5.699030542373658,
|
|
"epoch": 1.3521949170342364,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048208159545493057,
|
|
"loss": 5.386,
|
|
"mean_token_accuracy": 0.17253393828868865,
|
|
"num_tokens": 29681148.0,
|
|
"step": 16095
|
|
},
|
|
{
|
|
"entropy": 5.662472581863403,
|
|
"epoch": 1.3526149968494015,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004820698825442531,
|
|
"loss": 5.356,
|
|
"mean_token_accuracy": 0.16811198592185975,
|
|
"num_tokens": 29689089.0,
|
|
"step": 16100
|
|
},
|
|
{
|
|
"entropy": 5.661127424240112,
|
|
"epoch": 1.3530350766645662,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048205816596543914,
|
|
"loss": 5.4761,
|
|
"mean_token_accuracy": 0.1623773142695427,
|
|
"num_tokens": 29697704.0,
|
|
"step": 16105
|
|
},
|
|
{
|
|
"entropy": 5.721720790863037,
|
|
"epoch": 1.3534551564797312,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048204644571869646,
|
|
"loss": 5.4838,
|
|
"mean_token_accuracy": 0.1618230536580086,
|
|
"num_tokens": 29706966.0,
|
|
"step": 16110
|
|
},
|
|
{
|
|
"entropy": 5.689847612380982,
|
|
"epoch": 1.353875236294896,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004820347218042326,
|
|
"loss": 5.3846,
|
|
"mean_token_accuracy": 0.1613849386572838,
|
|
"num_tokens": 29715817.0,
|
|
"step": 16115
|
|
},
|
|
{
|
|
"entropy": 5.680365753173828,
|
|
"epoch": 1.3542953161100608,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004820229942222553,
|
|
"loss": 5.4815,
|
|
"mean_token_accuracy": 0.16351019442081452,
|
|
"num_tokens": 29725500.0,
|
|
"step": 16120
|
|
},
|
|
{
|
|
"entropy": 5.6516200542449955,
|
|
"epoch": 1.3547153959252258,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048201126297297214,
|
|
"loss": 5.4144,
|
|
"mean_token_accuracy": 0.1723678767681122,
|
|
"num_tokens": 29734774.0,
|
|
"step": 16125
|
|
},
|
|
{
|
|
"entropy": 5.713293790817261,
|
|
"epoch": 1.3551354757403906,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004819995280565911,
|
|
"loss": 5.3916,
|
|
"mean_token_accuracy": 0.16618053019046783,
|
|
"num_tokens": 29744667.0,
|
|
"step": 16130
|
|
},
|
|
{
|
|
"entropy": 5.790366268157959,
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048198778947332,
|
|
"loss": 5.4858,
|
|
"mean_token_accuracy": 0.16581830829381944,
|
|
"num_tokens": 29753644.0,
|
|
"step": 16135
|
|
},
|
|
{
|
|
"entropy": 5.781135702133179,
|
|
"epoch": 1.3559756353707204,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004819760472233668,
|
|
"loss": 5.4401,
|
|
"mean_token_accuracy": 0.17537587881088257,
|
|
"num_tokens": 29762977.0,
|
|
"step": 16140
|
|
},
|
|
{
|
|
"entropy": 5.652209234237671,
|
|
"epoch": 1.3563957151858852,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048196430130693956,
|
|
"loss": 5.417,
|
|
"mean_token_accuracy": 0.1675757497549057,
|
|
"num_tokens": 29772221.0,
|
|
"step": 16145
|
|
},
|
|
{
|
|
"entropy": 5.621037292480469,
|
|
"epoch": 1.3568157950010502,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048195255172424627,
|
|
"loss": 5.3946,
|
|
"mean_token_accuracy": 0.17199670076370238,
|
|
"num_tokens": 29781240.0,
|
|
"step": 16150
|
|
},
|
|
{
|
|
"entropy": 5.7102892875671385,
|
|
"epoch": 1.3572358748162152,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048194079847549507,
|
|
"loss": 5.3836,
|
|
"mean_token_accuracy": 0.1677268549799919,
|
|
"num_tokens": 29790330.0,
|
|
"step": 16155
|
|
},
|
|
{
|
|
"entropy": 5.742030811309815,
|
|
"epoch": 1.35765595463138,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004819290415608942,
|
|
"loss": 5.5299,
|
|
"mean_token_accuracy": 0.15959240794181823,
|
|
"num_tokens": 29800945.0,
|
|
"step": 16160
|
|
},
|
|
{
|
|
"entropy": 5.749591875076294,
|
|
"epoch": 1.3580760344465448,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004819172809806519,
|
|
"loss": 5.5563,
|
|
"mean_token_accuracy": 0.16161940693855287,
|
|
"num_tokens": 29810391.0,
|
|
"step": 16165
|
|
},
|
|
{
|
|
"entropy": 5.724706315994263,
|
|
"epoch": 1.3584961142617098,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048190551673497645,
|
|
"loss": 5.4101,
|
|
"mean_token_accuracy": 0.16489760130643843,
|
|
"num_tokens": 29819511.0,
|
|
"step": 16170
|
|
},
|
|
{
|
|
"entropy": 5.671798896789551,
|
|
"epoch": 1.3589161940768746,
|
|
"grad_norm": 3.046875,
|
|
"learning_rate": 0.0004818937488240764,
|
|
"loss": 5.4587,
|
|
"mean_token_accuracy": 0.16651098430156708,
|
|
"num_tokens": 29828313.0,
|
|
"step": 16175
|
|
},
|
|
{
|
|
"entropy": 5.613863277435303,
|
|
"epoch": 1.3593362738920396,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048188197724816014,
|
|
"loss": 5.3552,
|
|
"mean_token_accuracy": 0.17119555920362473,
|
|
"num_tokens": 29837940.0,
|
|
"step": 16180
|
|
},
|
|
{
|
|
"entropy": 5.6810376167297365,
|
|
"epoch": 1.3597563537072044,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00048187020200743613,
|
|
"loss": 5.3383,
|
|
"mean_token_accuracy": 0.17339792847633362,
|
|
"num_tokens": 29846799.0,
|
|
"step": 16185
|
|
},
|
|
{
|
|
"entropy": 5.665157318115234,
|
|
"epoch": 1.3601764335223692,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000481858423102113,
|
|
"loss": 5.4742,
|
|
"mean_token_accuracy": 0.16402493715286254,
|
|
"num_tokens": 29856263.0,
|
|
"step": 16190
|
|
},
|
|
{
|
|
"entropy": 5.644852066040039,
|
|
"epoch": 1.3605965133375342,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004818466405323994,
|
|
"loss": 5.4008,
|
|
"mean_token_accuracy": 0.16702589765191078,
|
|
"num_tokens": 29864335.0,
|
|
"step": 16195
|
|
},
|
|
{
|
|
"entropy": 5.780227518081665,
|
|
"epoch": 1.361016593152699,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 0.00048183485429850417,
|
|
"loss": 5.4571,
|
|
"mean_token_accuracy": 0.16093909740447998,
|
|
"num_tokens": 29873466.0,
|
|
"step": 16200
|
|
},
|
|
{
|
|
"entropy": 5.650618982315064,
|
|
"epoch": 1.361436672967864,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004818230644006359,
|
|
"loss": 5.4313,
|
|
"mean_token_accuracy": 0.1745832309126854,
|
|
"num_tokens": 29883051.0,
|
|
"step": 16205
|
|
},
|
|
{
|
|
"entropy": 5.6727265357971195,
|
|
"epoch": 1.3618567527830288,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004818112708390036,
|
|
"loss": 5.3724,
|
|
"mean_token_accuracy": 0.16966692954301835,
|
|
"num_tokens": 29891823.0,
|
|
"step": 16210
|
|
},
|
|
{
|
|
"entropy": 5.6647271633148195,
|
|
"epoch": 1.3622768325981935,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004817994736138162,
|
|
"loss": 5.3974,
|
|
"mean_token_accuracy": 0.16659445315599442,
|
|
"num_tokens": 29900735.0,
|
|
"step": 16215
|
|
},
|
|
{
|
|
"entropy": 5.723177146911621,
|
|
"epoch": 1.3626969124133586,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004817876727252824,
|
|
"loss": 5.4645,
|
|
"mean_token_accuracy": 0.16937078535556793,
|
|
"num_tokens": 29910345.0,
|
|
"step": 16220
|
|
},
|
|
{
|
|
"entropy": 5.680374002456665,
|
|
"epoch": 1.3631169922285233,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00048177586817361166,
|
|
"loss": 5.4253,
|
|
"mean_token_accuracy": 0.16509459167718887,
|
|
"num_tokens": 29919650.0,
|
|
"step": 16225
|
|
},
|
|
{
|
|
"entropy": 5.744551753997802,
|
|
"epoch": 1.3635370720436883,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004817640599590128,
|
|
"loss": 5.4634,
|
|
"mean_token_accuracy": 0.16363565474748612,
|
|
"num_tokens": 29928851.0,
|
|
"step": 16230
|
|
},
|
|
{
|
|
"entropy": 5.795070457458496,
|
|
"epoch": 1.3639571518588531,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00048175224808169506,
|
|
"loss": 5.5652,
|
|
"mean_token_accuracy": 0.1574440762400627,
|
|
"num_tokens": 29939146.0,
|
|
"step": 16235
|
|
},
|
|
{
|
|
"entropy": 5.739347171783447,
|
|
"epoch": 1.3643772316740181,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00048174043254186775,
|
|
"loss": 5.3954,
|
|
"mean_token_accuracy": 0.16345300823450087,
|
|
"num_tokens": 29947556.0,
|
|
"step": 16240
|
|
},
|
|
{
|
|
"entropy": 5.723556280136108,
|
|
"epoch": 1.364797311489183,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004817286133397401,
|
|
"loss": 5.4954,
|
|
"mean_token_accuracy": 0.1634947583079338,
|
|
"num_tokens": 29957319.0,
|
|
"step": 16245
|
|
},
|
|
{
|
|
"entropy": 5.711846876144409,
|
|
"epoch": 1.365217391304348,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004817167904755216,
|
|
"loss": 5.4681,
|
|
"mean_token_accuracy": 0.16776756644248964,
|
|
"num_tokens": 29966697.0,
|
|
"step": 16250
|
|
},
|
|
{
|
|
"entropy": 5.69892258644104,
|
|
"epoch": 1.3656374711195127,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048170496394942154,
|
|
"loss": 5.4705,
|
|
"mean_token_accuracy": 0.16467532590031625,
|
|
"num_tokens": 29975103.0,
|
|
"step": 16255
|
|
},
|
|
{
|
|
"entropy": 5.626475429534912,
|
|
"epoch": 1.3660575509346775,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.00048169313376164943,
|
|
"loss": 5.3783,
|
|
"mean_token_accuracy": 0.1634665012359619,
|
|
"num_tokens": 29984865.0,
|
|
"step": 16260
|
|
},
|
|
{
|
|
"entropy": 5.687254858016968,
|
|
"epoch": 1.3664776307498425,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.00048168129991241497,
|
|
"loss": 5.3935,
|
|
"mean_token_accuracy": 0.16465528607368468,
|
|
"num_tokens": 29994376.0,
|
|
"step": 16265
|
|
},
|
|
{
|
|
"entropy": 5.81418023109436,
|
|
"epoch": 1.3668977105650073,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004816694624019277,
|
|
"loss": 5.6269,
|
|
"mean_token_accuracy": 0.1598551630973816,
|
|
"num_tokens": 30004846.0,
|
|
"step": 16270
|
|
},
|
|
{
|
|
"entropy": 5.721722793579102,
|
|
"epoch": 1.3673177903801723,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048165762123039723,
|
|
"loss": 5.4061,
|
|
"mean_token_accuracy": 0.16762069165706633,
|
|
"num_tokens": 30014083.0,
|
|
"step": 16275
|
|
},
|
|
{
|
|
"entropy": 5.668401479721069,
|
|
"epoch": 1.367737870195337,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048164577639803354,
|
|
"loss": 5.4075,
|
|
"mean_token_accuracy": 0.16811236888170242,
|
|
"num_tokens": 30023606.0,
|
|
"step": 16280
|
|
},
|
|
{
|
|
"entropy": 5.625358724594117,
|
|
"epoch": 1.3681579500105019,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004816339279050463,
|
|
"loss": 5.3889,
|
|
"mean_token_accuracy": 0.1599855825304985,
|
|
"num_tokens": 30033657.0,
|
|
"step": 16285
|
|
},
|
|
{
|
|
"entropy": 5.6841939926147464,
|
|
"epoch": 1.368578029825667,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048162207575164537,
|
|
"loss": 5.4454,
|
|
"mean_token_accuracy": 0.16324448585510254,
|
|
"num_tokens": 30043230.0,
|
|
"step": 16290
|
|
},
|
|
{
|
|
"entropy": 5.704262971878052,
|
|
"epoch": 1.3689981096408317,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048161021993804075,
|
|
"loss": 5.4687,
|
|
"mean_token_accuracy": 0.16441552191972733,
|
|
"num_tokens": 30054457.0,
|
|
"step": 16295
|
|
},
|
|
{
|
|
"entropy": 5.6318847179412845,
|
|
"epoch": 1.3694181894559967,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00048159836046444255,
|
|
"loss": 5.3108,
|
|
"mean_token_accuracy": 0.17175357937812805,
|
|
"num_tokens": 30062912.0,
|
|
"step": 16300
|
|
},
|
|
{
|
|
"entropy": 5.697698926925659,
|
|
"epoch": 1.3698382692711615,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004815864973310607,
|
|
"loss": 5.4661,
|
|
"mean_token_accuracy": 0.16420117467641832,
|
|
"num_tokens": 30071340.0,
|
|
"step": 16305
|
|
},
|
|
{
|
|
"entropy": 5.774897241592408,
|
|
"epoch": 1.3702583490863263,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00048157463053810553,
|
|
"loss": 5.5472,
|
|
"mean_token_accuracy": 0.15643561482429505,
|
|
"num_tokens": 30080334.0,
|
|
"step": 16310
|
|
},
|
|
{
|
|
"entropy": 5.682491111755371,
|
|
"epoch": 1.3706784289014913,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048156276008578706,
|
|
"loss": 5.3925,
|
|
"mean_token_accuracy": 0.16573573052883148,
|
|
"num_tokens": 30089391.0,
|
|
"step": 16315
|
|
},
|
|
{
|
|
"entropy": 5.652284622192383,
|
|
"epoch": 1.3710985087166563,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004815508859743157,
|
|
"loss": 5.3808,
|
|
"mean_token_accuracy": 0.1688121259212494,
|
|
"num_tokens": 30099027.0,
|
|
"step": 16320
|
|
},
|
|
{
|
|
"entropy": 5.625274896621704,
|
|
"epoch": 1.371518588531821,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004815390082039017,
|
|
"loss": 5.3788,
|
|
"mean_token_accuracy": 0.16874595433473588,
|
|
"num_tokens": 30108088.0,
|
|
"step": 16325
|
|
},
|
|
{
|
|
"entropy": 5.650168752670288,
|
|
"epoch": 1.3719386683469859,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.00048152712677475556,
|
|
"loss": 5.3689,
|
|
"mean_token_accuracy": 0.16458612233400344,
|
|
"num_tokens": 30117768.0,
|
|
"step": 16330
|
|
},
|
|
{
|
|
"entropy": 5.756920528411865,
|
|
"epoch": 1.3723587481621509,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048151524168708773,
|
|
"loss": 5.4856,
|
|
"mean_token_accuracy": 0.1635723114013672,
|
|
"num_tokens": 30126364.0,
|
|
"step": 16335
|
|
},
|
|
{
|
|
"entropy": 5.663647317886353,
|
|
"epoch": 1.3727788279773157,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 0.00048150335294110867,
|
|
"loss": 5.4301,
|
|
"mean_token_accuracy": 0.1666969671845436,
|
|
"num_tokens": 30135365.0,
|
|
"step": 16340
|
|
},
|
|
{
|
|
"entropy": 5.731143760681152,
|
|
"epoch": 1.3731989077924807,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00048149146053702915,
|
|
"loss": 5.5047,
|
|
"mean_token_accuracy": 0.17594754695892334,
|
|
"num_tokens": 30145542.0,
|
|
"step": 16345
|
|
},
|
|
{
|
|
"entropy": 5.734094142913818,
|
|
"epoch": 1.3736189876076454,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004814795644750597,
|
|
"loss": 5.5201,
|
|
"mean_token_accuracy": 0.15887483209371567,
|
|
"num_tokens": 30154100.0,
|
|
"step": 16350
|
|
},
|
|
{
|
|
"entropy": 5.701383399963379,
|
|
"epoch": 1.3740390674228102,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048146766475541105,
|
|
"loss": 5.3993,
|
|
"mean_token_accuracy": 0.16724410504102707,
|
|
"num_tokens": 30162647.0,
|
|
"step": 16355
|
|
},
|
|
{
|
|
"entropy": 5.855766916275025,
|
|
"epoch": 1.3744591472379752,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00048145576137829406,
|
|
"loss": 5.5619,
|
|
"mean_token_accuracy": 0.1569045066833496,
|
|
"num_tokens": 30172518.0,
|
|
"step": 16360
|
|
},
|
|
{
|
|
"entropy": 5.693779468536377,
|
|
"epoch": 1.37487922705314,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004814438543439195,
|
|
"loss": 5.4842,
|
|
"mean_token_accuracy": 0.166504430770874,
|
|
"num_tokens": 30183124.0,
|
|
"step": 16365
|
|
},
|
|
{
|
|
"entropy": 5.750344085693359,
|
|
"epoch": 1.375299306868305,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004814319436524984,
|
|
"loss": 5.4196,
|
|
"mean_token_accuracy": 0.16698621958494186,
|
|
"num_tokens": 30191861.0,
|
|
"step": 16370
|
|
},
|
|
{
|
|
"entropy": 5.5947545051574705,
|
|
"epoch": 1.3757193866834698,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048142002930424174,
|
|
"loss": 5.3228,
|
|
"mean_token_accuracy": 0.16853681355714797,
|
|
"num_tokens": 30200308.0,
|
|
"step": 16375
|
|
},
|
|
{
|
|
"entropy": 5.743504285812378,
|
|
"epoch": 1.3761394664986346,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004814081112993605,
|
|
"loss": 5.442,
|
|
"mean_token_accuracy": 0.17036024779081343,
|
|
"num_tokens": 30209380.0,
|
|
"step": 16380
|
|
},
|
|
{
|
|
"entropy": 5.772786664962768,
|
|
"epoch": 1.3765595463137996,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004813961896380659,
|
|
"loss": 5.5344,
|
|
"mean_token_accuracy": 0.16031552404165267,
|
|
"num_tokens": 30218549.0,
|
|
"step": 16385
|
|
},
|
|
{
|
|
"entropy": 5.6893415451049805,
|
|
"epoch": 1.3769796261289646,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004813842643205691,
|
|
"loss": 5.4677,
|
|
"mean_token_accuracy": 0.1622385114431381,
|
|
"num_tokens": 30228119.0,
|
|
"step": 16390
|
|
},
|
|
{
|
|
"entropy": 5.672909450531006,
|
|
"epoch": 1.3773997059441294,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004813723353470813,
|
|
"loss": 5.4366,
|
|
"mean_token_accuracy": 0.15988910496234893,
|
|
"num_tokens": 30236765.0,
|
|
"step": 16395
|
|
},
|
|
{
|
|
"entropy": 5.758907604217529,
|
|
"epoch": 1.3778197857592942,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004813604027178139,
|
|
"loss": 5.3763,
|
|
"mean_token_accuracy": 0.16447694152593612,
|
|
"num_tokens": 30246089.0,
|
|
"step": 16400
|
|
},
|
|
{
|
|
"entropy": 5.692288017272949,
|
|
"epoch": 1.3782398655744592,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048134846643297817,
|
|
"loss": 5.4961,
|
|
"mean_token_accuracy": 0.16211422756314278,
|
|
"num_tokens": 30255806.0,
|
|
"step": 16405
|
|
},
|
|
{
|
|
"entropy": 5.754509162902832,
|
|
"epoch": 1.378659945389624,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004813365264927856,
|
|
"loss": 5.5533,
|
|
"mean_token_accuracy": 0.1538752794265747,
|
|
"num_tokens": 30267112.0,
|
|
"step": 16410
|
|
},
|
|
{
|
|
"entropy": 5.724986410140991,
|
|
"epoch": 1.379080025204789,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004813245828974477,
|
|
"loss": 5.4113,
|
|
"mean_token_accuracy": 0.1641213044524193,
|
|
"num_tokens": 30276168.0,
|
|
"step": 16415
|
|
},
|
|
{
|
|
"entropy": 5.690103244781494,
|
|
"epoch": 1.3795001050199538,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004813126356471761,
|
|
"loss": 5.4506,
|
|
"mean_token_accuracy": 0.16688449084758758,
|
|
"num_tokens": 30285723.0,
|
|
"step": 16420
|
|
},
|
|
{
|
|
"entropy": 5.785612440109253,
|
|
"epoch": 1.3799201848351186,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004813006847421824,
|
|
"loss": 5.4945,
|
|
"mean_token_accuracy": 0.16515985280275344,
|
|
"num_tokens": 30294790.0,
|
|
"step": 16425
|
|
},
|
|
{
|
|
"entropy": 5.722445869445801,
|
|
"epoch": 1.3803402646502836,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004812887301826783,
|
|
"loss": 5.4235,
|
|
"mean_token_accuracy": 0.16739188879728317,
|
|
"num_tokens": 30303439.0,
|
|
"step": 16430
|
|
},
|
|
{
|
|
"entropy": 5.640029811859131,
|
|
"epoch": 1.3807603444654484,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004812767719688755,
|
|
"loss": 5.3987,
|
|
"mean_token_accuracy": 0.162314510345459,
|
|
"num_tokens": 30312493.0,
|
|
"step": 16435
|
|
},
|
|
{
|
|
"entropy": 5.695783567428589,
|
|
"epoch": 1.3811804242806134,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004812648101009859,
|
|
"loss": 5.4447,
|
|
"mean_token_accuracy": 0.1699496790766716,
|
|
"num_tokens": 30321637.0,
|
|
"step": 16440
|
|
},
|
|
{
|
|
"entropy": 5.824322462081909,
|
|
"epoch": 1.3816005040957782,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004812528445792215,
|
|
"loss": 5.5741,
|
|
"mean_token_accuracy": 0.1524802938103676,
|
|
"num_tokens": 30330730.0,
|
|
"step": 16445
|
|
},
|
|
{
|
|
"entropy": 5.700669240951538,
|
|
"epoch": 1.382020583910943,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00048124087540379407,
|
|
"loss": 5.4013,
|
|
"mean_token_accuracy": 0.16979680806398392,
|
|
"num_tokens": 30339568.0,
|
|
"step": 16450
|
|
},
|
|
{
|
|
"entropy": 5.680627346038818,
|
|
"epoch": 1.382440663726108,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048122890257491573,
|
|
"loss": 5.444,
|
|
"mean_token_accuracy": 0.1615915670990944,
|
|
"num_tokens": 30349225.0,
|
|
"step": 16455
|
|
},
|
|
{
|
|
"entropy": 5.717861557006836,
|
|
"epoch": 1.382860743541273,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048121692609279866,
|
|
"loss": 5.4418,
|
|
"mean_token_accuracy": 0.1737132966518402,
|
|
"num_tokens": 30358804.0,
|
|
"step": 16460
|
|
},
|
|
{
|
|
"entropy": 5.745238399505615,
|
|
"epoch": 1.3832808233564378,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004812049459576549,
|
|
"loss": 5.5181,
|
|
"mean_token_accuracy": 0.167852421104908,
|
|
"num_tokens": 30368490.0,
|
|
"step": 16465
|
|
},
|
|
{
|
|
"entropy": 5.783865261077881,
|
|
"epoch": 1.3837009031716025,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004811929621696966,
|
|
"loss": 5.4073,
|
|
"mean_token_accuracy": 0.16754318177700042,
|
|
"num_tokens": 30377117.0,
|
|
"step": 16470
|
|
},
|
|
{
|
|
"entropy": 5.615077972412109,
|
|
"epoch": 1.3841209829867676,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00048118097472913627,
|
|
"loss": 5.295,
|
|
"mean_token_accuracy": 0.17376861870288848,
|
|
"num_tokens": 30385151.0,
|
|
"step": 16475
|
|
},
|
|
{
|
|
"entropy": 5.568413877487183,
|
|
"epoch": 1.3845410628019323,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004811689836361861,
|
|
"loss": 5.348,
|
|
"mean_token_accuracy": 0.16653158515691757,
|
|
"num_tokens": 30394837.0,
|
|
"step": 16480
|
|
},
|
|
{
|
|
"entropy": 5.679996347427368,
|
|
"epoch": 1.3849611426170974,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004811569888910585,
|
|
"loss": 5.422,
|
|
"mean_token_accuracy": 0.16941581070423126,
|
|
"num_tokens": 30403507.0,
|
|
"step": 16485
|
|
},
|
|
{
|
|
"entropy": 5.644715404510498,
|
|
"epoch": 1.3853812224322621,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004811449904939661,
|
|
"loss": 5.4117,
|
|
"mean_token_accuracy": 0.16634142994880677,
|
|
"num_tokens": 30412941.0,
|
|
"step": 16490
|
|
},
|
|
{
|
|
"entropy": 5.688458490371704,
|
|
"epoch": 1.385801302247427,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00048113298844512127,
|
|
"loss": 5.3812,
|
|
"mean_token_accuracy": 0.17350736260414124,
|
|
"num_tokens": 30421823.0,
|
|
"step": 16495
|
|
},
|
|
{
|
|
"entropy": 5.663712358474731,
|
|
"epoch": 1.386221382062592,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004811209827447367,
|
|
"loss": 5.4873,
|
|
"mean_token_accuracy": 0.1585498943924904,
|
|
"num_tokens": 30431901.0,
|
|
"step": 16500
|
|
},
|
|
{
|
|
"entropy": 5.632642030715942,
|
|
"epoch": 1.3866414618777567,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00048110897339302504,
|
|
"loss": 5.4315,
|
|
"mean_token_accuracy": 0.16275101751089097,
|
|
"num_tokens": 30442037.0,
|
|
"step": 16505
|
|
},
|
|
{
|
|
"entropy": 5.718168163299561,
|
|
"epoch": 1.3870615416929217,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048109696039019915,
|
|
"loss": 5.3902,
|
|
"mean_token_accuracy": 0.1704296126961708,
|
|
"num_tokens": 30451189.0,
|
|
"step": 16510
|
|
},
|
|
{
|
|
"entropy": 5.751668882369995,
|
|
"epoch": 1.3874816215080865,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004810849437364716,
|
|
"loss": 5.4663,
|
|
"mean_token_accuracy": 0.16614769995212555,
|
|
"num_tokens": 30460214.0,
|
|
"step": 16515
|
|
},
|
|
{
|
|
"entropy": 5.718422794342041,
|
|
"epoch": 1.3879017013232513,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048107292343205546,
|
|
"loss": 5.4882,
|
|
"mean_token_accuracy": 0.1601525142788887,
|
|
"num_tokens": 30469936.0,
|
|
"step": 16520
|
|
},
|
|
{
|
|
"entropy": 5.689886426925659,
|
|
"epoch": 1.3883217811384163,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004810608994771636,
|
|
"loss": 5.4283,
|
|
"mean_token_accuracy": 0.16565386056900025,
|
|
"num_tokens": 30479282.0,
|
|
"step": 16525
|
|
},
|
|
{
|
|
"entropy": 5.748596954345703,
|
|
"epoch": 1.388741860953581,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000481048871872009,
|
|
"loss": 5.4586,
|
|
"mean_token_accuracy": 0.16205482929944992,
|
|
"num_tokens": 30487839.0,
|
|
"step": 16530
|
|
},
|
|
{
|
|
"entropy": 5.734499311447143,
|
|
"epoch": 1.389161940768746,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048103684061680463,
|
|
"loss": 5.5037,
|
|
"mean_token_accuracy": 0.16305503845214844,
|
|
"num_tokens": 30497327.0,
|
|
"step": 16535
|
|
},
|
|
{
|
|
"entropy": 5.670412492752075,
|
|
"epoch": 1.389582020583911,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00048102480571176384,
|
|
"loss": 5.4037,
|
|
"mean_token_accuracy": 0.1694550558924675,
|
|
"num_tokens": 30506996.0,
|
|
"step": 16540
|
|
},
|
|
{
|
|
"entropy": 5.673905563354492,
|
|
"epoch": 1.390002100399076,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004810127671570997,
|
|
"loss": 5.3351,
|
|
"mean_token_accuracy": 0.17729466110467912,
|
|
"num_tokens": 30515627.0,
|
|
"step": 16545
|
|
},
|
|
{
|
|
"entropy": 5.730953550338745,
|
|
"epoch": 1.3904221802142407,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048100072495302544,
|
|
"loss": 5.4797,
|
|
"mean_token_accuracy": 0.16208681911230088,
|
|
"num_tokens": 30525858.0,
|
|
"step": 16550
|
|
},
|
|
{
|
|
"entropy": 5.621087074279785,
|
|
"epoch": 1.3908422600294057,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004809886790997544,
|
|
"loss": 5.3797,
|
|
"mean_token_accuracy": 0.1725637599825859,
|
|
"num_tokens": 30536331.0,
|
|
"step": 16555
|
|
},
|
|
{
|
|
"entropy": 5.680699825286865,
|
|
"epoch": 1.3912623398445705,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004809766295975,
|
|
"loss": 5.4237,
|
|
"mean_token_accuracy": 0.16701553165912628,
|
|
"num_tokens": 30545329.0,
|
|
"step": 16560
|
|
},
|
|
{
|
|
"entropy": 5.67092752456665,
|
|
"epoch": 1.3916824196597353,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004809645764464757,
|
|
"loss": 5.3724,
|
|
"mean_token_accuracy": 0.17025694251060486,
|
|
"num_tokens": 30554357.0,
|
|
"step": 16565
|
|
},
|
|
{
|
|
"entropy": 5.728373718261719,
|
|
"epoch": 1.3921024994749003,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048095251964689494,
|
|
"loss": 5.5604,
|
|
"mean_token_accuracy": 0.16157087236642836,
|
|
"num_tokens": 30563548.0,
|
|
"step": 16570
|
|
},
|
|
{
|
|
"entropy": 5.7081492900848385,
|
|
"epoch": 1.392522579290065,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048094045919897134,
|
|
"loss": 5.4307,
|
|
"mean_token_accuracy": 0.16958432644605637,
|
|
"num_tokens": 30572844.0,
|
|
"step": 16575
|
|
},
|
|
{
|
|
"entropy": 5.658297061920166,
|
|
"epoch": 1.39294265910523,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004809283951029185,
|
|
"loss": 5.3522,
|
|
"mean_token_accuracy": 0.17243621349334717,
|
|
"num_tokens": 30580930.0,
|
|
"step": 16580
|
|
},
|
|
{
|
|
"entropy": 5.699292230606079,
|
|
"epoch": 1.3933627389203949,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004809163273589503,
|
|
"loss": 5.3531,
|
|
"mean_token_accuracy": 0.1716527074575424,
|
|
"num_tokens": 30589917.0,
|
|
"step": 16585
|
|
},
|
|
{
|
|
"entropy": 5.645009279251099,
|
|
"epoch": 1.3937828187355596,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048090425596728035,
|
|
"loss": 5.4546,
|
|
"mean_token_accuracy": 0.16196119636297227,
|
|
"num_tokens": 30599282.0,
|
|
"step": 16590
|
|
},
|
|
{
|
|
"entropy": 5.66185154914856,
|
|
"epoch": 1.3942028985507247,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00048089218092812254,
|
|
"loss": 5.4357,
|
|
"mean_token_accuracy": 0.16347795724868774,
|
|
"num_tokens": 30608244.0,
|
|
"step": 16595
|
|
},
|
|
{
|
|
"entropy": 5.751768589019775,
|
|
"epoch": 1.3946229783658894,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00048088010224169064,
|
|
"loss": 5.5588,
|
|
"mean_token_accuracy": 0.16680994927883147,
|
|
"num_tokens": 30617340.0,
|
|
"step": 16600
|
|
},
|
|
{
|
|
"entropy": 5.784567546844483,
|
|
"epoch": 1.3950430581810545,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048086801990819886,
|
|
"loss": 5.4828,
|
|
"mean_token_accuracy": 0.16346753984689713,
|
|
"num_tokens": 30626244.0,
|
|
"step": 16605
|
|
},
|
|
{
|
|
"entropy": 5.667201566696167,
|
|
"epoch": 1.3954631379962192,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00048085593392786113,
|
|
"loss": 5.4677,
|
|
"mean_token_accuracy": 0.1689893737435341,
|
|
"num_tokens": 30635279.0,
|
|
"step": 16610
|
|
},
|
|
{
|
|
"entropy": 5.747064113616943,
|
|
"epoch": 1.395883217811384,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004808438443008915,
|
|
"loss": 5.5995,
|
|
"mean_token_accuracy": 0.15962631851434708,
|
|
"num_tokens": 30645790.0,
|
|
"step": 16615
|
|
},
|
|
{
|
|
"entropy": 5.690942096710205,
|
|
"epoch": 1.396303297626549,
|
|
"grad_norm": 4.6875,
|
|
"learning_rate": 0.0004808317510275041,
|
|
"loss": 5.45,
|
|
"mean_token_accuracy": 0.16256778538227082,
|
|
"num_tokens": 30654497.0,
|
|
"step": 16620
|
|
},
|
|
{
|
|
"entropy": 5.765830707550049,
|
|
"epoch": 1.396723377441714,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004808196541079133,
|
|
"loss": 5.5093,
|
|
"mean_token_accuracy": 0.16061384826898575,
|
|
"num_tokens": 30663760.0,
|
|
"step": 16625
|
|
},
|
|
{
|
|
"entropy": 5.737986993789673,
|
|
"epoch": 1.3971434572568788,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00048080755354233326,
|
|
"loss": 5.5036,
|
|
"mean_token_accuracy": 0.17019174993038177,
|
|
"num_tokens": 30674263.0,
|
|
"step": 16630
|
|
},
|
|
{
|
|
"entropy": 5.708775997161865,
|
|
"epoch": 1.3975635370720436,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004807954493309784,
|
|
"loss": 5.3802,
|
|
"mean_token_accuracy": 0.16836380660533906,
|
|
"num_tokens": 30683501.0,
|
|
"step": 16635
|
|
},
|
|
{
|
|
"entropy": 5.653238725662232,
|
|
"epoch": 1.3979836168872086,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00048078334147406314,
|
|
"loss": 5.3704,
|
|
"mean_token_accuracy": 0.17907529175281525,
|
|
"num_tokens": 30691917.0,
|
|
"step": 16640
|
|
},
|
|
{
|
|
"entropy": 5.636937618255615,
|
|
"epoch": 1.3984036967023734,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048077122997180197,
|
|
"loss": 5.4514,
|
|
"mean_token_accuracy": 0.1658071830868721,
|
|
"num_tokens": 30701753.0,
|
|
"step": 16645
|
|
},
|
|
{
|
|
"entropy": 5.557118940353393,
|
|
"epoch": 1.3988237765175384,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 0.0004807591148244093,
|
|
"loss": 5.4191,
|
|
"mean_token_accuracy": 0.16260174959897994,
|
|
"num_tokens": 30710878.0,
|
|
"step": 16650
|
|
},
|
|
{
|
|
"entropy": 5.618271827697754,
|
|
"epoch": 1.3992438563327032,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004807469960321,
|
|
"loss": 5.3137,
|
|
"mean_token_accuracy": 0.17308908998966216,
|
|
"num_tokens": 30719372.0,
|
|
"step": 16655
|
|
},
|
|
{
|
|
"entropy": 5.683672761917114,
|
|
"epoch": 1.399663936147868,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048073487359508854,
|
|
"loss": 5.4876,
|
|
"mean_token_accuracy": 0.15821529626846315,
|
|
"num_tokens": 30728529.0,
|
|
"step": 16660
|
|
},
|
|
{
|
|
"entropy": 5.769331645965576,
|
|
"epoch": 1.400084015963033,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.00048072274751358976,
|
|
"loss": 5.4266,
|
|
"mean_token_accuracy": 0.16961006075143814,
|
|
"num_tokens": 30737704.0,
|
|
"step": 16665
|
|
},
|
|
{
|
|
"entropy": 5.672802448272705,
|
|
"epoch": 1.4005040957781978,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00048071061778781843,
|
|
"loss": 5.4031,
|
|
"mean_token_accuracy": 0.16336706131696702,
|
|
"num_tokens": 30747836.0,
|
|
"step": 16670
|
|
},
|
|
{
|
|
"entropy": 5.595252180099488,
|
|
"epoch": 1.4009241755933628,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004806984844179894,
|
|
"loss": 5.4637,
|
|
"mean_token_accuracy": 0.16031693965196608,
|
|
"num_tokens": 30757881.0,
|
|
"step": 16675
|
|
},
|
|
{
|
|
"entropy": 5.706535530090332,
|
|
"epoch": 1.4013442554085276,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048068634740431774,
|
|
"loss": 5.4726,
|
|
"mean_token_accuracy": 0.1561596304178238,
|
|
"num_tokens": 30767592.0,
|
|
"step": 16680
|
|
},
|
|
{
|
|
"entropy": 5.703032445907593,
|
|
"epoch": 1.4017643352236924,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004806742067470182,
|
|
"loss": 5.435,
|
|
"mean_token_accuracy": 0.16835850328207017,
|
|
"num_tokens": 30776633.0,
|
|
"step": 16685
|
|
},
|
|
{
|
|
"entropy": 5.748832893371582,
|
|
"epoch": 1.4021844150388574,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00048066206244630613,
|
|
"loss": 5.3957,
|
|
"mean_token_accuracy": 0.1625844269990921,
|
|
"num_tokens": 30785195.0,
|
|
"step": 16690
|
|
},
|
|
{
|
|
"entropy": 5.596337413787841,
|
|
"epoch": 1.4026044948540224,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048064991450239643,
|
|
"loss": 5.3959,
|
|
"mean_token_accuracy": 0.16495241075754166,
|
|
"num_tokens": 30794397.0,
|
|
"step": 16695
|
|
},
|
|
{
|
|
"entropy": 5.76853609085083,
|
|
"epoch": 1.4030245746691872,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048063776291550444,
|
|
"loss": 5.5523,
|
|
"mean_token_accuracy": 0.1575335018336773,
|
|
"num_tokens": 30803312.0,
|
|
"step": 16700
|
|
},
|
|
{
|
|
"entropy": 5.758233070373535,
|
|
"epoch": 1.403444654484352,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048062560768584537,
|
|
"loss": 5.4565,
|
|
"mean_token_accuracy": 0.17063064128160477,
|
|
"num_tokens": 30812519.0,
|
|
"step": 16705
|
|
},
|
|
{
|
|
"entropy": 5.646391153335571,
|
|
"epoch": 1.403864734299517,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00048061344881363444,
|
|
"loss": 5.4061,
|
|
"mean_token_accuracy": 0.17314539104700089,
|
|
"num_tokens": 30821558.0,
|
|
"step": 16710
|
|
},
|
|
{
|
|
"entropy": 5.68760871887207,
|
|
"epoch": 1.4042848141146818,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004806012862990873,
|
|
"loss": 5.4262,
|
|
"mean_token_accuracy": 0.16372249722480775,
|
|
"num_tokens": 30831521.0,
|
|
"step": 16715
|
|
},
|
|
{
|
|
"entropy": 5.68061900138855,
|
|
"epoch": 1.4047048939298468,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00048058912014241914,
|
|
"loss": 5.4044,
|
|
"mean_token_accuracy": 0.16505313515663148,
|
|
"num_tokens": 30841191.0,
|
|
"step": 16720
|
|
},
|
|
{
|
|
"entropy": 5.709570789337159,
|
|
"epoch": 1.4051249737450116,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004805769503438456,
|
|
"loss": 5.5102,
|
|
"mean_token_accuracy": 0.1652674689888954,
|
|
"num_tokens": 30850556.0,
|
|
"step": 16725
|
|
},
|
|
{
|
|
"entropy": 5.701706600189209,
|
|
"epoch": 1.4055450535601763,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048056477690358227,
|
|
"loss": 5.4131,
|
|
"mean_token_accuracy": 0.1686984494328499,
|
|
"num_tokens": 30859410.0,
|
|
"step": 16730
|
|
},
|
|
{
|
|
"entropy": 5.7712499618530275,
|
|
"epoch": 1.4059651333753413,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004805525998218447,
|
|
"loss": 5.4582,
|
|
"mean_token_accuracy": 0.16039325296878815,
|
|
"num_tokens": 30868048.0,
|
|
"step": 16735
|
|
},
|
|
{
|
|
"entropy": 5.7124098777771,
|
|
"epoch": 1.4063852131905061,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00048054041909884873,
|
|
"loss": 5.4697,
|
|
"mean_token_accuracy": 0.16726680248975753,
|
|
"num_tokens": 30876785.0,
|
|
"step": 16740
|
|
},
|
|
{
|
|
"entropy": 5.764161920547485,
|
|
"epoch": 1.4068052930056711,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048052823473481007,
|
|
"loss": 5.5345,
|
|
"mean_token_accuracy": 0.16368919163942336,
|
|
"num_tokens": 30886158.0,
|
|
"step": 16745
|
|
},
|
|
{
|
|
"entropy": 5.709494638442993,
|
|
"epoch": 1.407225372820836,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00048051604672994446,
|
|
"loss": 5.3873,
|
|
"mean_token_accuracy": 0.1646023690700531,
|
|
"num_tokens": 30895283.0,
|
|
"step": 16750
|
|
},
|
|
{
|
|
"entropy": 5.696353149414063,
|
|
"epoch": 1.4076454526360007,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048050385508446804,
|
|
"loss": 5.4284,
|
|
"mean_token_accuracy": 0.16812965720891954,
|
|
"num_tokens": 30905514.0,
|
|
"step": 16755
|
|
},
|
|
{
|
|
"entropy": 5.664879083633423,
|
|
"epoch": 1.4080655324511657,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048049165979859655,
|
|
"loss": 5.331,
|
|
"mean_token_accuracy": 0.18449335247278215,
|
|
"num_tokens": 30914794.0,
|
|
"step": 16760
|
|
},
|
|
{
|
|
"entropy": 5.610575008392334,
|
|
"epoch": 1.4084856122663307,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048047946087254615,
|
|
"loss": 5.3627,
|
|
"mean_token_accuracy": 0.16559927463531493,
|
|
"num_tokens": 30923823.0,
|
|
"step": 16765
|
|
},
|
|
{
|
|
"entropy": 5.6542726993560795,
|
|
"epoch": 1.4089056920814955,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00048046725830653295,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.16385638117790222,
|
|
"num_tokens": 30932738.0,
|
|
"step": 16770
|
|
},
|
|
{
|
|
"entropy": 5.704129838943482,
|
|
"epoch": 1.4093257718966603,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048045505210077304,
|
|
"loss": 5.4767,
|
|
"mean_token_accuracy": 0.15995497554540633,
|
|
"num_tokens": 30942302.0,
|
|
"step": 16775
|
|
},
|
|
{
|
|
"entropy": 5.705305194854736,
|
|
"epoch": 1.4097458517118253,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004804428422554826,
|
|
"loss": 5.3999,
|
|
"mean_token_accuracy": 0.16517668217420578,
|
|
"num_tokens": 30951662.0,
|
|
"step": 16780
|
|
},
|
|
{
|
|
"entropy": 5.643369197845459,
|
|
"epoch": 1.41016593152699,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0004804306287708782,
|
|
"loss": 5.4139,
|
|
"mean_token_accuracy": 0.1685831978917122,
|
|
"num_tokens": 30960475.0,
|
|
"step": 16785
|
|
},
|
|
{
|
|
"entropy": 5.592676210403442,
|
|
"epoch": 1.410586011342155,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048041841164717574,
|
|
"loss": 5.2528,
|
|
"mean_token_accuracy": 0.1767956107854843,
|
|
"num_tokens": 30969075.0,
|
|
"step": 16790
|
|
},
|
|
{
|
|
"entropy": 5.635186338424683,
|
|
"epoch": 1.41100609115732,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004804061908845921,
|
|
"loss": 5.3445,
|
|
"mean_token_accuracy": 0.17429747730493544,
|
|
"num_tokens": 30978030.0,
|
|
"step": 16795
|
|
},
|
|
{
|
|
"entropy": 5.63826003074646,
|
|
"epoch": 1.4114261709724847,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00048039396648334346,
|
|
"loss": 5.322,
|
|
"mean_token_accuracy": 0.16926524937152862,
|
|
"num_tokens": 30985639.0,
|
|
"step": 16800
|
|
},
|
|
{
|
|
"entropy": 5.685590744018555,
|
|
"epoch": 1.4118462507876497,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004803817384436465,
|
|
"loss": 5.4499,
|
|
"mean_token_accuracy": 0.16543682664632797,
|
|
"num_tokens": 30994811.0,
|
|
"step": 16805
|
|
},
|
|
{
|
|
"entropy": 5.71953272819519,
|
|
"epoch": 1.4122663306028145,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004803695067657178,
|
|
"loss": 5.428,
|
|
"mean_token_accuracy": 0.16598510146141052,
|
|
"num_tokens": 31003813.0,
|
|
"step": 16810
|
|
},
|
|
{
|
|
"entropy": 5.641027021408081,
|
|
"epoch": 1.4126864104179795,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000480357271449774,
|
|
"loss": 5.3693,
|
|
"mean_token_accuracy": 0.1740890622138977,
|
|
"num_tokens": 31012488.0,
|
|
"step": 16815
|
|
},
|
|
{
|
|
"entropy": 5.6430689811706545,
|
|
"epoch": 1.4131064902331443,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004803450324960318,
|
|
"loss": 5.3921,
|
|
"mean_token_accuracy": 0.16979921013116836,
|
|
"num_tokens": 31021089.0,
|
|
"step": 16820
|
|
},
|
|
{
|
|
"entropy": 5.653257369995117,
|
|
"epoch": 1.413526570048309,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.00048033278990470825,
|
|
"loss": 5.4096,
|
|
"mean_token_accuracy": 0.16547489091753959,
|
|
"num_tokens": 31029903.0,
|
|
"step": 16825
|
|
},
|
|
{
|
|
"entropy": 5.63095440864563,
|
|
"epoch": 1.413946649863474,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048032054367601996,
|
|
"loss": 5.421,
|
|
"mean_token_accuracy": 0.1633308783173561,
|
|
"num_tokens": 31039207.0,
|
|
"step": 16830
|
|
},
|
|
{
|
|
"entropy": 5.651738262176513,
|
|
"epoch": 1.414366729678639,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00048030829381018396,
|
|
"loss": 5.4428,
|
|
"mean_token_accuracy": 0.16122666299343108,
|
|
"num_tokens": 31048190.0,
|
|
"step": 16835
|
|
},
|
|
{
|
|
"entropy": 5.708361196517944,
|
|
"epoch": 1.4147868094938039,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004802960403074173,
|
|
"loss": 5.5316,
|
|
"mean_token_accuracy": 0.16462094187736512,
|
|
"num_tokens": 31058769.0,
|
|
"step": 16840
|
|
},
|
|
{
|
|
"entropy": 5.701053285598755,
|
|
"epoch": 1.4152068893089687,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048028378316793705,
|
|
"loss": 5.4687,
|
|
"mean_token_accuracy": 0.16018210723996162,
|
|
"num_tokens": 31066830.0,
|
|
"step": 16845
|
|
},
|
|
{
|
|
"entropy": 5.762956762313843,
|
|
"epoch": 1.4156269691241337,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004802715223919602,
|
|
"loss": 5.5172,
|
|
"mean_token_accuracy": 0.16773394793272017,
|
|
"num_tokens": 31077205.0,
|
|
"step": 16850
|
|
},
|
|
{
|
|
"entropy": 5.7432409763336185,
|
|
"epoch": 1.4160470489392984,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00048025925797970403,
|
|
"loss": 5.4479,
|
|
"mean_token_accuracy": 0.17057251334190368,
|
|
"num_tokens": 31087327.0,
|
|
"step": 16855
|
|
},
|
|
{
|
|
"entropy": 5.639508008956909,
|
|
"epoch": 1.4164671287544635,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048024698993138587,
|
|
"loss": 5.3833,
|
|
"mean_token_accuracy": 0.16887278407812117,
|
|
"num_tokens": 31096501.0,
|
|
"step": 16860
|
|
},
|
|
{
|
|
"entropy": 5.735842370986939,
|
|
"epoch": 1.4168872085696282,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048023471824722294,
|
|
"loss": 5.5523,
|
|
"mean_token_accuracy": 0.1566422998905182,
|
|
"num_tokens": 31105949.0,
|
|
"step": 16865
|
|
},
|
|
{
|
|
"entropy": 5.765266227722168,
|
|
"epoch": 1.417307288384793,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00048022244292743256,
|
|
"loss": 5.4616,
|
|
"mean_token_accuracy": 0.1579113557934761,
|
|
"num_tokens": 31115482.0,
|
|
"step": 16870
|
|
},
|
|
{
|
|
"entropy": 5.7321278095245365,
|
|
"epoch": 1.417727368199958,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00048021016397223234,
|
|
"loss": 5.407,
|
|
"mean_token_accuracy": 0.16931116878986358,
|
|
"num_tokens": 31124758.0,
|
|
"step": 16875
|
|
},
|
|
{
|
|
"entropy": 5.654321622848511,
|
|
"epoch": 1.4181474480151228,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048019788138183977,
|
|
"loss": 5.2972,
|
|
"mean_token_accuracy": 0.17919143736362458,
|
|
"num_tokens": 31134114.0,
|
|
"step": 16880
|
|
},
|
|
{
|
|
"entropy": 5.586613321304322,
|
|
"epoch": 1.4185675278302878,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048018559515647244,
|
|
"loss": 5.3523,
|
|
"mean_token_accuracy": 0.17073431313037873,
|
|
"num_tokens": 31142667.0,
|
|
"step": 16885
|
|
},
|
|
{
|
|
"entropy": 5.671979999542236,
|
|
"epoch": 1.4189876076454526,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00048017330529634785,
|
|
"loss": 5.4433,
|
|
"mean_token_accuracy": 0.1582137778401375,
|
|
"num_tokens": 31152105.0,
|
|
"step": 16890
|
|
},
|
|
{
|
|
"entropy": 5.698092317581176,
|
|
"epoch": 1.4194076874606174,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048016101180168376,
|
|
"loss": 5.4208,
|
|
"mean_token_accuracy": 0.1703786239027977,
|
|
"num_tokens": 31160277.0,
|
|
"step": 16895
|
|
},
|
|
{
|
|
"entropy": 5.8327394962310795,
|
|
"epoch": 1.4198277672757824,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048014871467269804,
|
|
"loss": 5.6275,
|
|
"mean_token_accuracy": 0.15696136504411698,
|
|
"num_tokens": 31170677.0,
|
|
"step": 16900
|
|
},
|
|
{
|
|
"entropy": 5.710501289367675,
|
|
"epoch": 1.4202478470909472,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.00048013641390960856,
|
|
"loss": 5.413,
|
|
"mean_token_accuracy": 0.16240498870611192,
|
|
"num_tokens": 31179298.0,
|
|
"step": 16905
|
|
},
|
|
{
|
|
"entropy": 5.650837802886963,
|
|
"epoch": 1.4206679269061122,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004801241095126331,
|
|
"loss": 5.4281,
|
|
"mean_token_accuracy": 0.16397203356027604,
|
|
"num_tokens": 31188547.0,
|
|
"step": 16910
|
|
},
|
|
{
|
|
"entropy": 5.683053827285766,
|
|
"epoch": 1.421088006721277,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004801118014819896,
|
|
"loss": 5.429,
|
|
"mean_token_accuracy": 0.16916512846946716,
|
|
"num_tokens": 31197680.0,
|
|
"step": 16915
|
|
},
|
|
{
|
|
"entropy": 5.670131063461303,
|
|
"epoch": 1.421508086536442,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004800994898178962,
|
|
"loss": 5.3795,
|
|
"mean_token_accuracy": 0.17118050009012223,
|
|
"num_tokens": 31206351.0,
|
|
"step": 16920
|
|
},
|
|
{
|
|
"entropy": 5.689674186706543,
|
|
"epoch": 1.4219281663516068,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004800871745205708,
|
|
"loss": 5.5787,
|
|
"mean_token_accuracy": 0.15732864812016487,
|
|
"num_tokens": 31216478.0,
|
|
"step": 16925
|
|
},
|
|
{
|
|
"entropy": 5.787313032150268,
|
|
"epoch": 1.4223482461667718,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00048007485559023195,
|
|
"loss": 5.5266,
|
|
"mean_token_accuracy": 0.15895494371652602,
|
|
"num_tokens": 31225920.0,
|
|
"step": 16930
|
|
},
|
|
{
|
|
"entropy": 5.704079055786133,
|
|
"epoch": 1.4227683259819366,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004800625330270975,
|
|
"loss": 5.4163,
|
|
"mean_token_accuracy": 0.1649041622877121,
|
|
"num_tokens": 31235061.0,
|
|
"step": 16935
|
|
},
|
|
{
|
|
"entropy": 5.615435409545898,
|
|
"epoch": 1.4231884057971014,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004800502068313859,
|
|
"loss": 5.3819,
|
|
"mean_token_accuracy": 0.17181412726640702,
|
|
"num_tokens": 31243448.0,
|
|
"step": 16940
|
|
},
|
|
{
|
|
"entropy": 5.699060869216919,
|
|
"epoch": 1.4236084856122664,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004800378770033154,
|
|
"loss": 5.4936,
|
|
"mean_token_accuracy": 0.16858059167861938,
|
|
"num_tokens": 31252569.0,
|
|
"step": 16945
|
|
},
|
|
{
|
|
"entropy": 5.703611755371094,
|
|
"epoch": 1.4240285654274312,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004800255435431046,
|
|
"loss": 5.3883,
|
|
"mean_token_accuracy": 0.17073103338479995,
|
|
"num_tokens": 31261905.0,
|
|
"step": 16950
|
|
},
|
|
{
|
|
"entropy": 5.615508317947388,
|
|
"epoch": 1.4244486452425962,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00048001320645097177,
|
|
"loss": 5.361,
|
|
"mean_token_accuracy": 0.1737958535552025,
|
|
"num_tokens": 31271203.0,
|
|
"step": 16955
|
|
},
|
|
{
|
|
"entropy": 5.630927085876465,
|
|
"epoch": 1.424868725057761,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048000086572713566,
|
|
"loss": 5.354,
|
|
"mean_token_accuracy": 0.17280679643154145,
|
|
"num_tokens": 31279812.0,
|
|
"step": 16960
|
|
},
|
|
{
|
|
"entropy": 5.674156904220581,
|
|
"epoch": 1.4252888048729258,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004799885213718147,
|
|
"loss": 5.4149,
|
|
"mean_token_accuracy": 0.16382081657648087,
|
|
"num_tokens": 31289615.0,
|
|
"step": 16965
|
|
},
|
|
{
|
|
"entropy": 5.658738088607788,
|
|
"epoch": 1.4257088846880908,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047997617338522763,
|
|
"loss": 5.3518,
|
|
"mean_token_accuracy": 0.17239830791950225,
|
|
"num_tokens": 31298947.0,
|
|
"step": 16970
|
|
},
|
|
{
|
|
"entropy": 5.650487899780273,
|
|
"epoch": 1.4261289645032555,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047996382176759324,
|
|
"loss": 5.33,
|
|
"mean_token_accuracy": 0.17185672670602797,
|
|
"num_tokens": 31307465.0,
|
|
"step": 16975
|
|
},
|
|
{
|
|
"entropy": 5.605889129638672,
|
|
"epoch": 1.4265490443184206,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004799514665191303,
|
|
"loss": 5.4702,
|
|
"mean_token_accuracy": 0.16345242261886597,
|
|
"num_tokens": 31317682.0,
|
|
"step": 16980
|
|
},
|
|
{
|
|
"entropy": 5.726818227767945,
|
|
"epoch": 1.4269691241335853,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004799391076400576,
|
|
"loss": 5.4472,
|
|
"mean_token_accuracy": 0.16512487083673477,
|
|
"num_tokens": 31326113.0,
|
|
"step": 16985
|
|
},
|
|
{
|
|
"entropy": 5.791937685012817,
|
|
"epoch": 1.4273892039487501,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00047992674513059415,
|
|
"loss": 5.4919,
|
|
"mean_token_accuracy": 0.16668398678302765,
|
|
"num_tokens": 31335263.0,
|
|
"step": 16990
|
|
},
|
|
{
|
|
"entropy": 5.6625199794769285,
|
|
"epoch": 1.4278092837639151,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00047991437899095896,
|
|
"loss": 5.4298,
|
|
"mean_token_accuracy": 0.1710612565279007,
|
|
"num_tokens": 31344503.0,
|
|
"step": 16995
|
|
},
|
|
{
|
|
"entropy": 5.647831153869629,
|
|
"epoch": 1.4282293635790801,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00047990200922137105,
|
|
"loss": 5.4908,
|
|
"mean_token_accuracy": 0.16613128632307053,
|
|
"num_tokens": 31354530.0,
|
|
"step": 17000
|
|
},
|
|
{
|
|
"entropy": 5.668387365341187,
|
|
"epoch": 1.428649443394245,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004798896358220496,
|
|
"loss": 5.3034,
|
|
"mean_token_accuracy": 0.1711835592985153,
|
|
"num_tokens": 31362761.0,
|
|
"step": 17005
|
|
},
|
|
{
|
|
"entropy": 5.680157566070557,
|
|
"epoch": 1.4290695232094097,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004798772587932137,
|
|
"loss": 5.3365,
|
|
"mean_token_accuracy": 0.16386652886867523,
|
|
"num_tokens": 31372933.0,
|
|
"step": 17010
|
|
},
|
|
{
|
|
"entropy": 5.753627347946167,
|
|
"epoch": 1.4294896030245747,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004798648781350826,
|
|
"loss": 5.5313,
|
|
"mean_token_accuracy": 0.16360146701335906,
|
|
"num_tokens": 31382651.0,
|
|
"step": 17015
|
|
},
|
|
{
|
|
"entropy": 5.657275533676147,
|
|
"epoch": 1.4299096828397395,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004798524938478758,
|
|
"loss": 5.4663,
|
|
"mean_token_accuracy": 0.16007311642169952,
|
|
"num_tokens": 31392272.0,
|
|
"step": 17020
|
|
},
|
|
{
|
|
"entropy": 5.65654354095459,
|
|
"epoch": 1.4303297626549045,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004798401059318124,
|
|
"loss": 5.3702,
|
|
"mean_token_accuracy": 0.1685507357120514,
|
|
"num_tokens": 31400684.0,
|
|
"step": 17025
|
|
},
|
|
{
|
|
"entropy": 5.652564525604248,
|
|
"epoch": 1.4307498424700693,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004798277143871122,
|
|
"loss": 5.3624,
|
|
"mean_token_accuracy": 0.17421618700027466,
|
|
"num_tokens": 31409082.0,
|
|
"step": 17030
|
|
},
|
|
{
|
|
"entropy": 5.608336639404297,
|
|
"epoch": 1.431169922285234,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004798153192139944,
|
|
"loss": 5.3376,
|
|
"mean_token_accuracy": 0.1730009838938713,
|
|
"num_tokens": 31417415.0,
|
|
"step": 17035
|
|
},
|
|
{
|
|
"entropy": 5.675747871398926,
|
|
"epoch": 1.431590002100399,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004798029204126786,
|
|
"loss": 5.5005,
|
|
"mean_token_accuracy": 0.1690568134188652,
|
|
"num_tokens": 31427510.0,
|
|
"step": 17040
|
|
},
|
|
{
|
|
"entropy": 5.636645269393921,
|
|
"epoch": 1.432010081915564,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004797905179833847,
|
|
"loss": 5.3358,
|
|
"mean_token_accuracy": 0.17016119211912156,
|
|
"num_tokens": 31436187.0,
|
|
"step": 17045
|
|
},
|
|
{
|
|
"entropy": 5.665362167358398,
|
|
"epoch": 1.432430161730729,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004797781119263321,
|
|
"loss": 5.3552,
|
|
"mean_token_accuracy": 0.16701350957155228,
|
|
"num_tokens": 31445179.0,
|
|
"step": 17050
|
|
},
|
|
{
|
|
"entropy": 5.708790063858032,
|
|
"epoch": 1.4328502415458937,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004797657022417408,
|
|
"loss": 5.4449,
|
|
"mean_token_accuracy": 0.16478859335184098,
|
|
"num_tokens": 31454434.0,
|
|
"step": 17055
|
|
},
|
|
{
|
|
"entropy": 5.681474256515503,
|
|
"epoch": 1.4332703213610585,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00047975328892983045,
|
|
"loss": 5.4336,
|
|
"mean_token_accuracy": 0.16830161362886428,
|
|
"num_tokens": 31464202.0,
|
|
"step": 17060
|
|
},
|
|
{
|
|
"entropy": 5.605258941650391,
|
|
"epoch": 1.4336904011762235,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00047974087199082095,
|
|
"loss": 5.3277,
|
|
"mean_token_accuracy": 0.17215612679719924,
|
|
"num_tokens": 31473158.0,
|
|
"step": 17065
|
|
},
|
|
{
|
|
"entropy": 5.659666633605957,
|
|
"epoch": 1.4341104809913885,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.00047972845142493244,
|
|
"loss": 5.3615,
|
|
"mean_token_accuracy": 0.16211307048797607,
|
|
"num_tokens": 31482643.0,
|
|
"step": 17070
|
|
},
|
|
{
|
|
"entropy": 5.625508260726929,
|
|
"epoch": 1.4345305608065533,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004797160272323848,
|
|
"loss": 5.4164,
|
|
"mean_token_accuracy": 0.1696289971470833,
|
|
"num_tokens": 31492080.0,
|
|
"step": 17075
|
|
},
|
|
{
|
|
"entropy": 5.67778902053833,
|
|
"epoch": 1.434950640621718,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00047970359941339815,
|
|
"loss": 5.393,
|
|
"mean_token_accuracy": 0.16916269809007645,
|
|
"num_tokens": 31501990.0,
|
|
"step": 17080
|
|
},
|
|
{
|
|
"entropy": 5.686505317687988,
|
|
"epoch": 1.435370720436883,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004796911679681926,
|
|
"loss": 5.4451,
|
|
"mean_token_accuracy": 0.16082692742347718,
|
|
"num_tokens": 31510548.0,
|
|
"step": 17085
|
|
},
|
|
{
|
|
"entropy": 5.671147012710572,
|
|
"epoch": 1.4357908002520479,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00047967873289698847,
|
|
"loss": 5.4048,
|
|
"mean_token_accuracy": 0.16617012917995452,
|
|
"num_tokens": 31518695.0,
|
|
"step": 17090
|
|
},
|
|
{
|
|
"entropy": 5.754871845245361,
|
|
"epoch": 1.4362108800672129,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00047966629420000595,
|
|
"loss": 5.5615,
|
|
"mean_token_accuracy": 0.16368394792079927,
|
|
"num_tokens": 31528021.0,
|
|
"step": 17095
|
|
},
|
|
{
|
|
"entropy": 5.7568220615386965,
|
|
"epoch": 1.4366309598823777,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004796538518774654,
|
|
"loss": 5.5284,
|
|
"mean_token_accuracy": 0.16016919240355493,
|
|
"num_tokens": 31537786.0,
|
|
"step": 17100
|
|
},
|
|
{
|
|
"entropy": 5.689480447769165,
|
|
"epoch": 1.4370510396975424,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00047964140592958725,
|
|
"loss": 5.4719,
|
|
"mean_token_accuracy": 0.16369976103305817,
|
|
"num_tokens": 31548006.0,
|
|
"step": 17105
|
|
},
|
|
{
|
|
"entropy": 5.710929727554321,
|
|
"epoch": 1.4374711195127075,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.000479628956356592,
|
|
"loss": 5.4102,
|
|
"mean_token_accuracy": 0.16462980061769486,
|
|
"num_tokens": 31557042.0,
|
|
"step": 17110
|
|
},
|
|
{
|
|
"entropy": 5.743411254882813,
|
|
"epoch": 1.4378911993278722,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004796165031587001,
|
|
"loss": 5.4294,
|
|
"mean_token_accuracy": 0.16347581148147583,
|
|
"num_tokens": 31566661.0,
|
|
"step": 17115
|
|
},
|
|
{
|
|
"entropy": 5.729912614822387,
|
|
"epoch": 1.4383112791430372,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004796040463361323,
|
|
"loss": 5.3991,
|
|
"mean_token_accuracy": 0.17761249095201492,
|
|
"num_tokens": 31575724.0,
|
|
"step": 17120
|
|
},
|
|
{
|
|
"entropy": 5.694164514541626,
|
|
"epoch": 1.438731358958202,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004795915858891091,
|
|
"loss": 5.4881,
|
|
"mean_token_accuracy": 0.17017182260751723,
|
|
"num_tokens": 31585068.0,
|
|
"step": 17125
|
|
},
|
|
{
|
|
"entropy": 5.751265192031861,
|
|
"epoch": 1.4391514387733668,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004795791218178514,
|
|
"loss": 5.5202,
|
|
"mean_token_accuracy": 0.1640462413430214,
|
|
"num_tokens": 31594629.0,
|
|
"step": 17130
|
|
},
|
|
{
|
|
"entropy": 5.655387258529663,
|
|
"epoch": 1.4395715185885318,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047956665412257984,
|
|
"loss": 5.3913,
|
|
"mean_token_accuracy": 0.16778073012828826,
|
|
"num_tokens": 31603469.0,
|
|
"step": 17135
|
|
},
|
|
{
|
|
"entropy": 5.647593832015991,
|
|
"epoch": 1.4399915984036968,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00047955418280351526,
|
|
"loss": 5.3461,
|
|
"mean_token_accuracy": 0.17495327293872834,
|
|
"num_tokens": 31611674.0,
|
|
"step": 17140
|
|
},
|
|
{
|
|
"entropy": 5.800208330154419,
|
|
"epoch": 1.4404116782188616,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004795417078608788,
|
|
"loss": 5.622,
|
|
"mean_token_accuracy": 0.1545601725578308,
|
|
"num_tokens": 31621863.0,
|
|
"step": 17145
|
|
},
|
|
{
|
|
"entropy": 5.788693571090699,
|
|
"epoch": 1.4408317580340264,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00047952922929489126,
|
|
"loss": 5.4521,
|
|
"mean_token_accuracy": 0.1642246201634407,
|
|
"num_tokens": 31630968.0,
|
|
"step": 17150
|
|
},
|
|
{
|
|
"entropy": 5.64285249710083,
|
|
"epoch": 1.4412518378491914,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.00047951674710577366,
|
|
"loss": 5.4419,
|
|
"mean_token_accuracy": 0.16613068878650666,
|
|
"num_tokens": 31640643.0,
|
|
"step": 17155
|
|
},
|
|
{
|
|
"entropy": 5.565954065322876,
|
|
"epoch": 1.4416719176643562,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047950426129374723,
|
|
"loss": 5.3347,
|
|
"mean_token_accuracy": 0.1745448738336563,
|
|
"num_tokens": 31648941.0,
|
|
"step": 17160
|
|
},
|
|
{
|
|
"entropy": 5.700513076782227,
|
|
"epoch": 1.4420919974795212,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047949177185903314,
|
|
"loss": 5.4437,
|
|
"mean_token_accuracy": 0.1697974219918251,
|
|
"num_tokens": 31658019.0,
|
|
"step": 17165
|
|
},
|
|
{
|
|
"entropy": 5.769097185134887,
|
|
"epoch": 1.442512077294686,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004794792788018526,
|
|
"loss": 5.5065,
|
|
"mean_token_accuracy": 0.15758488774299623,
|
|
"num_tokens": 31668050.0,
|
|
"step": 17170
|
|
},
|
|
{
|
|
"entropy": 5.686607456207275,
|
|
"epoch": 1.4429321571098508,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000479466782122427,
|
|
"loss": 5.3551,
|
|
"mean_token_accuracy": 0.16535573899745942,
|
|
"num_tokens": 31676727.0,
|
|
"step": 17175
|
|
},
|
|
{
|
|
"entropy": 5.683789539337158,
|
|
"epoch": 1.4433522369250158,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00047945428182097756,
|
|
"loss": 5.4525,
|
|
"mean_token_accuracy": 0.1617741197347641,
|
|
"num_tokens": 31686205.0,
|
|
"step": 17180
|
|
},
|
|
{
|
|
"entropy": 5.693828535079956,
|
|
"epoch": 1.4437723167401806,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047944177789772583,
|
|
"loss": 5.4559,
|
|
"mean_token_accuracy": 0.16552175134420394,
|
|
"num_tokens": 31695521.0,
|
|
"step": 17185
|
|
},
|
|
{
|
|
"entropy": 5.767838621139527,
|
|
"epoch": 1.4441923965553456,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004794292703528932,
|
|
"loss": 5.5186,
|
|
"mean_token_accuracy": 0.15298188775777816,
|
|
"num_tokens": 31706606.0,
|
|
"step": 17190
|
|
},
|
|
{
|
|
"entropy": 5.7720374584198,
|
|
"epoch": 1.4446124763705104,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00047941675918670133,
|
|
"loss": 5.5934,
|
|
"mean_token_accuracy": 0.15864021703600883,
|
|
"num_tokens": 31716881.0,
|
|
"step": 17195
|
|
},
|
|
{
|
|
"entropy": 5.724113607406617,
|
|
"epoch": 1.4450325561856752,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004794042443993719,
|
|
"loss": 5.3791,
|
|
"mean_token_accuracy": 0.16267655789852142,
|
|
"num_tokens": 31725878.0,
|
|
"step": 17200
|
|
},
|
|
{
|
|
"entropy": 5.657223463058472,
|
|
"epoch": 1.4454526360008402,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004793917259911265,
|
|
"loss": 5.4676,
|
|
"mean_token_accuracy": 0.16497932225465775,
|
|
"num_tokens": 31735033.0,
|
|
"step": 17205
|
|
},
|
|
{
|
|
"entropy": 5.5900531768798825,
|
|
"epoch": 1.445872715816005,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004793792039621869,
|
|
"loss": 5.4147,
|
|
"mean_token_accuracy": 0.171473328769207,
|
|
"num_tokens": 31744887.0,
|
|
"step": 17210
|
|
},
|
|
{
|
|
"entropy": 5.733187532424926,
|
|
"epoch": 1.44629279563117,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047936667831277504,
|
|
"loss": 5.4767,
|
|
"mean_token_accuracy": 0.15901170670986176,
|
|
"num_tokens": 31754137.0,
|
|
"step": 17215
|
|
},
|
|
{
|
|
"entropy": 5.690765762329102,
|
|
"epoch": 1.4467128754463348,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004793541490431126,
|
|
"loss": 5.2947,
|
|
"mean_token_accuracy": 0.17318409383296968,
|
|
"num_tokens": 31763394.0,
|
|
"step": 17220
|
|
},
|
|
{
|
|
"entropy": 5.671582269668579,
|
|
"epoch": 1.4471329552614998,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004793416161534216,
|
|
"loss": 5.4326,
|
|
"mean_token_accuracy": 0.1657954916357994,
|
|
"num_tokens": 31771905.0,
|
|
"step": 17225
|
|
},
|
|
{
|
|
"entropy": 5.5294126033782955,
|
|
"epoch": 1.4475530350766646,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.00047932907964392423,
|
|
"loss": 5.2655,
|
|
"mean_token_accuracy": 0.1774240866303444,
|
|
"num_tokens": 31780788.0,
|
|
"step": 17230
|
|
},
|
|
{
|
|
"entropy": 5.70396466255188,
|
|
"epoch": 1.4479731148918296,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00047931653951484234,
|
|
"loss": 5.4452,
|
|
"mean_token_accuracy": 0.16516012102365493,
|
|
"num_tokens": 31790198.0,
|
|
"step": 17235
|
|
},
|
|
{
|
|
"entropy": 5.712733507156372,
|
|
"epoch": 1.4483931947069943,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00047930399576639815,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.16861406937241555,
|
|
"num_tokens": 31799396.0,
|
|
"step": 17240
|
|
},
|
|
{
|
|
"entropy": 5.621314477920532,
|
|
"epoch": 1.4488132745221591,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047929144839881386,
|
|
"loss": 5.2884,
|
|
"mean_token_accuracy": 0.18074664771556853,
|
|
"num_tokens": 31807680.0,
|
|
"step": 17245
|
|
},
|
|
{
|
|
"entropy": 5.722569370269776,
|
|
"epoch": 1.4492333543373241,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047927889741231186,
|
|
"loss": 5.4295,
|
|
"mean_token_accuracy": 0.16408731043338776,
|
|
"num_tokens": 31817406.0,
|
|
"step": 17250
|
|
},
|
|
{
|
|
"entropy": 5.660385704040527,
|
|
"epoch": 1.449653434152489,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00047926634280711435,
|
|
"loss": 5.4135,
|
|
"mean_token_accuracy": 0.16933335810899736,
|
|
"num_tokens": 31826518.0,
|
|
"step": 17255
|
|
},
|
|
{
|
|
"entropy": 5.703531122207641,
|
|
"epoch": 1.450073513967654,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004792537845834437,
|
|
"loss": 5.4947,
|
|
"mean_token_accuracy": 0.15975457429885864,
|
|
"num_tokens": 31835538.0,
|
|
"step": 17260
|
|
},
|
|
{
|
|
"entropy": 5.664773654937744,
|
|
"epoch": 1.4504935937828187,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004792412227415224,
|
|
"loss": 5.3497,
|
|
"mean_token_accuracy": 0.17190734297037125,
|
|
"num_tokens": 31844899.0,
|
|
"step": 17265
|
|
},
|
|
{
|
|
"entropy": 5.627852296829223,
|
|
"epoch": 1.4509136735979835,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047922865728157314,
|
|
"loss": 5.3981,
|
|
"mean_token_accuracy": 0.1743706777691841,
|
|
"num_tokens": 31854322.0,
|
|
"step": 17270
|
|
},
|
|
{
|
|
"entropy": 5.6161435604095455,
|
|
"epoch": 1.4513337534131485,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004792160882038183,
|
|
"loss": 5.3679,
|
|
"mean_token_accuracy": 0.16462661772966386,
|
|
"num_tokens": 31863657.0,
|
|
"step": 17275
|
|
},
|
|
{
|
|
"entropy": 5.655448341369629,
|
|
"epoch": 1.4517538332283133,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004792035155084806,
|
|
"loss": 5.3615,
|
|
"mean_token_accuracy": 0.1683821603655815,
|
|
"num_tokens": 31873468.0,
|
|
"step": 17280
|
|
},
|
|
{
|
|
"entropy": 5.637265586853028,
|
|
"epoch": 1.4521739130434783,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047919093919578283,
|
|
"loss": 5.4728,
|
|
"mean_token_accuracy": 0.16719345450401307,
|
|
"num_tokens": 31882391.0,
|
|
"step": 17285
|
|
},
|
|
{
|
|
"entropy": 5.6774333953857425,
|
|
"epoch": 1.452593992858643,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004791783592659476,
|
|
"loss": 5.4566,
|
|
"mean_token_accuracy": 0.16625383794307708,
|
|
"num_tokens": 31891370.0,
|
|
"step": 17290
|
|
},
|
|
{
|
|
"entropy": 5.641020917892456,
|
|
"epoch": 1.4530140726738079,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000479165775719198,
|
|
"loss": 5.3919,
|
|
"mean_token_accuracy": 0.169977006316185,
|
|
"num_tokens": 31900688.0,
|
|
"step": 17295
|
|
},
|
|
{
|
|
"entropy": 5.628441858291626,
|
|
"epoch": 1.453434152488973,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00047915318855575674,
|
|
"loss": 5.4264,
|
|
"mean_token_accuracy": 0.1753471314907074,
|
|
"num_tokens": 31909359.0,
|
|
"step": 17300
|
|
},
|
|
{
|
|
"entropy": 5.650968837738037,
|
|
"epoch": 1.453854232304138,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047914059777584686,
|
|
"loss": 5.3947,
|
|
"mean_token_accuracy": 0.16623954772949218,
|
|
"num_tokens": 31918529.0,
|
|
"step": 17305
|
|
},
|
|
{
|
|
"entropy": 5.679246520996093,
|
|
"epoch": 1.4542743121193027,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00047912800337969144,
|
|
"loss": 5.4662,
|
|
"mean_token_accuracy": 0.16294726431369783,
|
|
"num_tokens": 31928310.0,
|
|
"step": 17310
|
|
},
|
|
{
|
|
"entropy": 5.64129490852356,
|
|
"epoch": 1.4546943919344675,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047911540536751355,
|
|
"loss": 5.3744,
|
|
"mean_token_accuracy": 0.17034156024456024,
|
|
"num_tokens": 31937077.0,
|
|
"step": 17315
|
|
},
|
|
{
|
|
"entropy": 5.695573711395264,
|
|
"epoch": 1.4551144717496325,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004791028037395363,
|
|
"loss": 5.4298,
|
|
"mean_token_accuracy": 0.16439317166805267,
|
|
"num_tokens": 31946023.0,
|
|
"step": 17320
|
|
},
|
|
{
|
|
"entropy": 5.581758499145508,
|
|
"epoch": 1.4555345515647973,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00047909019849598305,
|
|
"loss": 5.2733,
|
|
"mean_token_accuracy": 0.17995132952928544,
|
|
"num_tokens": 31954741.0,
|
|
"step": 17325
|
|
},
|
|
{
|
|
"entropy": 5.651013660430908,
|
|
"epoch": 1.4559546313799623,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00047907758963707696,
|
|
"loss": 5.3939,
|
|
"mean_token_accuracy": 0.167492838203907,
|
|
"num_tokens": 31963516.0,
|
|
"step": 17330
|
|
},
|
|
{
|
|
"entropy": 5.683594417572022,
|
|
"epoch": 1.456374711195127,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00047906497716304153,
|
|
"loss": 5.4132,
|
|
"mean_token_accuracy": 0.17192533612251282,
|
|
"num_tokens": 31971917.0,
|
|
"step": 17335
|
|
},
|
|
{
|
|
"entropy": 5.674582862854004,
|
|
"epoch": 1.4567947910102919,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004790523610741001,
|
|
"loss": 5.4584,
|
|
"mean_token_accuracy": 0.16307643949985504,
|
|
"num_tokens": 31980718.0,
|
|
"step": 17340
|
|
},
|
|
{
|
|
"entropy": 5.716789674758911,
|
|
"epoch": 1.4572148708254569,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00047903974137047614,
|
|
"loss": 5.4001,
|
|
"mean_token_accuracy": 0.16782204508781434,
|
|
"num_tokens": 31988664.0,
|
|
"step": 17345
|
|
},
|
|
{
|
|
"entropy": 5.757473373413086,
|
|
"epoch": 1.4576349506406217,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047902711805239325,
|
|
"loss": 5.4791,
|
|
"mean_token_accuracy": 0.1642825037240982,
|
|
"num_tokens": 31998415.0,
|
|
"step": 17350
|
|
},
|
|
{
|
|
"entropy": 5.7503297328948975,
|
|
"epoch": 1.4580550304557867,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047901449112007494,
|
|
"loss": 5.4908,
|
|
"mean_token_accuracy": 0.16542867422103882,
|
|
"num_tokens": 32007915.0,
|
|
"step": 17355
|
|
},
|
|
{
|
|
"entropy": 5.642038631439209,
|
|
"epoch": 1.4584751102709514,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047900186057374514,
|
|
"loss": 5.4186,
|
|
"mean_token_accuracy": 0.16974506080150603,
|
|
"num_tokens": 32016582.0,
|
|
"step": 17360
|
|
},
|
|
{
|
|
"entropy": 5.568690633773803,
|
|
"epoch": 1.4588951900861162,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00047898922641362724,
|
|
"loss": 5.4113,
|
|
"mean_token_accuracy": 0.16496356278657914,
|
|
"num_tokens": 32026008.0,
|
|
"step": 17365
|
|
},
|
|
{
|
|
"entropy": 5.723394393920898,
|
|
"epoch": 1.4593152699012812,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004789765886399453,
|
|
"loss": 5.4592,
|
|
"mean_token_accuracy": 0.16515185236930846,
|
|
"num_tokens": 32034554.0,
|
|
"step": 17370
|
|
},
|
|
{
|
|
"entropy": 5.817387819290161,
|
|
"epoch": 1.4597353497164463,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00047896394725292313,
|
|
"loss": 5.4701,
|
|
"mean_token_accuracy": 0.17239008098840714,
|
|
"num_tokens": 32044003.0,
|
|
"step": 17375
|
|
},
|
|
{
|
|
"entropy": 5.650395154953003,
|
|
"epoch": 1.460155429531611,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00047895130225278473,
|
|
"loss": 5.4281,
|
|
"mean_token_accuracy": 0.1707577034831047,
|
|
"num_tokens": 32053753.0,
|
|
"step": 17380
|
|
},
|
|
{
|
|
"entropy": 5.639893341064453,
|
|
"epoch": 1.4605755093467758,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004789386536397539,
|
|
"loss": 5.4314,
|
|
"mean_token_accuracy": 0.1669726625084877,
|
|
"num_tokens": 32062459.0,
|
|
"step": 17385
|
|
},
|
|
{
|
|
"entropy": 5.7756260395050045,
|
|
"epoch": 1.4609955891619408,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004789260014140549,
|
|
"loss": 5.5241,
|
|
"mean_token_accuracy": 0.1664410337805748,
|
|
"num_tokens": 32072544.0,
|
|
"step": 17390
|
|
},
|
|
{
|
|
"entropy": 5.75843915939331,
|
|
"epoch": 1.4614156689771056,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047891334557591177,
|
|
"loss": 5.4623,
|
|
"mean_token_accuracy": 0.1596985414624214,
|
|
"num_tokens": 32082015.0,
|
|
"step": 17395
|
|
},
|
|
{
|
|
"entropy": 5.644048738479614,
|
|
"epoch": 1.4618357487922706,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004789006861255488,
|
|
"loss": 5.3924,
|
|
"mean_token_accuracy": 0.1662799596786499,
|
|
"num_tokens": 32091622.0,
|
|
"step": 17400
|
|
},
|
|
{
|
|
"entropy": 5.709836626052857,
|
|
"epoch": 1.4622558286074354,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004788880230631901,
|
|
"loss": 5.5673,
|
|
"mean_token_accuracy": 0.15625317990779877,
|
|
"num_tokens": 32102716.0,
|
|
"step": 17405
|
|
},
|
|
{
|
|
"entropy": 5.7003098487854,
|
|
"epoch": 1.4626759084226002,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047887535638906005,
|
|
"loss": 5.3208,
|
|
"mean_token_accuracy": 0.1776137113571167,
|
|
"num_tokens": 32111051.0,
|
|
"step": 17410
|
|
},
|
|
{
|
|
"entropy": 5.586649465560913,
|
|
"epoch": 1.4630959882377652,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000478862686103383,
|
|
"loss": 5.3372,
|
|
"mean_token_accuracy": 0.17761677205562593,
|
|
"num_tokens": 32119781.0,
|
|
"step": 17415
|
|
},
|
|
{
|
|
"entropy": 5.712557697296143,
|
|
"epoch": 1.46351606805293,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00047885001220638354,
|
|
"loss": 5.435,
|
|
"mean_token_accuracy": 0.16851735562086106,
|
|
"num_tokens": 32128849.0,
|
|
"step": 17420
|
|
},
|
|
{
|
|
"entropy": 5.7341227531433105,
|
|
"epoch": 1.463936147868095,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00047883733469828604,
|
|
"loss": 5.4624,
|
|
"mean_token_accuracy": 0.1703486517071724,
|
|
"num_tokens": 32138046.0,
|
|
"step": 17425
|
|
},
|
|
{
|
|
"entropy": 5.8417564868927006,
|
|
"epoch": 1.4643562276832598,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047882465357931516,
|
|
"loss": 5.5281,
|
|
"mean_token_accuracy": 0.161974436044693,
|
|
"num_tokens": 32147994.0,
|
|
"step": 17430
|
|
},
|
|
{
|
|
"entropy": 5.779322147369385,
|
|
"epoch": 1.4647763074984246,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004788119688496954,
|
|
"loss": 5.4589,
|
|
"mean_token_accuracy": 0.16861263811588287,
|
|
"num_tokens": 32156835.0,
|
|
"step": 17435
|
|
},
|
|
{
|
|
"entropy": 5.6862884044647215,
|
|
"epoch": 1.4651963873135896,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004787992805096516,
|
|
"loss": 5.3936,
|
|
"mean_token_accuracy": 0.17358130365610122,
|
|
"num_tokens": 32166751.0,
|
|
"step": 17440
|
|
},
|
|
{
|
|
"entropy": 5.713692283630371,
|
|
"epoch": 1.4656164671287546,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00047878658855940855,
|
|
"loss": 5.5068,
|
|
"mean_token_accuracy": 0.16271049082279204,
|
|
"num_tokens": 32175705.0,
|
|
"step": 17445
|
|
},
|
|
{
|
|
"entropy": 5.826437711715698,
|
|
"epoch": 1.4660365469439194,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004787738929991909,
|
|
"loss": 5.5591,
|
|
"mean_token_accuracy": 0.15781314745545388,
|
|
"num_tokens": 32185404.0,
|
|
"step": 17450
|
|
},
|
|
{
|
|
"entropy": 5.72130651473999,
|
|
"epoch": 1.4664566267590842,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00047876119382922374,
|
|
"loss": 5.4299,
|
|
"mean_token_accuracy": 0.16798323690891265,
|
|
"num_tokens": 32194054.0,
|
|
"step": 17455
|
|
},
|
|
{
|
|
"entropy": 5.730863285064697,
|
|
"epoch": 1.4668767065742492,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00047874849104973194,
|
|
"loss": 5.4984,
|
|
"mean_token_accuracy": 0.15487258285284042,
|
|
"num_tokens": 32204080.0,
|
|
"step": 17460
|
|
},
|
|
{
|
|
"entropy": 5.704109954833984,
|
|
"epoch": 1.467296786389414,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00047873578466094054,
|
|
"loss": 5.4125,
|
|
"mean_token_accuracy": 0.161499485373497,
|
|
"num_tokens": 32213279.0,
|
|
"step": 17465
|
|
},
|
|
{
|
|
"entropy": 5.664938116073609,
|
|
"epoch": 1.467716866204579,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004787230746630746,
|
|
"loss": 5.4104,
|
|
"mean_token_accuracy": 0.17155456244945527,
|
|
"num_tokens": 32221668.0,
|
|
"step": 17470
|
|
},
|
|
{
|
|
"entropy": 5.695741128921509,
|
|
"epoch": 1.4681369460197438,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004787103610563593,
|
|
"loss": 5.3415,
|
|
"mean_token_accuracy": 0.17343094050884247,
|
|
"num_tokens": 32229683.0,
|
|
"step": 17475
|
|
},
|
|
{
|
|
"entropy": 5.691019868850708,
|
|
"epoch": 1.4685570258349085,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047869764384101993,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.16649516075849533,
|
|
"num_tokens": 32238948.0,
|
|
"step": 17480
|
|
},
|
|
{
|
|
"entropy": 5.6545178413391115,
|
|
"epoch": 1.4689771056500736,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047868492301728164,
|
|
"loss": 5.4404,
|
|
"mean_token_accuracy": 0.16138940006494523,
|
|
"num_tokens": 32248079.0,
|
|
"step": 17485
|
|
},
|
|
{
|
|
"entropy": 5.605484294891357,
|
|
"epoch": 1.4693971854652383,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00047867219858536975,
|
|
"loss": 5.2716,
|
|
"mean_token_accuracy": 0.1824018180370331,
|
|
"num_tokens": 32256413.0,
|
|
"step": 17490
|
|
},
|
|
{
|
|
"entropy": 5.6888096809387205,
|
|
"epoch": 1.4698172652804034,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004786594705455098,
|
|
"loss": 5.4408,
|
|
"mean_token_accuracy": 0.16207701563835145,
|
|
"num_tokens": 32265954.0,
|
|
"step": 17495
|
|
},
|
|
{
|
|
"entropy": 5.676724147796631,
|
|
"epoch": 1.4702373450955681,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004786467388979272,
|
|
"loss": 5.349,
|
|
"mean_token_accuracy": 0.171977636218071,
|
|
"num_tokens": 32273817.0,
|
|
"step": 17500
|
|
},
|
|
{
|
|
"entropy": 5.605041551589966,
|
|
"epoch": 1.470657424910733,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047863400364284744,
|
|
"loss": 5.4111,
|
|
"mean_token_accuracy": 0.1661633461713791,
|
|
"num_tokens": 32283025.0,
|
|
"step": 17505
|
|
},
|
|
{
|
|
"entropy": 5.665054225921631,
|
|
"epoch": 1.471077504725898,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.00047862126478049623,
|
|
"loss": 5.3882,
|
|
"mean_token_accuracy": 0.16659335941076278,
|
|
"num_tokens": 32292321.0,
|
|
"step": 17510
|
|
},
|
|
{
|
|
"entropy": 5.784007930755616,
|
|
"epoch": 1.4714975845410627,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00047860852231109915,
|
|
"loss": 5.4876,
|
|
"mean_token_accuracy": 0.15348291248083115,
|
|
"num_tokens": 32302203.0,
|
|
"step": 17515
|
|
},
|
|
{
|
|
"entropy": 5.56384539604187,
|
|
"epoch": 1.4719176643562277,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004785957762348819,
|
|
"loss": 5.3156,
|
|
"mean_token_accuracy": 0.16967657655477525,
|
|
"num_tokens": 32310893.0,
|
|
"step": 17520
|
|
},
|
|
{
|
|
"entropy": 5.559794855117798,
|
|
"epoch": 1.4723377441713925,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004785830265520703,
|
|
"loss": 5.3744,
|
|
"mean_token_accuracy": 0.16862395852804185,
|
|
"num_tokens": 32320320.0,
|
|
"step": 17525
|
|
},
|
|
{
|
|
"entropy": 5.607880735397339,
|
|
"epoch": 1.4727578239865575,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00047857027326289023,
|
|
"loss": 5.2844,
|
|
"mean_token_accuracy": 0.17600037455558776,
|
|
"num_tokens": 32329196.0,
|
|
"step": 17530
|
|
},
|
|
{
|
|
"entropy": 5.6827874183654785,
|
|
"epoch": 1.4731779038017223,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00047855751636756763,
|
|
"loss": 5.4258,
|
|
"mean_token_accuracy": 0.16296974420547486,
|
|
"num_tokens": 32338529.0,
|
|
"step": 17535
|
|
},
|
|
{
|
|
"entropy": 5.707752323150634,
|
|
"epoch": 1.4735979836168873,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004785447558663284,
|
|
"loss": 5.418,
|
|
"mean_token_accuracy": 0.1722614958882332,
|
|
"num_tokens": 32347114.0,
|
|
"step": 17540
|
|
},
|
|
{
|
|
"entropy": 5.75955114364624,
|
|
"epoch": 1.474018063432052,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047853199175939865,
|
|
"loss": 5.6021,
|
|
"mean_token_accuracy": 0.1608388304710388,
|
|
"num_tokens": 32356765.0,
|
|
"step": 17545
|
|
},
|
|
{
|
|
"entropy": 5.7798620700836185,
|
|
"epoch": 1.474438143247217,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004785192240470045,
|
|
"loss": 5.5294,
|
|
"mean_token_accuracy": 0.16074298024177552,
|
|
"num_tokens": 32366175.0,
|
|
"step": 17550
|
|
},
|
|
{
|
|
"entropy": 5.649854564666748,
|
|
"epoch": 1.474858223062382,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000478506452729372,
|
|
"loss": 5.315,
|
|
"mean_token_accuracy": 0.1758470743894577,
|
|
"num_tokens": 32375063.0,
|
|
"step": 17555
|
|
},
|
|
{
|
|
"entropy": 5.6665150165557865,
|
|
"epoch": 1.4752783028775467,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047849367780672755,
|
|
"loss": 5.4086,
|
|
"mean_token_accuracy": 0.1674113929271698,
|
|
"num_tokens": 32384596.0,
|
|
"step": 17560
|
|
},
|
|
{
|
|
"entropy": 5.636862468719483,
|
|
"epoch": 1.4756983826927117,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004784808992792974,
|
|
"loss": 5.3593,
|
|
"mean_token_accuracy": 0.168624584376812,
|
|
"num_tokens": 32393489.0,
|
|
"step": 17565
|
|
},
|
|
{
|
|
"entropy": 5.677070379257202,
|
|
"epoch": 1.4761184625078765,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004784681171473079,
|
|
"loss": 5.3487,
|
|
"mean_token_accuracy": 0.1728109061717987,
|
|
"num_tokens": 32402192.0,
|
|
"step": 17570
|
|
},
|
|
{
|
|
"entropy": 5.739632654190063,
|
|
"epoch": 1.4765385423230413,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00047845533141098543,
|
|
"loss": 5.4413,
|
|
"mean_token_accuracy": 0.15874089151620865,
|
|
"num_tokens": 32411317.0,
|
|
"step": 17575
|
|
},
|
|
{
|
|
"entropy": 5.708612537384033,
|
|
"epoch": 1.4769586221382063,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004784425420705565,
|
|
"loss": 5.499,
|
|
"mean_token_accuracy": 0.1618265450000763,
|
|
"num_tokens": 32420308.0,
|
|
"step": 17580
|
|
},
|
|
{
|
|
"entropy": 5.618194961547852,
|
|
"epoch": 1.477378701953371,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004784297491262477,
|
|
"loss": 5.4258,
|
|
"mean_token_accuracy": 0.16643496304750444,
|
|
"num_tokens": 32429532.0,
|
|
"step": 17585
|
|
},
|
|
{
|
|
"entropy": 5.682935762405395,
|
|
"epoch": 1.477798781768536,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004784169525782858,
|
|
"loss": 5.4164,
|
|
"mean_token_accuracy": 0.16577064841985703,
|
|
"num_tokens": 32439382.0,
|
|
"step": 17590
|
|
},
|
|
{
|
|
"entropy": 5.7163759708404545,
|
|
"epoch": 1.4782188615837009,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004784041524268971,
|
|
"loss": 5.4034,
|
|
"mean_token_accuracy": 0.17389584332704544,
|
|
"num_tokens": 32447893.0,
|
|
"step": 17595
|
|
},
|
|
{
|
|
"entropy": 5.629817867279053,
|
|
"epoch": 1.4786389413988656,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00047839134867230874,
|
|
"loss": 5.4084,
|
|
"mean_token_accuracy": 0.1654166266322136,
|
|
"num_tokens": 32457770.0,
|
|
"step": 17600
|
|
},
|
|
{
|
|
"entropy": 5.729843044281006,
|
|
"epoch": 1.4790590212140307,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047837854131474726,
|
|
"loss": 5.5139,
|
|
"mean_token_accuracy": 0.16561387926340104,
|
|
"num_tokens": 32467247.0,
|
|
"step": 17605
|
|
},
|
|
{
|
|
"entropy": 5.7485791683197025,
|
|
"epoch": 1.4794791010291957,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047836573035443976,
|
|
"loss": 5.4893,
|
|
"mean_token_accuracy": 0.16393031179904938,
|
|
"num_tokens": 32477453.0,
|
|
"step": 17610
|
|
},
|
|
{
|
|
"entropy": 5.762020063400269,
|
|
"epoch": 1.4798991808443605,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00047835291579161293,
|
|
"loss": 5.4549,
|
|
"mean_token_accuracy": 0.17096612453460694,
|
|
"num_tokens": 32486278.0,
|
|
"step": 17615
|
|
},
|
|
{
|
|
"entropy": 5.62855486869812,
|
|
"epoch": 1.4803192606595252,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004783400976264941,
|
|
"loss": 5.3828,
|
|
"mean_token_accuracy": 0.17316290289163588,
|
|
"num_tokens": 32495523.0,
|
|
"step": 17620
|
|
},
|
|
{
|
|
"entropy": 5.669747161865234,
|
|
"epoch": 1.4807393404746902,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00047832727585930997,
|
|
"loss": 5.419,
|
|
"mean_token_accuracy": 0.16708965897560119,
|
|
"num_tokens": 32504952.0,
|
|
"step": 17625
|
|
},
|
|
{
|
|
"entropy": 5.667424058914184,
|
|
"epoch": 1.481159420289855,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004783144504902879,
|
|
"loss": 5.3972,
|
|
"mean_token_accuracy": 0.16518824696540832,
|
|
"num_tokens": 32515620.0,
|
|
"step": 17630
|
|
},
|
|
{
|
|
"entropy": 5.632094812393189,
|
|
"epoch": 1.48157950010502,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000478301621519655,
|
|
"loss": 5.3601,
|
|
"mean_token_accuracy": 0.17287708073854446,
|
|
"num_tokens": 32524549.0,
|
|
"step": 17635
|
|
},
|
|
{
|
|
"entropy": 5.663208436965943,
|
|
"epoch": 1.4819995799201848,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004782887889476386,
|
|
"loss": 5.2658,
|
|
"mean_token_accuracy": 0.17909094095230102,
|
|
"num_tokens": 32533043.0,
|
|
"step": 17640
|
|
},
|
|
{
|
|
"entropy": 5.639974546432495,
|
|
"epoch": 1.4824196597353496,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.000478275952774466,
|
|
"loss": 5.3707,
|
|
"mean_token_accuracy": 0.1682120993733406,
|
|
"num_tokens": 32541679.0,
|
|
"step": 17645
|
|
},
|
|
{
|
|
"entropy": 5.659855556488037,
|
|
"epoch": 1.4828397395505146,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004782631130003646,
|
|
"loss": 5.4875,
|
|
"mean_token_accuracy": 0.17222274392843245,
|
|
"num_tokens": 32550922.0,
|
|
"step": 17650
|
|
},
|
|
{
|
|
"entropy": 5.73547625541687,
|
|
"epoch": 1.4832598193656794,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004782502696255617,
|
|
"loss": 5.4881,
|
|
"mean_token_accuracy": 0.16443574875593187,
|
|
"num_tokens": 32560063.0,
|
|
"step": 17655
|
|
},
|
|
{
|
|
"entropy": 5.655674934387207,
|
|
"epoch": 1.4836798991808444,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.00047823742265028495,
|
|
"loss": 5.3575,
|
|
"mean_token_accuracy": 0.16813185214996337,
|
|
"num_tokens": 32569476.0,
|
|
"step": 17660
|
|
},
|
|
{
|
|
"entropy": 5.677836179733276,
|
|
"epoch": 1.4840999789960092,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.000478224572074762,
|
|
"loss": 5.4225,
|
|
"mean_token_accuracy": 0.17570533752441406,
|
|
"num_tokens": 32578552.0,
|
|
"step": 17665
|
|
},
|
|
{
|
|
"entropy": 5.661578607559204,
|
|
"epoch": 1.484520058811174,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004782117178992203,
|
|
"loss": 5.4238,
|
|
"mean_token_accuracy": 0.16717635840177536,
|
|
"num_tokens": 32589074.0,
|
|
"step": 17670
|
|
},
|
|
{
|
|
"entropy": 5.676818895339966,
|
|
"epoch": 1.484940138626339,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004781988601238878,
|
|
"loss": 5.4446,
|
|
"mean_token_accuracy": 0.16712375432252885,
|
|
"num_tokens": 32599288.0,
|
|
"step": 17675
|
|
},
|
|
{
|
|
"entropy": 5.795759963989258,
|
|
"epoch": 1.485360218441504,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000478185998748992,
|
|
"loss": 5.4935,
|
|
"mean_token_accuracy": 0.16263023763895035,
|
|
"num_tokens": 32609430.0,
|
|
"step": 17680
|
|
},
|
|
{
|
|
"entropy": 5.638738298416138,
|
|
"epoch": 1.4857802982566688,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047817313377476083,
|
|
"loss": 5.3467,
|
|
"mean_token_accuracy": 0.16966764032840728,
|
|
"num_tokens": 32617763.0,
|
|
"step": 17685
|
|
},
|
|
{
|
|
"entropy": 5.5954235076904295,
|
|
"epoch": 1.4862003780718336,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00047816026520142234,
|
|
"loss": 5.4342,
|
|
"mean_token_accuracy": 0.16032783836126327,
|
|
"num_tokens": 32627465.0,
|
|
"step": 17690
|
|
},
|
|
{
|
|
"entropy": 5.728960990905762,
|
|
"epoch": 1.4866204578869986,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004781473930292043,
|
|
"loss": 5.3391,
|
|
"mean_token_accuracy": 0.17672401666641235,
|
|
"num_tokens": 32635984.0,
|
|
"step": 17695
|
|
},
|
|
{
|
|
"entropy": 5.587149381637573,
|
|
"epoch": 1.4870405377021634,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004781345172583348,
|
|
"loss": 5.2784,
|
|
"mean_token_accuracy": 0.17341048419475555,
|
|
"num_tokens": 32644346.0,
|
|
"step": 17700
|
|
},
|
|
{
|
|
"entropy": 5.616852807998657,
|
|
"epoch": 1.4874606175173284,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047812163788904196,
|
|
"loss": 5.4103,
|
|
"mean_token_accuracy": 0.16415098160505295,
|
|
"num_tokens": 32654118.0,
|
|
"step": 17705
|
|
},
|
|
{
|
|
"entropy": 5.749323081970215,
|
|
"epoch": 1.4878806973324932,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047810875492155386,
|
|
"loss": 5.4415,
|
|
"mean_token_accuracy": 0.16800331622362136,
|
|
"num_tokens": 32664258.0,
|
|
"step": 17710
|
|
},
|
|
{
|
|
"entropy": 5.688397216796875,
|
|
"epoch": 1.488300777147658,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004780958683560987,
|
|
"loss": 5.4765,
|
|
"mean_token_accuracy": 0.16039148345589638,
|
|
"num_tokens": 32673672.0,
|
|
"step": 17715
|
|
},
|
|
{
|
|
"entropy": 5.712081003189087,
|
|
"epoch": 1.488720856962823,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004780829781929049,
|
|
"loss": 5.4578,
|
|
"mean_token_accuracy": 0.15657913982868193,
|
|
"num_tokens": 32682901.0,
|
|
"step": 17720
|
|
},
|
|
{
|
|
"entropy": 5.735140562057495,
|
|
"epoch": 1.4891409367779878,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004780700844322007,
|
|
"loss": 5.4014,
|
|
"mean_token_accuracy": 0.17273005843162537,
|
|
"num_tokens": 32691384.0,
|
|
"step": 17725
|
|
},
|
|
{
|
|
"entropy": 5.635052490234375,
|
|
"epoch": 1.4895610165931528,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00047805718707421446,
|
|
"loss": 5.4357,
|
|
"mean_token_accuracy": 0.16961687952280044,
|
|
"num_tokens": 32700758.0,
|
|
"step": 17730
|
|
},
|
|
{
|
|
"entropy": 5.759167098999024,
|
|
"epoch": 1.4899810964083176,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.00047804428611917475,
|
|
"loss": 5.5407,
|
|
"mean_token_accuracy": 0.16442745178937912,
|
|
"num_tokens": 32709676.0,
|
|
"step": 17735
|
|
},
|
|
{
|
|
"entropy": 5.7738946914672855,
|
|
"epoch": 1.4904011762234823,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047803138156731,
|
|
"loss": 5.4367,
|
|
"mean_token_accuracy": 0.1609507068991661,
|
|
"num_tokens": 32718102.0,
|
|
"step": 17740
|
|
},
|
|
{
|
|
"entropy": 5.749574279785156,
|
|
"epoch": 1.4908212560386473,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047801847341884897,
|
|
"loss": 5.4238,
|
|
"mean_token_accuracy": 0.16728150397539138,
|
|
"num_tokens": 32727356.0,
|
|
"step": 17745
|
|
},
|
|
{
|
|
"entropy": 5.610603475570679,
|
|
"epoch": 1.4912413358538124,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004780055616740202,
|
|
"loss": 5.4164,
|
|
"mean_token_accuracy": 0.16602010279893875,
|
|
"num_tokens": 32736605.0,
|
|
"step": 17750
|
|
},
|
|
{
|
|
"entropy": 5.626084041595459,
|
|
"epoch": 1.4916614156689771,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004779926463330524,
|
|
"loss": 5.3607,
|
|
"mean_token_accuracy": 0.17045399099588393,
|
|
"num_tokens": 32745573.0,
|
|
"step": 17755
|
|
},
|
|
{
|
|
"entropy": 5.6878427028656,
|
|
"epoch": 1.492081495484142,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004779797273961744,
|
|
"loss": 5.414,
|
|
"mean_token_accuracy": 0.17474236190319062,
|
|
"num_tokens": 32755695.0,
|
|
"step": 17760
|
|
},
|
|
{
|
|
"entropy": 5.6625172138214115,
|
|
"epoch": 1.492501575299307,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004779668048636151,
|
|
"loss": 5.3292,
|
|
"mean_token_accuracy": 0.1730514347553253,
|
|
"num_tokens": 32763570.0,
|
|
"step": 17765
|
|
},
|
|
{
|
|
"entropy": 5.612107133865356,
|
|
"epoch": 1.4929216551144717,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00047795387873560336,
|
|
"loss": 5.4331,
|
|
"mean_token_accuracy": 0.1678207114338875,
|
|
"num_tokens": 32772006.0,
|
|
"step": 17770
|
|
},
|
|
{
|
|
"entropy": 5.7148637771606445,
|
|
"epoch": 1.4933417349296367,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004779409490123681,
|
|
"loss": 5.3881,
|
|
"mean_token_accuracy": 0.16234676241874696,
|
|
"num_tokens": 32781080.0,
|
|
"step": 17775
|
|
},
|
|
{
|
|
"entropy": 5.635086727142334,
|
|
"epoch": 1.4937618147448015,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004779280156941384,
|
|
"loss": 5.3503,
|
|
"mean_token_accuracy": 0.16645084470510482,
|
|
"num_tokens": 32789880.0,
|
|
"step": 17780
|
|
},
|
|
{
|
|
"entropy": 5.69928207397461,
|
|
"epoch": 1.4941818945599663,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047791507878114354,
|
|
"loss": 5.3909,
|
|
"mean_token_accuracy": 0.16705690920352936,
|
|
"num_tokens": 32799222.0,
|
|
"step": 17785
|
|
},
|
|
{
|
|
"entropy": 5.626346826553345,
|
|
"epoch": 1.4946019743751313,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004779021382736124,
|
|
"loss": 5.387,
|
|
"mean_token_accuracy": 0.16727182418107986,
|
|
"num_tokens": 32808945.0,
|
|
"step": 17790
|
|
},
|
|
{
|
|
"entropy": 5.611076211929321,
|
|
"epoch": 1.495022054190296,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004778891941717745,
|
|
"loss": 5.3118,
|
|
"mean_token_accuracy": 0.18029792606830597,
|
|
"num_tokens": 32818386.0,
|
|
"step": 17795
|
|
},
|
|
{
|
|
"entropy": 5.5952142715454105,
|
|
"epoch": 1.495442134005461,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004778762464758589,
|
|
"loss": 5.3771,
|
|
"mean_token_accuracy": 0.16038608253002168,
|
|
"num_tokens": 32828364.0,
|
|
"step": 17800
|
|
},
|
|
{
|
|
"entropy": 5.779965257644653,
|
|
"epoch": 1.495862213820626,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00047786329518609505,
|
|
"loss": 5.5137,
|
|
"mean_token_accuracy": 0.16410740464925766,
|
|
"num_tokens": 32837399.0,
|
|
"step": 17805
|
|
},
|
|
{
|
|
"entropy": 5.671717548370362,
|
|
"epoch": 1.4962822936357907,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047785034030271243,
|
|
"loss": 5.3413,
|
|
"mean_token_accuracy": 0.1711513638496399,
|
|
"num_tokens": 32846111.0,
|
|
"step": 17810
|
|
},
|
|
{
|
|
"entropy": 5.6222676753997805,
|
|
"epoch": 1.4967023734509557,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004778373818259404,
|
|
"loss": 5.2429,
|
|
"mean_token_accuracy": 0.1814047634601593,
|
|
"num_tokens": 32855839.0,
|
|
"step": 17815
|
|
},
|
|
{
|
|
"entropy": 5.71916937828064,
|
|
"epoch": 1.4971224532661207,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00047782441975600866,
|
|
"loss": 5.5456,
|
|
"mean_token_accuracy": 0.16741917729377748,
|
|
"num_tokens": 32865946.0,
|
|
"step": 17820
|
|
},
|
|
{
|
|
"entropy": 5.748912906646728,
|
|
"epoch": 1.4975425330812855,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004778114540931468,
|
|
"loss": 5.5114,
|
|
"mean_token_accuracy": 0.16409117877483367,
|
|
"num_tokens": 32875310.0,
|
|
"step": 17825
|
|
},
|
|
{
|
|
"entropy": 5.702952241897583,
|
|
"epoch": 1.4979626128964503,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.00047779848483758445,
|
|
"loss": 5.4483,
|
|
"mean_token_accuracy": 0.16831188052892684,
|
|
"num_tokens": 32885315.0,
|
|
"step": 17830
|
|
},
|
|
{
|
|
"entropy": 5.684667110443115,
|
|
"epoch": 1.4983826927116153,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047778551198955133,
|
|
"loss": 5.4043,
|
|
"mean_token_accuracy": 0.1707111567258835,
|
|
"num_tokens": 32894055.0,
|
|
"step": 17835
|
|
},
|
|
{
|
|
"entropy": 5.64805235862732,
|
|
"epoch": 1.49880277252678,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004777725355492773,
|
|
"loss": 5.4056,
|
|
"mean_token_accuracy": 0.17348893135786056,
|
|
"num_tokens": 32903030.0,
|
|
"step": 17840
|
|
},
|
|
{
|
|
"entropy": 5.665900611877442,
|
|
"epoch": 1.499222852341945,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004777595555169922,
|
|
"loss": 5.3429,
|
|
"mean_token_accuracy": 0.17314210832118987,
|
|
"num_tokens": 32911562.0,
|
|
"step": 17845
|
|
},
|
|
{
|
|
"entropy": 5.706243324279785,
|
|
"epoch": 1.4996429321571099,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000477746571892926,
|
|
"loss": 5.464,
|
|
"mean_token_accuracy": 0.16257281601428986,
|
|
"num_tokens": 32920376.0,
|
|
"step": 17850
|
|
},
|
|
{
|
|
"entropy": 5.663986158370972,
|
|
"epoch": 1.5000630119722747,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004777335846773087,
|
|
"loss": 5.3903,
|
|
"mean_token_accuracy": 0.16299790441989898,
|
|
"num_tokens": 32929374.0,
|
|
"step": 17855
|
|
},
|
|
{
|
|
"entropy": 5.528833436965942,
|
|
"epoch": 1.5004830917874397,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00047772059387037025,
|
|
"loss": 5.345,
|
|
"mean_token_accuracy": 0.16556637734174728,
|
|
"num_tokens": 32938695.0,
|
|
"step": 17860
|
|
},
|
|
{
|
|
"entropy": 5.671306324005127,
|
|
"epoch": 1.5009031716026044,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004777075994723409,
|
|
"loss": 5.4045,
|
|
"mean_token_accuracy": 0.1704086810350418,
|
|
"num_tokens": 32947725.0,
|
|
"step": 17865
|
|
},
|
|
{
|
|
"entropy": 5.726226949691773,
|
|
"epoch": 1.5013232514177695,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047769460148345085,
|
|
"loss": 5.4181,
|
|
"mean_token_accuracy": 0.16411009281873704,
|
|
"num_tokens": 32957017.0,
|
|
"step": 17870
|
|
},
|
|
{
|
|
"entropy": 5.675952672958374,
|
|
"epoch": 1.5017433312329342,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004776815999039303,
|
|
"loss": 5.3935,
|
|
"mean_token_accuracy": 0.1685171753168106,
|
|
"num_tokens": 32965944.0,
|
|
"step": 17875
|
|
},
|
|
{
|
|
"entropy": 5.637391996383667,
|
|
"epoch": 1.502163411048099,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004776685947340096,
|
|
"loss": 5.3918,
|
|
"mean_token_accuracy": 0.17094200998544692,
|
|
"num_tokens": 32975368.0,
|
|
"step": 17880
|
|
},
|
|
{
|
|
"entropy": 5.685165643692017,
|
|
"epoch": 1.502583490863264,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004776555859739191,
|
|
"loss": 5.4559,
|
|
"mean_token_accuracy": 0.16454171389341354,
|
|
"num_tokens": 32984603.0,
|
|
"step": 17885
|
|
},
|
|
{
|
|
"entropy": 5.6984397888183596,
|
|
"epoch": 1.503003570678429,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00047764257362388913,
|
|
"loss": 5.4249,
|
|
"mean_token_accuracy": 0.16488805860280992,
|
|
"num_tokens": 32993621.0,
|
|
"step": 17890
|
|
},
|
|
{
|
|
"entropy": 5.642865991592407,
|
|
"epoch": 1.5034236504935938,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004776295576841504,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.1731736972928047,
|
|
"num_tokens": 33002637.0,
|
|
"step": 17895
|
|
},
|
|
{
|
|
"entropy": 5.664972877502441,
|
|
"epoch": 1.5038437303087586,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00047761653815493337,
|
|
"loss": 5.3564,
|
|
"mean_token_accuracy": 0.17393183410167695,
|
|
"num_tokens": 33011964.0,
|
|
"step": 17900
|
|
},
|
|
{
|
|
"entropy": 5.658042669296265,
|
|
"epoch": 1.5042638101239234,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00047760351503646877,
|
|
"loss": 5.4165,
|
|
"mean_token_accuracy": 0.16770535558462143,
|
|
"num_tokens": 33020626.0,
|
|
"step": 17905
|
|
},
|
|
{
|
|
"entropy": 5.70390887260437,
|
|
"epoch": 1.5046838899390884,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004775904883289871,
|
|
"loss": 5.369,
|
|
"mean_token_accuracy": 0.1692973181605339,
|
|
"num_tokens": 33029212.0,
|
|
"step": 17910
|
|
},
|
|
{
|
|
"entropy": 5.6756768226623535,
|
|
"epoch": 1.5051039697542534,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00047757745803271936,
|
|
"loss": 5.4381,
|
|
"mean_token_accuracy": 0.16383266746997832,
|
|
"num_tokens": 33038893.0,
|
|
"step": 17915
|
|
},
|
|
{
|
|
"entropy": 5.661106920242309,
|
|
"epoch": 1.5055240495694182,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004775644241478962,
|
|
"loss": 5.4223,
|
|
"mean_token_accuracy": 0.16328874826431275,
|
|
"num_tokens": 33048058.0,
|
|
"step": 17920
|
|
},
|
|
{
|
|
"entropy": 5.62230749130249,
|
|
"epoch": 1.505944129384583,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.00047755138667474864,
|
|
"loss": 5.3164,
|
|
"mean_token_accuracy": 0.1771548643708229,
|
|
"num_tokens": 33057106.0,
|
|
"step": 17925
|
|
},
|
|
{
|
|
"entropy": 5.60415210723877,
|
|
"epoch": 1.506364209199748,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004775383456135075,
|
|
"loss": 5.4777,
|
|
"mean_token_accuracy": 0.16880970150232316,
|
|
"num_tokens": 33066400.0,
|
|
"step": 17930
|
|
},
|
|
{
|
|
"entropy": 5.663134336471558,
|
|
"epoch": 1.5067842890149128,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004775253009644038,
|
|
"loss": 5.3276,
|
|
"mean_token_accuracy": 0.17642468810081482,
|
|
"num_tokens": 33075357.0,
|
|
"step": 17935
|
|
},
|
|
{
|
|
"entropy": 5.7705831050872805,
|
|
"epoch": 1.5072043688300778,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047751225272766885,
|
|
"loss": 5.4278,
|
|
"mean_token_accuracy": 0.1641027197241783,
|
|
"num_tokens": 33085707.0,
|
|
"step": 17940
|
|
},
|
|
{
|
|
"entropy": 5.800422859191895,
|
|
"epoch": 1.5076244486452426,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004774992009035335,
|
|
"loss": 5.5494,
|
|
"mean_token_accuracy": 0.16157107502222062,
|
|
"num_tokens": 33095825.0,
|
|
"step": 17945
|
|
},
|
|
{
|
|
"entropy": 5.597539043426513,
|
|
"epoch": 1.5080445284604074,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004774861454922291,
|
|
"loss": 5.3414,
|
|
"mean_token_accuracy": 0.174434395134449,
|
|
"num_tokens": 33105130.0,
|
|
"step": 17950
|
|
},
|
|
{
|
|
"entropy": 5.596598339080811,
|
|
"epoch": 1.5084646082755724,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004774730864939869,
|
|
"loss": 5.378,
|
|
"mean_token_accuracy": 0.16594540178775788,
|
|
"num_tokens": 33113226.0,
|
|
"step": 17955
|
|
},
|
|
{
|
|
"entropy": 5.715326309204102,
|
|
"epoch": 1.5088846880907374,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047746002390903824,
|
|
"loss": 5.3872,
|
|
"mean_token_accuracy": 0.1708257630467415,
|
|
"num_tokens": 33120824.0,
|
|
"step": 17960
|
|
},
|
|
{
|
|
"entropy": 5.746819305419922,
|
|
"epoch": 1.5093047679059022,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004774469577376145,
|
|
"loss": 5.3633,
|
|
"mean_token_accuracy": 0.17433841079473494,
|
|
"num_tokens": 33129503.0,
|
|
"step": 17965
|
|
},
|
|
{
|
|
"entropy": 5.552629375457764,
|
|
"epoch": 1.509724847721067,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047743388797994715,
|
|
"loss": 5.2681,
|
|
"mean_token_accuracy": 0.17450862377882004,
|
|
"num_tokens": 33138838.0,
|
|
"step": 17970
|
|
},
|
|
{
|
|
"entropy": 5.621928453445435,
|
|
"epoch": 1.5101449275362318,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047742081463626767,
|
|
"loss": 5.3923,
|
|
"mean_token_accuracy": 0.16948231309652328,
|
|
"num_tokens": 33148142.0,
|
|
"step": 17975
|
|
},
|
|
{
|
|
"entropy": 5.645056867599488,
|
|
"epoch": 1.5105650073513968,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004774077377068078,
|
|
"loss": 5.3853,
|
|
"mean_token_accuracy": 0.16999683529138565,
|
|
"num_tokens": 33156750.0,
|
|
"step": 17980
|
|
},
|
|
{
|
|
"entropy": 5.755242204666137,
|
|
"epoch": 1.5109850871665618,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000477394657191799,
|
|
"loss": 5.5408,
|
|
"mean_token_accuracy": 0.15939399749040603,
|
|
"num_tokens": 33166511.0,
|
|
"step": 17985
|
|
},
|
|
{
|
|
"entropy": 5.70735993385315,
|
|
"epoch": 1.5114051669817266,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047738157309147307,
|
|
"loss": 5.4727,
|
|
"mean_token_accuracy": 0.16851068288087845,
|
|
"num_tokens": 33175812.0,
|
|
"step": 17990
|
|
},
|
|
{
|
|
"entropy": 5.578419828414917,
|
|
"epoch": 1.5118252467968913,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00047736848540606174,
|
|
"loss": 5.3388,
|
|
"mean_token_accuracy": 0.16674845963716506,
|
|
"num_tokens": 33185201.0,
|
|
"step": 17995
|
|
},
|
|
{
|
|
"entropy": 5.634521389007569,
|
|
"epoch": 1.5122453266120561,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000477355394135797,
|
|
"loss": 5.3332,
|
|
"mean_token_accuracy": 0.17178126722574233,
|
|
"num_tokens": 33195151.0,
|
|
"step": 18000
|
|
},
|
|
{
|
|
"epoch": 1.5122453266120561,
|
|
"eval_entropy": 5.504568783942394,
|
|
"eval_loss": 5.480621814727783,
|
|
"eval_mean_token_accuracy": 0.17380510120579043,
|
|
"eval_num_tokens": 33195151.0,
|
|
"eval_runtime": 27.2739,
|
|
"eval_samples_per_second": 1370.028,
|
|
"eval_steps_per_second": 171.263,
|
|
"step": 18000
|
|
},
|
|
{
|
|
"entropy": 5.7297890186309814,
|
|
"epoch": 1.5126654064272211,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004773422992809106,
|
|
"loss": 5.3859,
|
|
"mean_token_accuracy": 0.16926338374614716,
|
|
"num_tokens": 33204800.0,
|
|
"step": 18005
|
|
},
|
|
{
|
|
"entropy": 5.695334625244141,
|
|
"epoch": 1.5130854862423861,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004773292008416346,
|
|
"loss": 5.4322,
|
|
"mean_token_accuracy": 0.1651061251759529,
|
|
"num_tokens": 33214529.0,
|
|
"step": 18010
|
|
},
|
|
{
|
|
"entropy": 5.6870293617248535,
|
|
"epoch": 1.513505566057551,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047731609881820095,
|
|
"loss": 5.4368,
|
|
"mean_token_accuracy": 0.16418869495391847,
|
|
"num_tokens": 33224522.0,
|
|
"step": 18015
|
|
},
|
|
{
|
|
"entropy": 5.750136613845825,
|
|
"epoch": 1.5139256458727157,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00047730299321084173,
|
|
"loss": 5.4425,
|
|
"mean_token_accuracy": 0.16809688359498978,
|
|
"num_tokens": 33233220.0,
|
|
"step": 18020
|
|
},
|
|
{
|
|
"entropy": 5.716884803771973,
|
|
"epoch": 1.5143457256878807,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047728988401978916,
|
|
"loss": 5.3468,
|
|
"mean_token_accuracy": 0.173400317132473,
|
|
"num_tokens": 33242277.0,
|
|
"step": 18025
|
|
},
|
|
{
|
|
"entropy": 5.7281084060668945,
|
|
"epoch": 1.5147658055030457,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004772767712452756,
|
|
"loss": 5.4088,
|
|
"mean_token_accuracy": 0.17954297214746476,
|
|
"num_tokens": 33251113.0,
|
|
"step": 18030
|
|
},
|
|
{
|
|
"entropy": 5.60842080116272,
|
|
"epoch": 1.5151858853182105,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.00047726365488753305,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.15993862450122834,
|
|
"num_tokens": 33261055.0,
|
|
"step": 18035
|
|
},
|
|
{
|
|
"entropy": 5.685538625717163,
|
|
"epoch": 1.5156059651333753,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00047725053494679403,
|
|
"loss": 5.5104,
|
|
"mean_token_accuracy": 0.16750353425741196,
|
|
"num_tokens": 33270981.0,
|
|
"step": 18040
|
|
},
|
|
{
|
|
"entropy": 5.811197137832641,
|
|
"epoch": 1.51602604494854,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00047723741142329104,
|
|
"loss": 5.4511,
|
|
"mean_token_accuracy": 0.16344697326421737,
|
|
"num_tokens": 33279516.0,
|
|
"step": 18045
|
|
},
|
|
{
|
|
"entropy": 5.623986768722534,
|
|
"epoch": 1.516446124763705,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00047722428431725637,
|
|
"loss": 5.372,
|
|
"mean_token_accuracy": 0.17835780680179597,
|
|
"num_tokens": 33288300.0,
|
|
"step": 18050
|
|
},
|
|
{
|
|
"entropy": 5.646885824203491,
|
|
"epoch": 1.5168662045788701,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004772111536289226,
|
|
"loss": 5.4115,
|
|
"mean_token_accuracy": 0.1641728550195694,
|
|
"num_tokens": 33299059.0,
|
|
"step": 18055
|
|
},
|
|
{
|
|
"entropy": 5.689133930206299,
|
|
"epoch": 1.517286284394035,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047719801935852235,
|
|
"loss": 5.468,
|
|
"mean_token_accuracy": 0.16429835706949233,
|
|
"num_tokens": 33308879.0,
|
|
"step": 18060
|
|
},
|
|
{
|
|
"entropy": 5.763861560821534,
|
|
"epoch": 1.5177063642091997,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004771848815062883,
|
|
"loss": 5.5568,
|
|
"mean_token_accuracy": 0.1608145996928215,
|
|
"num_tokens": 33318615.0,
|
|
"step": 18065
|
|
},
|
|
{
|
|
"entropy": 5.809006929397583,
|
|
"epoch": 1.5181264440243645,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004771717400724532,
|
|
"loss": 5.5845,
|
|
"mean_token_accuracy": 0.15996418967843057,
|
|
"num_tokens": 33328748.0,
|
|
"step": 18070
|
|
},
|
|
{
|
|
"entropy": 5.765374803543091,
|
|
"epoch": 1.5185465238395295,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004771585950572499,
|
|
"loss": 5.3919,
|
|
"mean_token_accuracy": 0.16406020075082778,
|
|
"num_tokens": 33338350.0,
|
|
"step": 18075
|
|
},
|
|
{
|
|
"entropy": 5.623263883590698,
|
|
"epoch": 1.5189666036546945,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004771454464609111,
|
|
"loss": 5.4011,
|
|
"mean_token_accuracy": 0.16918568760156633,
|
|
"num_tokens": 33348202.0,
|
|
"step": 18080
|
|
},
|
|
{
|
|
"entropy": 5.613306331634521,
|
|
"epoch": 1.5193866834698593,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004771322942836699,
|
|
"loss": 5.3967,
|
|
"mean_token_accuracy": 0.16765800267457961,
|
|
"num_tokens": 33356996.0,
|
|
"step": 18085
|
|
},
|
|
{
|
|
"entropy": 5.791823196411133,
|
|
"epoch": 1.519806763285024,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004771191385257592,
|
|
"loss": 5.5247,
|
|
"mean_token_accuracy": 0.16046885251998902,
|
|
"num_tokens": 33366173.0,
|
|
"step": 18090
|
|
},
|
|
{
|
|
"entropy": 5.713813591003418,
|
|
"epoch": 1.520226843100189,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004771059791874119,
|
|
"loss": 5.4365,
|
|
"mean_token_accuracy": 0.15948131680488586,
|
|
"num_tokens": 33375921.0,
|
|
"step": 18095
|
|
},
|
|
{
|
|
"entropy": 5.6319067001342775,
|
|
"epoch": 1.520646922915354,
|
|
"grad_norm": 2.9375,
|
|
"learning_rate": 0.0004770928162688613,
|
|
"loss": 5.4232,
|
|
"mean_token_accuracy": 0.16363133490085602,
|
|
"num_tokens": 33385538.0,
|
|
"step": 18100
|
|
},
|
|
{
|
|
"entropy": 5.633490324020386,
|
|
"epoch": 1.5210670027305189,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00047707964977034055,
|
|
"loss": 5.3274,
|
|
"mean_token_accuracy": 0.18080521374940872,
|
|
"num_tokens": 33393728.0,
|
|
"step": 18105
|
|
},
|
|
{
|
|
"entropy": 5.776975011825561,
|
|
"epoch": 1.5214870825456837,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004770664796920828,
|
|
"loss": 5.4259,
|
|
"mean_token_accuracy": 0.1658819019794464,
|
|
"num_tokens": 33402540.0,
|
|
"step": 18110
|
|
},
|
|
{
|
|
"entropy": 5.648982095718384,
|
|
"epoch": 1.5219071623608484,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004770533060343215,
|
|
"loss": 5.3993,
|
|
"mean_token_accuracy": 0.1668563425540924,
|
|
"num_tokens": 33411706.0,
|
|
"step": 18115
|
|
},
|
|
{
|
|
"entropy": 5.619913053512573,
|
|
"epoch": 1.5223272421760135,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004770401287972899,
|
|
"loss": 5.346,
|
|
"mean_token_accuracy": 0.17197668105363845,
|
|
"num_tokens": 33420604.0,
|
|
"step": 18120
|
|
},
|
|
{
|
|
"entropy": 5.612928819656372,
|
|
"epoch": 1.5227473219911785,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047702694798122143,
|
|
"loss": 5.3312,
|
|
"mean_token_accuracy": 0.18267546892166137,
|
|
"num_tokens": 33429558.0,
|
|
"step": 18125
|
|
},
|
|
{
|
|
"entropy": 5.845659017562866,
|
|
"epoch": 1.5231674018063432,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047701376358634957,
|
|
"loss": 5.5331,
|
|
"mean_token_accuracy": 0.16271810382604598,
|
|
"num_tokens": 33439620.0,
|
|
"step": 18130
|
|
},
|
|
{
|
|
"entropy": 5.746625709533691,
|
|
"epoch": 1.523587481621508,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00047700057561290797,
|
|
"loss": 5.4849,
|
|
"mean_token_accuracy": 0.1619314581155777,
|
|
"num_tokens": 33449067.0,
|
|
"step": 18135
|
|
},
|
|
{
|
|
"entropy": 5.6104577541351315,
|
|
"epoch": 1.5240075614366728,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004769873840611302,
|
|
"loss": 5.388,
|
|
"mean_token_accuracy": 0.17093031108379364,
|
|
"num_tokens": 33458089.0,
|
|
"step": 18140
|
|
},
|
|
{
|
|
"entropy": 5.674795293807984,
|
|
"epoch": 1.5244276412518378,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004769741889312499,
|
|
"loss": 5.4976,
|
|
"mean_token_accuracy": 0.1689228668808937,
|
|
"num_tokens": 33466883.0,
|
|
"step": 18145
|
|
},
|
|
{
|
|
"entropy": 5.725237464904785,
|
|
"epoch": 1.5248477210670028,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047696099022350087,
|
|
"loss": 5.5247,
|
|
"mean_token_accuracy": 0.15924528241157532,
|
|
"num_tokens": 33476649.0,
|
|
"step": 18150
|
|
},
|
|
{
|
|
"entropy": 5.798870325088501,
|
|
"epoch": 1.5252678008821676,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00047694778793811685,
|
|
"loss": 5.4913,
|
|
"mean_token_accuracy": 0.16371531635522843,
|
|
"num_tokens": 33486274.0,
|
|
"step": 18155
|
|
},
|
|
{
|
|
"entropy": 5.731025695800781,
|
|
"epoch": 1.5256878806973324,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00047693458207533177,
|
|
"loss": 5.3745,
|
|
"mean_token_accuracy": 0.1666399672627449,
|
|
"num_tokens": 33494950.0,
|
|
"step": 18160
|
|
},
|
|
{
|
|
"entropy": 5.659780883789063,
|
|
"epoch": 1.5261079605124974,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004769213726353795,
|
|
"loss": 5.3996,
|
|
"mean_token_accuracy": 0.1708945393562317,
|
|
"num_tokens": 33503545.0,
|
|
"step": 18165
|
|
},
|
|
{
|
|
"entropy": 5.648102521896362,
|
|
"epoch": 1.5265280403276622,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047690815961849416,
|
|
"loss": 5.4462,
|
|
"mean_token_accuracy": 0.1661043107509613,
|
|
"num_tokens": 33512871.0,
|
|
"step": 18170
|
|
},
|
|
{
|
|
"entropy": 5.623683214187622,
|
|
"epoch": 1.5269481201428272,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004768949430249097,
|
|
"loss": 5.3626,
|
|
"mean_token_accuracy": 0.16892132312059402,
|
|
"num_tokens": 33521933.0,
|
|
"step": 18175
|
|
},
|
|
{
|
|
"entropy": 5.672886848449707,
|
|
"epoch": 1.527368199957992,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004768817228548603,
|
|
"loss": 5.3511,
|
|
"mean_token_accuracy": 0.1706907257437706,
|
|
"num_tokens": 33531370.0,
|
|
"step": 18180
|
|
},
|
|
{
|
|
"entropy": 5.755971002578735,
|
|
"epoch": 1.5277882797731568,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004768684991085802,
|
|
"loss": 5.4365,
|
|
"mean_token_accuracy": 0.16248024702072145,
|
|
"num_tokens": 33540310.0,
|
|
"step": 18185
|
|
},
|
|
{
|
|
"entropy": 5.687887954711914,
|
|
"epoch": 1.5282083595883218,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00047685527178630347,
|
|
"loss": 5.4598,
|
|
"mean_token_accuracy": 0.16537974774837494,
|
|
"num_tokens": 33549943.0,
|
|
"step": 18190
|
|
},
|
|
{
|
|
"entropy": 5.752259922027588,
|
|
"epoch": 1.5286284394034868,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004768420408882646,
|
|
"loss": 5.5298,
|
|
"mean_token_accuracy": 0.16441700905561446,
|
|
"num_tokens": 33560167.0,
|
|
"step": 18195
|
|
},
|
|
{
|
|
"entropy": 5.757403898239136,
|
|
"epoch": 1.5290485192186516,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00047682880641469787,
|
|
"loss": 5.4111,
|
|
"mean_token_accuracy": 0.16261017471551895,
|
|
"num_tokens": 33569604.0,
|
|
"step": 18200
|
|
},
|
|
{
|
|
"entropy": 5.701638650894165,
|
|
"epoch": 1.5294685990338164,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004768155683658378,
|
|
"loss": 5.3972,
|
|
"mean_token_accuracy": 0.168385748565197,
|
|
"num_tokens": 33578400.0,
|
|
"step": 18205
|
|
},
|
|
{
|
|
"entropy": 5.596540117263794,
|
|
"epoch": 1.5298886788489812,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004768023267419188,
|
|
"loss": 5.3728,
|
|
"mean_token_accuracy": 0.16698229908943177,
|
|
"num_tokens": 33587527.0,
|
|
"step": 18210
|
|
},
|
|
{
|
|
"entropy": 5.585406541824341,
|
|
"epoch": 1.5303087586641462,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004767890815431756,
|
|
"loss": 5.31,
|
|
"mean_token_accuracy": 0.1722709432244301,
|
|
"num_tokens": 33596026.0,
|
|
"step": 18215
|
|
},
|
|
{
|
|
"entropy": 5.698364782333374,
|
|
"epoch": 1.5307288384793112,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047677583276984264,
|
|
"loss": 5.3995,
|
|
"mean_token_accuracy": 0.16997897922992705,
|
|
"num_tokens": 33605906.0,
|
|
"step": 18220
|
|
},
|
|
{
|
|
"entropy": 5.687321901321411,
|
|
"epoch": 1.531148918294476,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004767625804221548,
|
|
"loss": 5.36,
|
|
"mean_token_accuracy": 0.17047615945339203,
|
|
"num_tokens": 33615758.0,
|
|
"step": 18225
|
|
},
|
|
{
|
|
"entropy": 5.662997770309448,
|
|
"epoch": 1.5315689981096408,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004767493245003466,
|
|
"loss": 5.4245,
|
|
"mean_token_accuracy": 0.18040256053209305,
|
|
"num_tokens": 33625486.0,
|
|
"step": 18230
|
|
},
|
|
{
|
|
"entropy": 5.663189315795899,
|
|
"epoch": 1.5319890779248058,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00047673606500465315,
|
|
"loss": 5.3718,
|
|
"mean_token_accuracy": 0.17638310939073562,
|
|
"num_tokens": 33633954.0,
|
|
"step": 18235
|
|
},
|
|
{
|
|
"entropy": 5.633836793899536,
|
|
"epoch": 1.5324091577399706,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000476722801935309,
|
|
"loss": 5.4511,
|
|
"mean_token_accuracy": 0.166046205163002,
|
|
"num_tokens": 33642478.0,
|
|
"step": 18240
|
|
},
|
|
{
|
|
"entropy": 5.6103380680084225,
|
|
"epoch": 1.5328292375551356,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004767095352925495,
|
|
"loss": 5.3701,
|
|
"mean_token_accuracy": 0.1702152296900749,
|
|
"num_tokens": 33650785.0,
|
|
"step": 18245
|
|
},
|
|
{
|
|
"entropy": 5.659248542785645,
|
|
"epoch": 1.5332493173703003,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004766962650766093,
|
|
"loss": 5.3337,
|
|
"mean_token_accuracy": 0.17309417128562926,
|
|
"num_tokens": 33659677.0,
|
|
"step": 18250
|
|
},
|
|
{
|
|
"entropy": 5.716655015945435,
|
|
"epoch": 1.5336693971854651,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00047668299128772365,
|
|
"loss": 5.5052,
|
|
"mean_token_accuracy": 0.1620546281337738,
|
|
"num_tokens": 33669493.0,
|
|
"step": 18255
|
|
},
|
|
{
|
|
"entropy": 5.766137742996216,
|
|
"epoch": 1.5340894770006301,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004766697139261277,
|
|
"loss": 5.4809,
|
|
"mean_token_accuracy": 0.1693834885954857,
|
|
"num_tokens": 33678446.0,
|
|
"step": 18260
|
|
},
|
|
{
|
|
"entropy": 5.688551139831543,
|
|
"epoch": 1.5345095568157952,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004766564329920566,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.17938026487827302,
|
|
"num_tokens": 33687647.0,
|
|
"step": 18265
|
|
},
|
|
{
|
|
"entropy": 5.66825041770935,
|
|
"epoch": 1.53492963663096,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004766431484857456,
|
|
"loss": 5.4354,
|
|
"mean_token_accuracy": 0.1683764412999153,
|
|
"num_tokens": 33697395.0,
|
|
"step": 18270
|
|
},
|
|
{
|
|
"entropy": 5.6449426174163815,
|
|
"epoch": 1.5353497164461247,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047662986040743004,
|
|
"loss": 5.4179,
|
|
"mean_token_accuracy": 0.1762421429157257,
|
|
"num_tokens": 33706779.0,
|
|
"step": 18275
|
|
},
|
|
{
|
|
"entropy": 5.6319070816040036,
|
|
"epoch": 1.5357697962612895,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004766165687573454,
|
|
"loss": 5.399,
|
|
"mean_token_accuracy": 0.16638792753219606,
|
|
"num_tokens": 33714828.0,
|
|
"step": 18280
|
|
},
|
|
{
|
|
"entropy": 5.7225525856018065,
|
|
"epoch": 1.5361898760764545,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.000476603273535727,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.16816843450069427,
|
|
"num_tokens": 33724730.0,
|
|
"step": 18285
|
|
},
|
|
{
|
|
"entropy": 5.7629804611206055,
|
|
"epoch": 1.5366099558916195,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004765899747428104,
|
|
"loss": 5.4813,
|
|
"mean_token_accuracy": 0.16490163505077363,
|
|
"num_tokens": 33734374.0,
|
|
"step": 18290
|
|
},
|
|
{
|
|
"entropy": 5.7630139827728275,
|
|
"epoch": 1.5370300357067843,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00047657667237883125,
|
|
"loss": 5.4618,
|
|
"mean_token_accuracy": 0.17239924520254135,
|
|
"num_tokens": 33743395.0,
|
|
"step": 18295
|
|
},
|
|
{
|
|
"entropy": 5.72203483581543,
|
|
"epoch": 1.537450115521949,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00047656336644402513,
|
|
"loss": 5.5038,
|
|
"mean_token_accuracy": 0.1658702626824379,
|
|
"num_tokens": 33752526.0,
|
|
"step": 18300
|
|
},
|
|
{
|
|
"entropy": 5.73434624671936,
|
|
"epoch": 1.5378701953371139,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004765500569386278,
|
|
"loss": 5.4341,
|
|
"mean_token_accuracy": 0.17372529208660126,
|
|
"num_tokens": 33761310.0,
|
|
"step": 18305
|
|
},
|
|
{
|
|
"entropy": 5.630677986145019,
|
|
"epoch": 1.538290275152279,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.000476536743862875,
|
|
"loss": 5.3564,
|
|
"mean_token_accuracy": 0.17069067656993867,
|
|
"num_tokens": 33770870.0,
|
|
"step": 18310
|
|
},
|
|
{
|
|
"entropy": 5.587197399139404,
|
|
"epoch": 1.538710354967444,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047652342721700246,
|
|
"loss": 5.3123,
|
|
"mean_token_accuracy": 0.16748333871364593,
|
|
"num_tokens": 33779648.0,
|
|
"step": 18315
|
|
},
|
|
{
|
|
"entropy": 5.689319229125976,
|
|
"epoch": 1.5391304347826087,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004765101070012462,
|
|
"loss": 5.5059,
|
|
"mean_token_accuracy": 0.1615031287074089,
|
|
"num_tokens": 33789172.0,
|
|
"step": 18320
|
|
},
|
|
{
|
|
"entropy": 5.810400390625,
|
|
"epoch": 1.5395505145977735,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00047649678321584214,
|
|
"loss": 5.4895,
|
|
"mean_token_accuracy": 0.15798811763525009,
|
|
"num_tokens": 33798069.0,
|
|
"step": 18325
|
|
},
|
|
{
|
|
"entropy": 5.732732534408569,
|
|
"epoch": 1.5399705944129385,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047648345586102643,
|
|
"loss": 5.4397,
|
|
"mean_token_accuracy": 0.16982662975788115,
|
|
"num_tokens": 33806214.0,
|
|
"step": 18330
|
|
},
|
|
{
|
|
"entropy": 5.712227535247803,
|
|
"epoch": 1.5403906742281035,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.000476470124937035,
|
|
"loss": 5.4266,
|
|
"mean_token_accuracy": 0.17047962546348572,
|
|
"num_tokens": 33815365.0,
|
|
"step": 18335
|
|
},
|
|
{
|
|
"entropy": 5.728869104385376,
|
|
"epoch": 1.5408107540432683,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000476456790444104,
|
|
"loss": 5.3487,
|
|
"mean_token_accuracy": 0.17773585617542267,
|
|
"num_tokens": 33825204.0,
|
|
"step": 18340
|
|
},
|
|
{
|
|
"entropy": 5.687373256683349,
|
|
"epoch": 1.541230833858433,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004764434523824697,
|
|
"loss": 5.4619,
|
|
"mean_token_accuracy": 0.1697180077433586,
|
|
"num_tokens": 33834439.0,
|
|
"step": 18345
|
|
},
|
|
{
|
|
"entropy": 5.622870349884034,
|
|
"epoch": 1.5416509136735979,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00047643011075236845,
|
|
"loss": 5.4381,
|
|
"mean_token_accuracy": 0.1638789251446724,
|
|
"num_tokens": 33843959.0,
|
|
"step": 18350
|
|
},
|
|
{
|
|
"entropy": 5.776487016677857,
|
|
"epoch": 1.5420709934887629,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047641676555403646,
|
|
"loss": 5.4804,
|
|
"mean_token_accuracy": 0.15986314862966539,
|
|
"num_tokens": 33853234.0,
|
|
"step": 18355
|
|
},
|
|
{
|
|
"entropy": 5.695157814025879,
|
|
"epoch": 1.5424910733039279,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004764034167877102,
|
|
"loss": 5.3797,
|
|
"mean_token_accuracy": 0.16742191165685655,
|
|
"num_tokens": 33861755.0,
|
|
"step": 18360
|
|
},
|
|
{
|
|
"entropy": 5.719500398635864,
|
|
"epoch": 1.5429111531190927,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00047639006445362607,
|
|
"loss": 5.4946,
|
|
"mean_token_accuracy": 0.16939375996589662,
|
|
"num_tokens": 33870956.0,
|
|
"step": 18365
|
|
},
|
|
{
|
|
"entropy": 5.639527320861816,
|
|
"epoch": 1.5433312329342574,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004763767085520207,
|
|
"loss": 5.3368,
|
|
"mean_token_accuracy": 0.17298437505960465,
|
|
"num_tokens": 33880568.0,
|
|
"step": 18370
|
|
},
|
|
{
|
|
"entropy": 5.727531051635742,
|
|
"epoch": 1.5437513127494222,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004763633490831306,
|
|
"loss": 5.5471,
|
|
"mean_token_accuracy": 0.15493866950273513,
|
|
"num_tokens": 33890145.0,
|
|
"step": 18375
|
|
},
|
|
{
|
|
"entropy": 5.6116053581237795,
|
|
"epoch": 1.5441713925645872,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004763499860471925,
|
|
"loss": 5.3965,
|
|
"mean_token_accuracy": 0.16893347650766372,
|
|
"num_tokens": 33899155.0,
|
|
"step": 18380
|
|
},
|
|
{
|
|
"entropy": 5.6794798374176025,
|
|
"epoch": 1.5445914723797523,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000476336619444443,
|
|
"loss": 5.4366,
|
|
"mean_token_accuracy": 0.16216899007558822,
|
|
"num_tokens": 33909410.0,
|
|
"step": 18385
|
|
},
|
|
{
|
|
"entropy": 5.643740177154541,
|
|
"epoch": 1.545011552194917,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000476323249275119,
|
|
"loss": 5.3071,
|
|
"mean_token_accuracy": 0.17813037484884262,
|
|
"num_tokens": 33918451.0,
|
|
"step": 18390
|
|
},
|
|
{
|
|
"entropy": 5.5850482940673825,
|
|
"epoch": 1.5454316320100818,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004763098755394573,
|
|
"loss": 5.3449,
|
|
"mean_token_accuracy": 0.17247679233551025,
|
|
"num_tokens": 33928317.0,
|
|
"step": 18395
|
|
},
|
|
{
|
|
"entropy": 5.704434871673584,
|
|
"epoch": 1.5458517118252468,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004762964982376949,
|
|
"loss": 5.5166,
|
|
"mean_token_accuracy": 0.16591467410326005,
|
|
"num_tokens": 33938010.0,
|
|
"step": 18400
|
|
},
|
|
{
|
|
"entropy": 5.716954278945923,
|
|
"epoch": 1.5462717916404118,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00047628311737006856,
|
|
"loss": 5.3336,
|
|
"mean_token_accuracy": 0.1735645353794098,
|
|
"num_tokens": 33946964.0,
|
|
"step": 18405
|
|
},
|
|
{
|
|
"entropy": 5.686046504974366,
|
|
"epoch": 1.5466918714555766,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047626973293681555,
|
|
"loss": 5.349,
|
|
"mean_token_accuracy": 0.16914291232824324,
|
|
"num_tokens": 33956026.0,
|
|
"step": 18410
|
|
},
|
|
{
|
|
"entropy": 5.612794685363769,
|
|
"epoch": 1.5471119512707414,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004762563449381728,
|
|
"loss": 5.3924,
|
|
"mean_token_accuracy": 0.16146431416273116,
|
|
"num_tokens": 33965787.0,
|
|
"step": 18415
|
|
},
|
|
{
|
|
"entropy": 5.663423538208008,
|
|
"epoch": 1.5475320310859062,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00047624295337437753,
|
|
"loss": 5.4273,
|
|
"mean_token_accuracy": 0.1688649833202362,
|
|
"num_tokens": 33974178.0,
|
|
"step": 18420
|
|
},
|
|
{
|
|
"entropy": 5.628804731369018,
|
|
"epoch": 1.5479521109010712,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004762295582456669,
|
|
"loss": 5.2858,
|
|
"mean_token_accuracy": 0.17369863390922546,
|
|
"num_tokens": 33983652.0,
|
|
"step": 18425
|
|
},
|
|
{
|
|
"entropy": 5.696892833709716,
|
|
"epoch": 1.5483721907162362,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047621615955227835,
|
|
"loss": 5.3687,
|
|
"mean_token_accuracy": 0.1774067535996437,
|
|
"num_tokens": 33991938.0,
|
|
"step": 18430
|
|
},
|
|
{
|
|
"entropy": 5.6132800579071045,
|
|
"epoch": 1.548792270531401,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004762027572944491,
|
|
"loss": 5.3544,
|
|
"mean_token_accuracy": 0.16801770478487016,
|
|
"num_tokens": 33999918.0,
|
|
"step": 18435
|
|
},
|
|
{
|
|
"entropy": 5.5902656555175785,
|
|
"epoch": 1.5492123503465658,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00047618935147241667,
|
|
"loss": 5.3731,
|
|
"mean_token_accuracy": 0.17459045797586442,
|
|
"num_tokens": 34008416.0,
|
|
"step": 18440
|
|
},
|
|
{
|
|
"entropy": 5.701586627960205,
|
|
"epoch": 1.5496324301617306,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 0.0004761759420864184,
|
|
"loss": 5.4532,
|
|
"mean_token_accuracy": 0.16581283658742904,
|
|
"num_tokens": 34017616.0,
|
|
"step": 18445
|
|
},
|
|
{
|
|
"entropy": 5.712861061096191,
|
|
"epoch": 1.5500525099768956,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000476162529136692,
|
|
"loss": 5.3818,
|
|
"mean_token_accuracy": 0.17086593359708785,
|
|
"num_tokens": 34026064.0,
|
|
"step": 18450
|
|
},
|
|
{
|
|
"entropy": 5.564694118499756,
|
|
"epoch": 1.5504725897920606,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004761491126234749,
|
|
"loss": 5.2959,
|
|
"mean_token_accuracy": 0.1739438533782959,
|
|
"num_tokens": 34035378.0,
|
|
"step": 18455
|
|
},
|
|
{
|
|
"entropy": 5.6146468162536625,
|
|
"epoch": 1.5508926696072254,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004761356925470049,
|
|
"loss": 5.3503,
|
|
"mean_token_accuracy": 0.1704146921634674,
|
|
"num_tokens": 34044600.0,
|
|
"step": 18460
|
|
},
|
|
{
|
|
"entropy": 5.710069417953491,
|
|
"epoch": 1.5513127494223902,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00047612226890751956,
|
|
"loss": 5.4336,
|
|
"mean_token_accuracy": 0.16696672439575194,
|
|
"num_tokens": 34054680.0,
|
|
"step": 18465
|
|
},
|
|
{
|
|
"entropy": 5.65276689529419,
|
|
"epoch": 1.5517328292375552,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047610884170525697,
|
|
"loss": 5.3498,
|
|
"mean_token_accuracy": 0.1752360135316849,
|
|
"num_tokens": 34063034.0,
|
|
"step": 18470
|
|
},
|
|
{
|
|
"entropy": 5.627860975265503,
|
|
"epoch": 1.55215290905272,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004760954109404547,
|
|
"loss": 5.351,
|
|
"mean_token_accuracy": 0.17447586208581925,
|
|
"num_tokens": 34072122.0,
|
|
"step": 18475
|
|
},
|
|
{
|
|
"entropy": 5.674824905395508,
|
|
"epoch": 1.552572988867885,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.0004760819766133508,
|
|
"loss": 5.3586,
|
|
"mean_token_accuracy": 0.16940293908119203,
|
|
"num_tokens": 34081493.0,
|
|
"step": 18480
|
|
},
|
|
{
|
|
"entropy": 5.668784093856812,
|
|
"epoch": 1.5529930686830498,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00047606853872418317,
|
|
"loss": 5.4445,
|
|
"mean_token_accuracy": 0.16179682463407516,
|
|
"num_tokens": 34090872.0,
|
|
"step": 18485
|
|
},
|
|
{
|
|
"entropy": 5.624145078659057,
|
|
"epoch": 1.5534131484982145,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004760550972731899,
|
|
"loss": 5.3052,
|
|
"mean_token_accuracy": 0.1742589369416237,
|
|
"num_tokens": 34100729.0,
|
|
"step": 18490
|
|
},
|
|
{
|
|
"entropy": 5.540934467315674,
|
|
"epoch": 1.5538332283133796,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004760416522606092,
|
|
"loss": 5.2939,
|
|
"mean_token_accuracy": 0.1751124456524849,
|
|
"num_tokens": 34109492.0,
|
|
"step": 18495
|
|
},
|
|
{
|
|
"entropy": 5.580523681640625,
|
|
"epoch": 1.5542533081285446,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004760282036866791,
|
|
"loss": 5.4,
|
|
"mean_token_accuracy": 0.17484914511442184,
|
|
"num_tokens": 34119529.0,
|
|
"step": 18500
|
|
},
|
|
{
|
|
"entropy": 5.76246075630188,
|
|
"epoch": 1.5546733879437094,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004760147515516379,
|
|
"loss": 5.4362,
|
|
"mean_token_accuracy": 0.1649763211607933,
|
|
"num_tokens": 34128261.0,
|
|
"step": 18505
|
|
},
|
|
{
|
|
"entropy": 5.6341499328613285,
|
|
"epoch": 1.5550934677588741,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047600129585572386,
|
|
"loss": 5.4324,
|
|
"mean_token_accuracy": 0.17126839607954025,
|
|
"num_tokens": 34136916.0,
|
|
"step": 18510
|
|
},
|
|
{
|
|
"entropy": 5.713113260269165,
|
|
"epoch": 1.555513547574039,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004759878365991754,
|
|
"loss": 5.3471,
|
|
"mean_token_accuracy": 0.17166002988815307,
|
|
"num_tokens": 34146400.0,
|
|
"step": 18515
|
|
},
|
|
{
|
|
"entropy": 5.674141216278076,
|
|
"epoch": 1.555933627389204,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004759743737822309,
|
|
"loss": 5.3993,
|
|
"mean_token_accuracy": 0.1691730111837387,
|
|
"num_tokens": 34155611.0,
|
|
"step": 18520
|
|
},
|
|
{
|
|
"entropy": 5.65713849067688,
|
|
"epoch": 1.556353707204369,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.00047596090740512884,
|
|
"loss": 5.428,
|
|
"mean_token_accuracy": 0.1695108011364937,
|
|
"num_tokens": 34165301.0,
|
|
"step": 18525
|
|
},
|
|
{
|
|
"entropy": 5.70047779083252,
|
|
"epoch": 1.5567737870195337,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.00047594743746810786,
|
|
"loss": 5.4018,
|
|
"mean_token_accuracy": 0.16435787677764893,
|
|
"num_tokens": 34174655.0,
|
|
"step": 18530
|
|
},
|
|
{
|
|
"entropy": 5.802553367614746,
|
|
"epoch": 1.5571938668346985,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047593396397140644,
|
|
"loss": 5.5507,
|
|
"mean_token_accuracy": 0.1595836400985718,
|
|
"num_tokens": 34184293.0,
|
|
"step": 18535
|
|
},
|
|
{
|
|
"entropy": 5.7214781761169435,
|
|
"epoch": 1.5576139466498635,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004759204869152632,
|
|
"loss": 5.4373,
|
|
"mean_token_accuracy": 0.16149042397737504,
|
|
"num_tokens": 34193025.0,
|
|
"step": 18540
|
|
},
|
|
{
|
|
"entropy": 5.620850515365601,
|
|
"epoch": 1.5580340264650283,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004759070062999171,
|
|
"loss": 5.3478,
|
|
"mean_token_accuracy": 0.1678580015897751,
|
|
"num_tokens": 34201082.0,
|
|
"step": 18545
|
|
},
|
|
{
|
|
"entropy": 5.739461946487427,
|
|
"epoch": 1.5584541062801933,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004758935221256069,
|
|
"loss": 5.4907,
|
|
"mean_token_accuracy": 0.16538347899913788,
|
|
"num_tokens": 34211210.0,
|
|
"step": 18550
|
|
},
|
|
{
|
|
"entropy": 5.702043962478638,
|
|
"epoch": 1.558874186095358,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00047588003439257134,
|
|
"loss": 5.4279,
|
|
"mean_token_accuracy": 0.1693740040063858,
|
|
"num_tokens": 34220309.0,
|
|
"step": 18555
|
|
},
|
|
{
|
|
"entropy": 5.728823947906494,
|
|
"epoch": 1.559294265910523,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00047586654310104946,
|
|
"loss": 5.4202,
|
|
"mean_token_accuracy": 0.1592714488506317,
|
|
"num_tokens": 34229532.0,
|
|
"step": 18560
|
|
},
|
|
{
|
|
"entropy": 5.792129182815552,
|
|
"epoch": 1.559714345725688,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004758530482512801,
|
|
"loss": 5.6455,
|
|
"mean_token_accuracy": 0.15465014576911926,
|
|
"num_tokens": 34239543.0,
|
|
"step": 18565
|
|
},
|
|
{
|
|
"entropy": 5.7673375606536865,
|
|
"epoch": 1.560134425540853,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004758395498435024,
|
|
"loss": 5.4486,
|
|
"mean_token_accuracy": 0.16822385787963867,
|
|
"num_tokens": 34248654.0,
|
|
"step": 18570
|
|
},
|
|
{
|
|
"entropy": 5.71659140586853,
|
|
"epoch": 1.5605545053560177,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.00047582604787795555,
|
|
"loss": 5.4313,
|
|
"mean_token_accuracy": 0.16151682287454605,
|
|
"num_tokens": 34258757.0,
|
|
"step": 18575
|
|
},
|
|
{
|
|
"entropy": 5.668481111526489,
|
|
"epoch": 1.5609745851711825,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0004758125423548787,
|
|
"loss": 5.4308,
|
|
"mean_token_accuracy": 0.1640526682138443,
|
|
"num_tokens": 34268253.0,
|
|
"step": 18580
|
|
},
|
|
{
|
|
"entropy": 5.759385299682617,
|
|
"epoch": 1.5613946649863473,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047579903327451097,
|
|
"loss": 5.4909,
|
|
"mean_token_accuracy": 0.1663891091942787,
|
|
"num_tokens": 34277361.0,
|
|
"step": 18585
|
|
},
|
|
{
|
|
"entropy": 5.640477037429809,
|
|
"epoch": 1.5618147448015123,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004757855206370919,
|
|
"loss": 5.3618,
|
|
"mean_token_accuracy": 0.16783252209424973,
|
|
"num_tokens": 34285923.0,
|
|
"step": 18590
|
|
},
|
|
{
|
|
"entropy": 5.600503778457641,
|
|
"epoch": 1.5622348246166773,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00047577200444286064,
|
|
"loss": 5.3768,
|
|
"mean_token_accuracy": 0.1716615855693817,
|
|
"num_tokens": 34296300.0,
|
|
"step": 18595
|
|
},
|
|
{
|
|
"entropy": 5.766132545471192,
|
|
"epoch": 1.562654904431842,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004757584846920567,
|
|
"loss": 5.4101,
|
|
"mean_token_accuracy": 0.16635561734437943,
|
|
"num_tokens": 34305757.0,
|
|
"step": 18600
|
|
},
|
|
{
|
|
"entropy": 5.677987813949585,
|
|
"epoch": 1.5630749842470069,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004757449613849196,
|
|
"loss": 5.4464,
|
|
"mean_token_accuracy": 0.16000643074512483,
|
|
"num_tokens": 34314714.0,
|
|
"step": 18605
|
|
},
|
|
{
|
|
"entropy": 5.718979597091675,
|
|
"epoch": 1.5634950640621716,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047573143452168883,
|
|
"loss": 5.473,
|
|
"mean_token_accuracy": 0.16973401680588723,
|
|
"num_tokens": 34323501.0,
|
|
"step": 18610
|
|
},
|
|
{
|
|
"entropy": 5.73273868560791,
|
|
"epoch": 1.5639151438773367,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00047571790410260405,
|
|
"loss": 5.4017,
|
|
"mean_token_accuracy": 0.17346812933683395,
|
|
"num_tokens": 34331752.0,
|
|
"step": 18615
|
|
},
|
|
{
|
|
"entropy": 5.717556381225586,
|
|
"epoch": 1.5643352236925017,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.000475704370127905,
|
|
"loss": 5.4609,
|
|
"mean_token_accuracy": 0.16100564748048782,
|
|
"num_tokens": 34341479.0,
|
|
"step": 18620
|
|
},
|
|
{
|
|
"entropy": 5.6721264839172365,
|
|
"epoch": 1.5647553035076665,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004756908325978314,
|
|
"loss": 5.4556,
|
|
"mean_token_accuracy": 0.1629202827811241,
|
|
"num_tokens": 34350991.0,
|
|
"step": 18625
|
|
},
|
|
{
|
|
"entropy": 5.697770977020264,
|
|
"epoch": 1.5651753833228312,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00047567729151262305,
|
|
"loss": 5.3765,
|
|
"mean_token_accuracy": 0.16833187639713287,
|
|
"num_tokens": 34360089.0,
|
|
"step": 18630
|
|
},
|
|
{
|
|
"entropy": 5.693409872055054,
|
|
"epoch": 1.5655954631379962,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004756637468725198,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.17019174247980118,
|
|
"num_tokens": 34370352.0,
|
|
"step": 18635
|
|
},
|
|
{
|
|
"entropy": 5.6412163257598875,
|
|
"epoch": 1.5660155429531613,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004756501986777616,
|
|
"loss": 5.3334,
|
|
"mean_token_accuracy": 0.1646198183298111,
|
|
"num_tokens": 34378958.0,
|
|
"step": 18640
|
|
},
|
|
{
|
|
"entropy": 5.577014398574829,
|
|
"epoch": 1.566435622768326,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00047563664692858843,
|
|
"loss": 5.3075,
|
|
"mean_token_accuracy": 0.17557633221149443,
|
|
"num_tokens": 34387723.0,
|
|
"step": 18645
|
|
},
|
|
{
|
|
"entropy": 5.667887926101685,
|
|
"epoch": 1.5668557025834908,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004756230916252404,
|
|
"loss": 5.4322,
|
|
"mean_token_accuracy": 0.17246091961860657,
|
|
"num_tokens": 34397089.0,
|
|
"step": 18650
|
|
},
|
|
{
|
|
"entropy": 5.754067516326904,
|
|
"epoch": 1.5672757823986556,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047560953276795756,
|
|
"loss": 5.4493,
|
|
"mean_token_accuracy": 0.1675298720598221,
|
|
"num_tokens": 34406278.0,
|
|
"step": 18655
|
|
},
|
|
{
|
|
"entropy": 5.7338409423828125,
|
|
"epoch": 1.5676958622138206,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 0.00047559597035698014,
|
|
"loss": 5.4153,
|
|
"mean_token_accuracy": 0.16818469762802124,
|
|
"num_tokens": 34415404.0,
|
|
"step": 18660
|
|
},
|
|
{
|
|
"entropy": 5.689050960540771,
|
|
"epoch": 1.5681159420289856,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004755824043925485,
|
|
"loss": 5.4658,
|
|
"mean_token_accuracy": 0.17355379313230515,
|
|
"num_tokens": 34425036.0,
|
|
"step": 18665
|
|
},
|
|
{
|
|
"entropy": 5.6759899139404295,
|
|
"epoch": 1.5685360218441504,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004755688348749027,
|
|
"loss": 5.3721,
|
|
"mean_token_accuracy": 0.16852474361658096,
|
|
"num_tokens": 34434246.0,
|
|
"step": 18670
|
|
},
|
|
{
|
|
"entropy": 5.6307172775268555,
|
|
"epoch": 1.5689561016593152,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004755552618042834,
|
|
"loss": 5.3735,
|
|
"mean_token_accuracy": 0.1715213656425476,
|
|
"num_tokens": 34444189.0,
|
|
"step": 18675
|
|
},
|
|
{
|
|
"entropy": 5.694113779067993,
|
|
"epoch": 1.56937618147448,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004755416851809308,
|
|
"loss": 5.3705,
|
|
"mean_token_accuracy": 0.17202963531017304,
|
|
"num_tokens": 34453727.0,
|
|
"step": 18680
|
|
},
|
|
{
|
|
"entropy": 5.555972719192505,
|
|
"epoch": 1.569796261289645,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004755281050050856,
|
|
"loss": 5.3687,
|
|
"mean_token_accuracy": 0.16777419596910476,
|
|
"num_tokens": 34462835.0,
|
|
"step": 18685
|
|
},
|
|
{
|
|
"entropy": 5.644486761093139,
|
|
"epoch": 1.57021634110481,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004755145212769882,
|
|
"loss": 5.4169,
|
|
"mean_token_accuracy": 0.16981538236141205,
|
|
"num_tokens": 34471642.0,
|
|
"step": 18690
|
|
},
|
|
{
|
|
"entropy": 5.722853660583496,
|
|
"epoch": 1.5706364209199748,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047550093399687936,
|
|
"loss": 5.3804,
|
|
"mean_token_accuracy": 0.16804203689098357,
|
|
"num_tokens": 34480468.0,
|
|
"step": 18695
|
|
},
|
|
{
|
|
"entropy": 5.763798809051513,
|
|
"epoch": 1.5710565007351396,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004754873431649997,
|
|
"loss": 5.4243,
|
|
"mean_token_accuracy": 0.16598083227872848,
|
|
"num_tokens": 34490299.0,
|
|
"step": 18700
|
|
},
|
|
{
|
|
"entropy": 5.668231630325318,
|
|
"epoch": 1.5714765805503046,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047547374878159003,
|
|
"loss": 5.4338,
|
|
"mean_token_accuracy": 0.1664573848247528,
|
|
"num_tokens": 34498831.0,
|
|
"step": 18705
|
|
},
|
|
{
|
|
"entropy": 5.675115299224854,
|
|
"epoch": 1.5718966603654696,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004754601508468911,
|
|
"loss": 5.4249,
|
|
"mean_token_accuracy": 0.16958544850349427,
|
|
"num_tokens": 34508048.0,
|
|
"step": 18710
|
|
},
|
|
{
|
|
"entropy": 5.676489400863647,
|
|
"epoch": 1.5723167401806344,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004754465493611438,
|
|
"loss": 5.5091,
|
|
"mean_token_accuracy": 0.16318027675151825,
|
|
"num_tokens": 34517070.0,
|
|
"step": 18715
|
|
},
|
|
{
|
|
"entropy": 5.6339551448822025,
|
|
"epoch": 1.5727368199957992,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.00047543294432458904,
|
|
"loss": 5.2937,
|
|
"mean_token_accuracy": 0.1759590983390808,
|
|
"num_tokens": 34525934.0,
|
|
"step": 18720
|
|
},
|
|
{
|
|
"entropy": 5.744489860534668,
|
|
"epoch": 1.573156899810964,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000475419335737468,
|
|
"loss": 5.5149,
|
|
"mean_token_accuracy": 0.16737214624881744,
|
|
"num_tokens": 34534222.0,
|
|
"step": 18725
|
|
},
|
|
{
|
|
"entropy": 5.750201940536499,
|
|
"epoch": 1.573576979626129,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.00047540572360002157,
|
|
"loss": 5.4944,
|
|
"mean_token_accuracy": 0.16553839445114135,
|
|
"num_tokens": 34543291.0,
|
|
"step": 18730
|
|
},
|
|
{
|
|
"entropy": 5.757966184616089,
|
|
"epoch": 1.573997059441294,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00047539210791249095,
|
|
"loss": 5.363,
|
|
"mean_token_accuracy": 0.17250452637672425,
|
|
"num_tokens": 34552383.0,
|
|
"step": 18735
|
|
},
|
|
{
|
|
"entropy": 5.687971353530884,
|
|
"epoch": 1.5744171392564588,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004753784886751173,
|
|
"loss": 5.3368,
|
|
"mean_token_accuracy": 0.1798310786485672,
|
|
"num_tokens": 34560311.0,
|
|
"step": 18740
|
|
},
|
|
{
|
|
"entropy": 5.587876176834106,
|
|
"epoch": 1.5748372190716236,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004753648658881419,
|
|
"loss": 5.3912,
|
|
"mean_token_accuracy": 0.17629951983690262,
|
|
"num_tokens": 34569903.0,
|
|
"step": 18745
|
|
},
|
|
{
|
|
"entropy": 5.642320442199707,
|
|
"epoch": 1.5752572988867883,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047535123955180607,
|
|
"loss": 5.4037,
|
|
"mean_token_accuracy": 0.16801706254482268,
|
|
"num_tokens": 34579735.0,
|
|
"step": 18750
|
|
},
|
|
{
|
|
"entropy": 5.7766814708709715,
|
|
"epoch": 1.5756773787019533,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004753376096663512,
|
|
"loss": 5.4316,
|
|
"mean_token_accuracy": 0.16776171922683716,
|
|
"num_tokens": 34589105.0,
|
|
"step": 18755
|
|
},
|
|
{
|
|
"entropy": 5.625161170959473,
|
|
"epoch": 1.5760974585171184,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00047532397623201877,
|
|
"loss": 5.3705,
|
|
"mean_token_accuracy": 0.17527176439762115,
|
|
"num_tokens": 34597883.0,
|
|
"step": 18760
|
|
},
|
|
{
|
|
"entropy": 5.675880050659179,
|
|
"epoch": 1.5765175383322831,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047531033924905024,
|
|
"loss": 5.3506,
|
|
"mean_token_accuracy": 0.17240157425403596,
|
|
"num_tokens": 34606666.0,
|
|
"step": 18765
|
|
},
|
|
{
|
|
"entropy": 5.746255779266358,
|
|
"epoch": 1.576937618147448,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004752966987176873,
|
|
"loss": 5.4819,
|
|
"mean_token_accuracy": 0.16786147505044938,
|
|
"num_tokens": 34616547.0,
|
|
"step": 18770
|
|
},
|
|
{
|
|
"entropy": 5.713323879241943,
|
|
"epoch": 1.577357697962613,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004752830546381713,
|
|
"loss": 5.4497,
|
|
"mean_token_accuracy": 0.16839058697223663,
|
|
"num_tokens": 34625679.0,
|
|
"step": 18775
|
|
},
|
|
{
|
|
"entropy": 5.610950660705567,
|
|
"epoch": 1.5777777777777777,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004752694070107442,
|
|
"loss": 5.3817,
|
|
"mean_token_accuracy": 0.1739755392074585,
|
|
"num_tokens": 34635633.0,
|
|
"step": 18780
|
|
},
|
|
{
|
|
"entropy": 5.7086036682128904,
|
|
"epoch": 1.5781978575929427,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004752557558356476,
|
|
"loss": 5.4156,
|
|
"mean_token_accuracy": 0.17332434356212617,
|
|
"num_tokens": 34645206.0,
|
|
"step": 18785
|
|
},
|
|
{
|
|
"entropy": 5.656038665771485,
|
|
"epoch": 1.5786179374081075,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004752421011131234,
|
|
"loss": 5.4342,
|
|
"mean_token_accuracy": 0.16186445355415344,
|
|
"num_tokens": 34653884.0,
|
|
"step": 18790
|
|
},
|
|
{
|
|
"entropy": 5.606300926208496,
|
|
"epoch": 1.5790380172232723,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00047522844284341364,
|
|
"loss": 5.2898,
|
|
"mean_token_accuracy": 0.17618423253297805,
|
|
"num_tokens": 34662170.0,
|
|
"step": 18795
|
|
},
|
|
{
|
|
"entropy": 5.66893949508667,
|
|
"epoch": 1.5794580970384373,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004752147810267601,
|
|
"loss": 5.4433,
|
|
"mean_token_accuracy": 0.16510264128446578,
|
|
"num_tokens": 34672548.0,
|
|
"step": 18800
|
|
},
|
|
{
|
|
"entropy": 5.760573959350586,
|
|
"epoch": 1.5798781768536023,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047520111566340465,
|
|
"loss": 5.4323,
|
|
"mean_token_accuracy": 0.1679047629237175,
|
|
"num_tokens": 34680972.0,
|
|
"step": 18805
|
|
},
|
|
{
|
|
"entropy": 5.643776369094849,
|
|
"epoch": 1.580298256668767,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00047518744675358965,
|
|
"loss": 5.3027,
|
|
"mean_token_accuracy": 0.17184915244579316,
|
|
"num_tokens": 34689589.0,
|
|
"step": 18810
|
|
},
|
|
{
|
|
"entropy": 5.626055669784546,
|
|
"epoch": 1.580718336483932,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004751737742975571,
|
|
"loss": 5.338,
|
|
"mean_token_accuracy": 0.17203721702098845,
|
|
"num_tokens": 34698747.0,
|
|
"step": 18815
|
|
},
|
|
{
|
|
"entropy": 5.675599765777588,
|
|
"epoch": 1.5811384162990967,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00047516009829554913,
|
|
"loss": 5.4003,
|
|
"mean_token_accuracy": 0.16775297075510026,
|
|
"num_tokens": 34707502.0,
|
|
"step": 18820
|
|
},
|
|
{
|
|
"entropy": 5.608147096633911,
|
|
"epoch": 1.5815584961142617,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00047514641874780815,
|
|
"loss": 5.3289,
|
|
"mean_token_accuracy": 0.17193017303943633,
|
|
"num_tokens": 34715879.0,
|
|
"step": 18825
|
|
},
|
|
{
|
|
"entropy": 5.649180126190186,
|
|
"epoch": 1.5819785759294267,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00047513273565457644,
|
|
"loss": 5.5108,
|
|
"mean_token_accuracy": 0.16794274374842644,
|
|
"num_tokens": 34726090.0,
|
|
"step": 18830
|
|
},
|
|
{
|
|
"entropy": 5.7852592945098875,
|
|
"epoch": 1.5823986557445915,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004751190490160964,
|
|
"loss": 5.4755,
|
|
"mean_token_accuracy": 0.16427757740020751,
|
|
"num_tokens": 34736014.0,
|
|
"step": 18835
|
|
},
|
|
{
|
|
"entropy": 5.749915409088135,
|
|
"epoch": 1.5828187355597563,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047510535883261035,
|
|
"loss": 5.415,
|
|
"mean_token_accuracy": 0.16692599207162856,
|
|
"num_tokens": 34745648.0,
|
|
"step": 18840
|
|
},
|
|
{
|
|
"entropy": 5.649198770523071,
|
|
"epoch": 1.5832388153749213,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.000475091665104361,
|
|
"loss": 5.3967,
|
|
"mean_token_accuracy": 0.17258985787630082,
|
|
"num_tokens": 34753908.0,
|
|
"step": 18845
|
|
},
|
|
{
|
|
"entropy": 5.645108318328857,
|
|
"epoch": 1.583658895190086,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004750779678315908,
|
|
"loss": 5.2509,
|
|
"mean_token_accuracy": 0.17971468716859818,
|
|
"num_tokens": 34762303.0,
|
|
"step": 18850
|
|
},
|
|
{
|
|
"entropy": 5.632398986816407,
|
|
"epoch": 1.584078975005251,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004750642670145424,
|
|
"loss": 5.4294,
|
|
"mean_token_accuracy": 0.16685875207185746,
|
|
"num_tokens": 34771463.0,
|
|
"step": 18855
|
|
},
|
|
{
|
|
"entropy": 5.779457092285156,
|
|
"epoch": 1.5844990548204159,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004750505626534585,
|
|
"loss": 5.5146,
|
|
"mean_token_accuracy": 0.16541918367147446,
|
|
"num_tokens": 34780704.0,
|
|
"step": 18860
|
|
},
|
|
{
|
|
"entropy": 5.615437173843384,
|
|
"epoch": 1.5849191346355807,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047503685474858194,
|
|
"loss": 5.3305,
|
|
"mean_token_accuracy": 0.1751614198088646,
|
|
"num_tokens": 34790262.0,
|
|
"step": 18865
|
|
},
|
|
{
|
|
"entropy": 5.691679000854492,
|
|
"epoch": 1.5853392144507457,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004750231433001555,
|
|
"loss": 5.3665,
|
|
"mean_token_accuracy": 0.1725798651576042,
|
|
"num_tokens": 34799450.0,
|
|
"step": 18870
|
|
},
|
|
{
|
|
"entropy": 5.732432460784912,
|
|
"epoch": 1.5857592942659107,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004750094283084221,
|
|
"loss": 5.4141,
|
|
"mean_token_accuracy": 0.16283925771713256,
|
|
"num_tokens": 34808220.0,
|
|
"step": 18875
|
|
},
|
|
{
|
|
"entropy": 5.716584873199463,
|
|
"epoch": 1.5861793740810755,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00047499570977362467,
|
|
"loss": 5.4334,
|
|
"mean_token_accuracy": 0.16313114315271376,
|
|
"num_tokens": 34817846.0,
|
|
"step": 18880
|
|
},
|
|
{
|
|
"entropy": 5.688366794586182,
|
|
"epoch": 1.5865994538962402,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00047498198769600617,
|
|
"loss": 5.4526,
|
|
"mean_token_accuracy": 0.16976003497838973,
|
|
"num_tokens": 34826962.0,
|
|
"step": 18885
|
|
},
|
|
{
|
|
"entropy": 5.637577390670776,
|
|
"epoch": 1.587019533711405,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004749682620758097,
|
|
"loss": 5.3876,
|
|
"mean_token_accuracy": 0.1662908226251602,
|
|
"num_tokens": 34837170.0,
|
|
"step": 18890
|
|
},
|
|
{
|
|
"entropy": 5.624025487899781,
|
|
"epoch": 1.58743961352657,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00047495453291327854,
|
|
"loss": 5.3856,
|
|
"mean_token_accuracy": 0.17156262695789337,
|
|
"num_tokens": 34845336.0,
|
|
"step": 18895
|
|
},
|
|
{
|
|
"entropy": 5.641190814971924,
|
|
"epoch": 1.587859693341735,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047494080020865577,
|
|
"loss": 5.3634,
|
|
"mean_token_accuracy": 0.17117148637771606,
|
|
"num_tokens": 34854613.0,
|
|
"step": 18900
|
|
},
|
|
{
|
|
"entropy": 5.714927101135254,
|
|
"epoch": 1.5882797731568998,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004749270639621846,
|
|
"loss": 5.4495,
|
|
"mean_token_accuracy": 0.16892678290605545,
|
|
"num_tokens": 34864254.0,
|
|
"step": 18905
|
|
},
|
|
{
|
|
"entropy": 5.7336501121521,
|
|
"epoch": 1.5886998529720646,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004749133241741085,
|
|
"loss": 5.4825,
|
|
"mean_token_accuracy": 0.1654273435473442,
|
|
"num_tokens": 34874380.0,
|
|
"step": 18910
|
|
},
|
|
{
|
|
"entropy": 5.716697835922242,
|
|
"epoch": 1.5891199327872296,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004748995808446708,
|
|
"loss": 5.4443,
|
|
"mean_token_accuracy": 0.16573767066001893,
|
|
"num_tokens": 34883688.0,
|
|
"step": 18915
|
|
},
|
|
{
|
|
"entropy": 5.658730459213257,
|
|
"epoch": 1.5895400126023944,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047488583397411495,
|
|
"loss": 5.3102,
|
|
"mean_token_accuracy": 0.17554232925176622,
|
|
"num_tokens": 34892831.0,
|
|
"step": 18920
|
|
},
|
|
{
|
|
"entropy": 5.709734773635864,
|
|
"epoch": 1.5899600924175594,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047487208356268454,
|
|
"loss": 5.4004,
|
|
"mean_token_accuracy": 0.17941274642944335,
|
|
"num_tokens": 34901517.0,
|
|
"step": 18925
|
|
},
|
|
{
|
|
"entropy": 5.688491916656494,
|
|
"epoch": 1.5903801722327242,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047485832961062296,
|
|
"loss": 5.4002,
|
|
"mean_token_accuracy": 0.17023382037878038,
|
|
"num_tokens": 34910765.0,
|
|
"step": 18930
|
|
},
|
|
{
|
|
"entropy": 5.723994779586792,
|
|
"epoch": 1.590800252047889,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00047484457211817405,
|
|
"loss": 5.4441,
|
|
"mean_token_accuracy": 0.16038562953472138,
|
|
"num_tokens": 34919799.0,
|
|
"step": 18935
|
|
},
|
|
{
|
|
"entropy": 5.630226898193359,
|
|
"epoch": 1.591220331863054,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047483081108558143,
|
|
"loss": 5.3115,
|
|
"mean_token_accuracy": 0.17336263954639436,
|
|
"num_tokens": 34928199.0,
|
|
"step": 18940
|
|
},
|
|
{
|
|
"entropy": 5.682058715820313,
|
|
"epoch": 1.591640411678219,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000474817046513089,
|
|
"loss": 5.4412,
|
|
"mean_token_accuracy": 0.16989699453115464,
|
|
"num_tokens": 34937751.0,
|
|
"step": 18945
|
|
},
|
|
{
|
|
"entropy": 5.724739217758179,
|
|
"epoch": 1.5920604914933838,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004748032784009403,
|
|
"loss": 5.3858,
|
|
"mean_token_accuracy": 0.17437688410282134,
|
|
"num_tokens": 34946052.0,
|
|
"step": 18950
|
|
},
|
|
{
|
|
"entropy": 5.651232576370239,
|
|
"epoch": 1.5924805713085486,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004747895067493796,
|
|
"loss": 5.3793,
|
|
"mean_token_accuracy": 0.1674926221370697,
|
|
"num_tokens": 34954932.0,
|
|
"step": 18955
|
|
},
|
|
{
|
|
"entropy": 5.675562763214112,
|
|
"epoch": 1.5929006511237134,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004747757315586505,
|
|
"loss": 5.3688,
|
|
"mean_token_accuracy": 0.17305743098258972,
|
|
"num_tokens": 34963581.0,
|
|
"step": 18960
|
|
},
|
|
{
|
|
"entropy": 5.539657783508301,
|
|
"epoch": 1.5933207309388784,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047476195282899727,
|
|
"loss": 5.1861,
|
|
"mean_token_accuracy": 0.18181020617485047,
|
|
"num_tokens": 34972844.0,
|
|
"step": 18965
|
|
},
|
|
{
|
|
"entropy": 5.623536205291748,
|
|
"epoch": 1.5937408107540434,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00047474817056066383,
|
|
"loss": 5.396,
|
|
"mean_token_accuracy": 0.176412869989872,
|
|
"num_tokens": 34981998.0,
|
|
"step": 18970
|
|
},
|
|
{
|
|
"entropy": 5.595731449127197,
|
|
"epoch": 1.5941608905692082,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047473438475389453,
|
|
"loss": 5.3263,
|
|
"mean_token_accuracy": 0.17470391392707824,
|
|
"num_tokens": 34990552.0,
|
|
"step": 18975
|
|
},
|
|
{
|
|
"entropy": 5.687963628768921,
|
|
"epoch": 1.594580970384373,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004747205954089333,
|
|
"loss": 5.3401,
|
|
"mean_token_accuracy": 0.17572322934865953,
|
|
"num_tokens": 35000259.0,
|
|
"step": 18980
|
|
},
|
|
{
|
|
"entropy": 5.716721391677856,
|
|
"epoch": 1.5950010501995378,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004747068025260247,
|
|
"loss": 5.4253,
|
|
"mean_token_accuracy": 0.16249436065554618,
|
|
"num_tokens": 35009592.0,
|
|
"step": 18985
|
|
},
|
|
{
|
|
"entropy": 5.700528287887574,
|
|
"epoch": 1.5954211300147028,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004746930061054129,
|
|
"loss": 5.4772,
|
|
"mean_token_accuracy": 0.15898309648036957,
|
|
"num_tokens": 35019356.0,
|
|
"step": 18990
|
|
},
|
|
{
|
|
"entropy": 5.6559325695037845,
|
|
"epoch": 1.5958412098298678,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00047467920614734224,
|
|
"loss": 5.3952,
|
|
"mean_token_accuracy": 0.17310373932123185,
|
|
"num_tokens": 35028764.0,
|
|
"step": 18995
|
|
},
|
|
{
|
|
"entropy": 5.6906005859375,
|
|
"epoch": 1.5962612896450326,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004746654026520573,
|
|
"loss": 5.4045,
|
|
"mean_token_accuracy": 0.16763416677713394,
|
|
"num_tokens": 35037903.0,
|
|
"step": 19000
|
|
},
|
|
{
|
|
"entropy": 5.642781209945679,
|
|
"epoch": 1.5966813694601973,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004746515956198026,
|
|
"loss": 5.3038,
|
|
"mean_token_accuracy": 0.17678880393505098,
|
|
"num_tokens": 35046326.0,
|
|
"step": 19005
|
|
},
|
|
{
|
|
"entropy": 5.741660451889038,
|
|
"epoch": 1.5971014492753624,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00047463778505082266,
|
|
"loss": 5.5384,
|
|
"mean_token_accuracy": 0.16487176418304444,
|
|
"num_tokens": 35055551.0,
|
|
"step": 19010
|
|
},
|
|
{
|
|
"entropy": 5.641852474212646,
|
|
"epoch": 1.5975215290905274,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004746239709453621,
|
|
"loss": 5.3079,
|
|
"mean_token_accuracy": 0.18089368045330048,
|
|
"num_tokens": 35065595.0,
|
|
"step": 19015
|
|
},
|
|
{
|
|
"entropy": 5.710475492477417,
|
|
"epoch": 1.5979416089056921,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004746101533036658,
|
|
"loss": 5.4167,
|
|
"mean_token_accuracy": 0.16984072029590608,
|
|
"num_tokens": 35075097.0,
|
|
"step": 19020
|
|
},
|
|
{
|
|
"entropy": 5.825159311294556,
|
|
"epoch": 1.598361688720857,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00047459633212597834,
|
|
"loss": 5.5007,
|
|
"mean_token_accuracy": 0.16182542145252227,
|
|
"num_tokens": 35084092.0,
|
|
"step": 19025
|
|
},
|
|
{
|
|
"entropy": 5.685335683822632,
|
|
"epoch": 1.5987817685360217,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004745825074125447,
|
|
"loss": 5.3897,
|
|
"mean_token_accuracy": 0.16710626929998398,
|
|
"num_tokens": 35093007.0,
|
|
"step": 19030
|
|
},
|
|
{
|
|
"entropy": 5.754900789260864,
|
|
"epoch": 1.5992018483511867,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004745686791636097,
|
|
"loss": 5.4559,
|
|
"mean_token_accuracy": 0.16395678967237473,
|
|
"num_tokens": 35103094.0,
|
|
"step": 19035
|
|
},
|
|
{
|
|
"entropy": 5.639309453964233,
|
|
"epoch": 1.5996219281663517,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047455484737941823,
|
|
"loss": 5.3045,
|
|
"mean_token_accuracy": 0.17383471876382828,
|
|
"num_tokens": 35112561.0,
|
|
"step": 19040
|
|
},
|
|
{
|
|
"entropy": 5.610976266860962,
|
|
"epoch": 1.6000420079815165,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004745410120602155,
|
|
"loss": 5.3837,
|
|
"mean_token_accuracy": 0.16612301766872406,
|
|
"num_tokens": 35121718.0,
|
|
"step": 19045
|
|
},
|
|
{
|
|
"entropy": 5.7062891006469725,
|
|
"epoch": 1.6004620877966813,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00047452717320624647,
|
|
"loss": 5.344,
|
|
"mean_token_accuracy": 0.18142815828323364,
|
|
"num_tokens": 35130073.0,
|
|
"step": 19050
|
|
},
|
|
{
|
|
"entropy": 5.670109796524048,
|
|
"epoch": 1.600882167611846,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004745133308177562,
|
|
"loss": 5.3913,
|
|
"mean_token_accuracy": 0.16597676426172256,
|
|
"num_tokens": 35138876.0,
|
|
"step": 19055
|
|
},
|
|
{
|
|
"entropy": 5.669570541381836,
|
|
"epoch": 1.601302247427011,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00047449948489499007,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.1685373529791832,
|
|
"num_tokens": 35147750.0,
|
|
"step": 19060
|
|
},
|
|
{
|
|
"entropy": 5.678817701339722,
|
|
"epoch": 1.6017223272421761,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00047448563543819335,
|
|
"loss": 5.4017,
|
|
"mean_token_accuracy": 0.17186661213636398,
|
|
"num_tokens": 35156955.0,
|
|
"step": 19065
|
|
},
|
|
{
|
|
"entropy": 5.661539745330811,
|
|
"epoch": 1.602142407057341,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004744717824476112,
|
|
"loss": 5.4264,
|
|
"mean_token_accuracy": 0.16969927847385408,
|
|
"num_tokens": 35166542.0,
|
|
"step": 19070
|
|
},
|
|
{
|
|
"entropy": 5.707697916030884,
|
|
"epoch": 1.6025624868725057,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00047445792592348926,
|
|
"loss": 5.3853,
|
|
"mean_token_accuracy": 0.16943657100200654,
|
|
"num_tokens": 35175258.0,
|
|
"step": 19075
|
|
},
|
|
{
|
|
"entropy": 5.701454114913941,
|
|
"epoch": 1.6029825666876707,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004744440658660729,
|
|
"loss": 5.3865,
|
|
"mean_token_accuracy": 0.16605425924062728,
|
|
"num_tokens": 35184970.0,
|
|
"step": 19080
|
|
},
|
|
{
|
|
"entropy": 5.687052440643311,
|
|
"epoch": 1.6034026465028357,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004744302022756075,
|
|
"loss": 5.3784,
|
|
"mean_token_accuracy": 0.16496190279722214,
|
|
"num_tokens": 35193948.0,
|
|
"step": 19085
|
|
},
|
|
{
|
|
"entropy": 5.577232599258423,
|
|
"epoch": 1.6038227263180005,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00047441633515233874,
|
|
"loss": 5.3198,
|
|
"mean_token_accuracy": 0.17375623136758805,
|
|
"num_tokens": 35203792.0,
|
|
"step": 19090
|
|
},
|
|
{
|
|
"entropy": 5.670841121673584,
|
|
"epoch": 1.6042428061331653,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004744024644965123,
|
|
"loss": 5.4944,
|
|
"mean_token_accuracy": 0.16568351536989212,
|
|
"num_tokens": 35212684.0,
|
|
"step": 19095
|
|
},
|
|
{
|
|
"entropy": 5.673999786376953,
|
|
"epoch": 1.60466288594833,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047438859030837397,
|
|
"loss": 5.2946,
|
|
"mean_token_accuracy": 0.17858032286167144,
|
|
"num_tokens": 35220830.0,
|
|
"step": 19100
|
|
},
|
|
{
|
|
"entropy": 5.708344316482544,
|
|
"epoch": 1.605082965763495,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047437471258816936,
|
|
"loss": 5.3833,
|
|
"mean_token_accuracy": 0.16468634456396103,
|
|
"num_tokens": 35230171.0,
|
|
"step": 19105
|
|
},
|
|
{
|
|
"entropy": 5.619188070297241,
|
|
"epoch": 1.60550304557866,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047436083133614446,
|
|
"loss": 5.3073,
|
|
"mean_token_accuracy": 0.17591052502393723,
|
|
"num_tokens": 35239022.0,
|
|
"step": 19110
|
|
},
|
|
{
|
|
"entropy": 5.629873466491699,
|
|
"epoch": 1.6059231253938249,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047434694655254495,
|
|
"loss": 5.3297,
|
|
"mean_token_accuracy": 0.16770700961351395,
|
|
"num_tokens": 35247564.0,
|
|
"step": 19115
|
|
},
|
|
{
|
|
"entropy": 5.6350812911987305,
|
|
"epoch": 1.6063432052089897,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000474333058237617,
|
|
"loss": 5.3529,
|
|
"mean_token_accuracy": 0.16446800380945206,
|
|
"num_tokens": 35256175.0,
|
|
"step": 19120
|
|
},
|
|
{
|
|
"entropy": 5.780952882766724,
|
|
"epoch": 1.6067632850241544,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047431916639160656,
|
|
"loss": 5.5043,
|
|
"mean_token_accuracy": 0.1661346063017845,
|
|
"num_tokens": 35265278.0,
|
|
"step": 19125
|
|
},
|
|
{
|
|
"entropy": 5.603296756744385,
|
|
"epoch": 1.6071833648393195,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004743052710147598,
|
|
"loss": 5.2283,
|
|
"mean_token_accuracy": 0.1780938133597374,
|
|
"num_tokens": 35274715.0,
|
|
"step": 19130
|
|
},
|
|
{
|
|
"entropy": 5.574432277679444,
|
|
"epoch": 1.6076034446544845,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00047429137210732266,
|
|
"loss": 5.3431,
|
|
"mean_token_accuracy": 0.1689825624227524,
|
|
"num_tokens": 35285450.0,
|
|
"step": 19135
|
|
},
|
|
{
|
|
"entropy": 5.659537506103516,
|
|
"epoch": 1.6080235244696492,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004742774696695415,
|
|
"loss": 5.3553,
|
|
"mean_token_accuracy": 0.1621303752064705,
|
|
"num_tokens": 35294531.0,
|
|
"step": 19140
|
|
},
|
|
{
|
|
"entropy": 5.693420028686523,
|
|
"epoch": 1.608443604284814,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047426356370166266,
|
|
"loss": 5.4104,
|
|
"mean_token_accuracy": 0.16336591690778732,
|
|
"num_tokens": 35303749.0,
|
|
"step": 19145
|
|
},
|
|
{
|
|
"entropy": 5.59863772392273,
|
|
"epoch": 1.608863684099979,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004742496542039324,
|
|
"loss": 5.3695,
|
|
"mean_token_accuracy": 0.16599306017160415,
|
|
"num_tokens": 35312994.0,
|
|
"step": 19150
|
|
},
|
|
{
|
|
"entropy": 5.656160926818847,
|
|
"epoch": 1.6092837639151438,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047423574117659703,
|
|
"loss": 5.3488,
|
|
"mean_token_accuracy": 0.1693723350763321,
|
|
"num_tokens": 35322533.0,
|
|
"step": 19155
|
|
},
|
|
{
|
|
"entropy": 5.681179428100586,
|
|
"epoch": 1.6097038437303088,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.00047422182461990316,
|
|
"loss": 5.3872,
|
|
"mean_token_accuracy": 0.1734430029988289,
|
|
"num_tokens": 35331872.0,
|
|
"step": 19160
|
|
},
|
|
{
|
|
"entropy": 5.643349313735962,
|
|
"epoch": 1.6101239235454736,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00047420790453409724,
|
|
"loss": 5.4206,
|
|
"mean_token_accuracy": 0.16745028495788575,
|
|
"num_tokens": 35341517.0,
|
|
"step": 19165
|
|
},
|
|
{
|
|
"entropy": 5.632366943359375,
|
|
"epoch": 1.6105440033606384,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004741939809194258,
|
|
"loss": 5.3309,
|
|
"mean_token_accuracy": 0.176885287463665,
|
|
"num_tokens": 35350291.0,
|
|
"step": 19170
|
|
},
|
|
{
|
|
"entropy": 5.727736234664917,
|
|
"epoch": 1.6109640831758034,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047418005377613566,
|
|
"loss": 5.499,
|
|
"mean_token_accuracy": 0.1620399162173271,
|
|
"num_tokens": 35360711.0,
|
|
"step": 19175
|
|
},
|
|
{
|
|
"entropy": 5.703640460968018,
|
|
"epoch": 1.6113841629909684,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004741661231044733,
|
|
"loss": 5.3995,
|
|
"mean_token_accuracy": 0.1704120382666588,
|
|
"num_tokens": 35370069.0,
|
|
"step": 19180
|
|
},
|
|
{
|
|
"entropy": 5.749680423736573,
|
|
"epoch": 1.6118042428061332,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00047415218890468577,
|
|
"loss": 5.3856,
|
|
"mean_token_accuracy": 0.18042093962430955,
|
|
"num_tokens": 35380389.0,
|
|
"step": 19185
|
|
},
|
|
{
|
|
"entropy": 5.660278797149658,
|
|
"epoch": 1.612224322621298,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004741382511770197,
|
|
"loss": 5.3838,
|
|
"mean_token_accuracy": 0.17036385387182235,
|
|
"num_tokens": 35389420.0,
|
|
"step": 19190
|
|
},
|
|
{
|
|
"entropy": 5.662668371200562,
|
|
"epoch": 1.6126444024364628,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00047412430992172205,
|
|
"loss": 5.4823,
|
|
"mean_token_accuracy": 0.15827725529670716,
|
|
"num_tokens": 35399418.0,
|
|
"step": 19195
|
|
},
|
|
{
|
|
"entropy": 5.634368419647217,
|
|
"epoch": 1.6130644822516278,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00047411036513903974,
|
|
"loss": 5.3616,
|
|
"mean_token_accuracy": 0.17389402389526368,
|
|
"num_tokens": 35408717.0,
|
|
"step": 19200
|
|
},
|
|
{
|
|
"entropy": 5.6884690284729,
|
|
"epoch": 1.6134845620667928,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00047409641682921987,
|
|
"loss": 5.3188,
|
|
"mean_token_accuracy": 0.18027044236660003,
|
|
"num_tokens": 35417118.0,
|
|
"step": 19205
|
|
},
|
|
{
|
|
"entropy": 5.686248636245727,
|
|
"epoch": 1.6139046418819576,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004740824649925096,
|
|
"loss": 5.4141,
|
|
"mean_token_accuracy": 0.1654793232679367,
|
|
"num_tokens": 35425526.0,
|
|
"step": 19210
|
|
},
|
|
{
|
|
"entropy": 5.595103168487549,
|
|
"epoch": 1.6143247216971224,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004740685096291559,
|
|
"loss": 5.4122,
|
|
"mean_token_accuracy": 0.16647179573774337,
|
|
"num_tokens": 35434932.0,
|
|
"step": 19215
|
|
},
|
|
{
|
|
"entropy": 5.725376129150391,
|
|
"epoch": 1.6147448015122874,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00047405455073940597,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.16955055445432662,
|
|
"num_tokens": 35443909.0,
|
|
"step": 19220
|
|
},
|
|
{
|
|
"entropy": 5.752730035781861,
|
|
"epoch": 1.6151648813274522,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004740405883235072,
|
|
"loss": 5.4143,
|
|
"mean_token_accuracy": 0.17224101722240448,
|
|
"num_tokens": 35454082.0,
|
|
"step": 19225
|
|
},
|
|
{
|
|
"entropy": 5.780597686767578,
|
|
"epoch": 1.6155849611426172,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047402662238170694,
|
|
"loss": 5.4702,
|
|
"mean_token_accuracy": 0.16434868276119233,
|
|
"num_tokens": 35464547.0,
|
|
"step": 19230
|
|
},
|
|
{
|
|
"entropy": 5.657827091217041,
|
|
"epoch": 1.616005040957782,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004740126529142526,
|
|
"loss": 5.3376,
|
|
"mean_token_accuracy": 0.17347298115491866,
|
|
"num_tokens": 35473310.0,
|
|
"step": 19235
|
|
},
|
|
{
|
|
"entropy": 5.602123212814331,
|
|
"epoch": 1.6164251207729468,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004739986799213915,
|
|
"loss": 5.4651,
|
|
"mean_token_accuracy": 0.1707776516675949,
|
|
"num_tokens": 35483502.0,
|
|
"step": 19240
|
|
},
|
|
{
|
|
"entropy": 5.694213247299194,
|
|
"epoch": 1.6168452005881118,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004739847034033713,
|
|
"loss": 5.4299,
|
|
"mean_token_accuracy": 0.16592200696468354,
|
|
"num_tokens": 35493063.0,
|
|
"step": 19245
|
|
},
|
|
{
|
|
"entropy": 5.674246883392334,
|
|
"epoch": 1.6172652804032768,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00047397072336043957,
|
|
"loss": 5.3847,
|
|
"mean_token_accuracy": 0.1654440939426422,
|
|
"num_tokens": 35501829.0,
|
|
"step": 19250
|
|
},
|
|
{
|
|
"entropy": 5.7208233833312985,
|
|
"epoch": 1.6176853602184416,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00047395673979284383,
|
|
"loss": 5.4025,
|
|
"mean_token_accuracy": 0.16252227872610092,
|
|
"num_tokens": 35510411.0,
|
|
"step": 19255
|
|
},
|
|
{
|
|
"entropy": 5.695710945129394,
|
|
"epoch": 1.6181054400336063,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.000473942752700832,
|
|
"loss": 5.4071,
|
|
"mean_token_accuracy": 0.168272402882576,
|
|
"num_tokens": 35519571.0,
|
|
"step": 19260
|
|
},
|
|
{
|
|
"entropy": 5.633262681961059,
|
|
"epoch": 1.6185255198487711,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00047392876208465166,
|
|
"loss": 5.3537,
|
|
"mean_token_accuracy": 0.1690814658999443,
|
|
"num_tokens": 35527306.0,
|
|
"step": 19265
|
|
},
|
|
{
|
|
"entropy": 5.6343008518219,
|
|
"epoch": 1.6189455996639361,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004739147679445508,
|
|
"loss": 5.3577,
|
|
"mean_token_accuracy": 0.1658302888274193,
|
|
"num_tokens": 35536126.0,
|
|
"step": 19270
|
|
},
|
|
{
|
|
"entropy": 5.644708919525146,
|
|
"epoch": 1.6193656794791011,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004739007702807773,
|
|
"loss": 5.4217,
|
|
"mean_token_accuracy": 0.16885081082582473,
|
|
"num_tokens": 35545593.0,
|
|
"step": 19275
|
|
},
|
|
{
|
|
"entropy": 5.625165557861328,
|
|
"epoch": 1.619785759294266,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047388676909357894,
|
|
"loss": 5.3437,
|
|
"mean_token_accuracy": 0.1670317158102989,
|
|
"num_tokens": 35554780.0,
|
|
"step": 19280
|
|
},
|
|
{
|
|
"entropy": 5.674202489852905,
|
|
"epoch": 1.6202058391094307,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047387276438320394,
|
|
"loss": 5.3462,
|
|
"mean_token_accuracy": 0.17734202444553376,
|
|
"num_tokens": 35562982.0,
|
|
"step": 19285
|
|
},
|
|
{
|
|
"entropy": 5.682125806808472,
|
|
"epoch": 1.6206259189245955,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004738587561499003,
|
|
"loss": 5.464,
|
|
"mean_token_accuracy": 0.16998654305934907,
|
|
"num_tokens": 35571528.0,
|
|
"step": 19290
|
|
},
|
|
{
|
|
"entropy": 5.594412136077881,
|
|
"epoch": 1.6210459987397605,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047384474439391615,
|
|
"loss": 5.2968,
|
|
"mean_token_accuracy": 0.17942917197942734,
|
|
"num_tokens": 35580386.0,
|
|
"step": 19295
|
|
},
|
|
{
|
|
"entropy": 5.609464263916015,
|
|
"epoch": 1.6214660785549255,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004738307291154998,
|
|
"loss": 5.269,
|
|
"mean_token_accuracy": 0.16951826214790344,
|
|
"num_tokens": 35589456.0,
|
|
"step": 19300
|
|
},
|
|
{
|
|
"entropy": 5.646043395996093,
|
|
"epoch": 1.6218861583700903,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004738167103148995,
|
|
"loss": 5.3687,
|
|
"mean_token_accuracy": 0.17219835072755812,
|
|
"num_tokens": 35598116.0,
|
|
"step": 19305
|
|
},
|
|
{
|
|
"entropy": 5.676636123657227,
|
|
"epoch": 1.622306238185255,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047380268799236355,
|
|
"loss": 5.3641,
|
|
"mean_token_accuracy": 0.16999810189008713,
|
|
"num_tokens": 35606481.0,
|
|
"step": 19310
|
|
},
|
|
{
|
|
"entropy": 5.633781385421753,
|
|
"epoch": 1.62272631800042,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00047378866214814024,
|
|
"loss": 5.3475,
|
|
"mean_token_accuracy": 0.16768400371074677,
|
|
"num_tokens": 35615517.0,
|
|
"step": 19315
|
|
},
|
|
{
|
|
"entropy": 5.662630224227906,
|
|
"epoch": 1.6231463978155851,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047377463278247827,
|
|
"loss": 5.4018,
|
|
"mean_token_accuracy": 0.1614094376564026,
|
|
"num_tokens": 35625100.0,
|
|
"step": 19320
|
|
},
|
|
{
|
|
"entropy": 5.698197555541992,
|
|
"epoch": 1.62356647763075,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000473760599895626,
|
|
"loss": 5.3197,
|
|
"mean_token_accuracy": 0.16777887046337128,
|
|
"num_tokens": 35634572.0,
|
|
"step": 19325
|
|
},
|
|
{
|
|
"entropy": 5.671027040481567,
|
|
"epoch": 1.6239865574459147,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000473746563487832,
|
|
"loss": 5.3585,
|
|
"mean_token_accuracy": 0.1732994943857193,
|
|
"num_tokens": 35643883.0,
|
|
"step": 19330
|
|
},
|
|
{
|
|
"entropy": 5.641132640838623,
|
|
"epoch": 1.6244066372610795,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047373252355934506,
|
|
"loss": 5.4252,
|
|
"mean_token_accuracy": 0.16886914223432542,
|
|
"num_tokens": 35652527.0,
|
|
"step": 19335
|
|
},
|
|
{
|
|
"entropy": 5.691527080535889,
|
|
"epoch": 1.6248267170762445,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047371848011041375,
|
|
"loss": 5.4632,
|
|
"mean_token_accuracy": 0.16798330396413802,
|
|
"num_tokens": 35662436.0,
|
|
"step": 19340
|
|
},
|
|
{
|
|
"entropy": 5.699794816970825,
|
|
"epoch": 1.6252467968914095,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047370443314128687,
|
|
"loss": 5.3483,
|
|
"mean_token_accuracy": 0.17165588736534118,
|
|
"num_tokens": 35672302.0,
|
|
"step": 19345
|
|
},
|
|
{
|
|
"entropy": 5.659704780578613,
|
|
"epoch": 1.6256668767065743,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004736903826522132,
|
|
"loss": 5.4101,
|
|
"mean_token_accuracy": 0.16816721260547637,
|
|
"num_tokens": 35680852.0,
|
|
"step": 19350
|
|
},
|
|
{
|
|
"entropy": 5.6761833190917965,
|
|
"epoch": 1.626086956521739,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004736763286434419,
|
|
"loss": 5.3811,
|
|
"mean_token_accuracy": 0.17145880460739135,
|
|
"num_tokens": 35690159.0,
|
|
"step": 19355
|
|
},
|
|
{
|
|
"entropy": 5.622335624694824,
|
|
"epoch": 1.6265070363369039,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004736622711152216,
|
|
"loss": 5.3144,
|
|
"mean_token_accuracy": 0.17438797056674957,
|
|
"num_tokens": 35699165.0,
|
|
"step": 19360
|
|
},
|
|
{
|
|
"entropy": 5.680206346511841,
|
|
"epoch": 1.6269271161520689,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004736482100678015,
|
|
"loss": 5.379,
|
|
"mean_token_accuracy": 0.17168426364660264,
|
|
"num_tokens": 35708910.0,
|
|
"step": 19365
|
|
},
|
|
{
|
|
"entropy": 5.680268287658691,
|
|
"epoch": 1.6273471959672339,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047363414550143063,
|
|
"loss": 5.4539,
|
|
"mean_token_accuracy": 0.16627233028411864,
|
|
"num_tokens": 35718218.0,
|
|
"step": 19370
|
|
},
|
|
{
|
|
"entropy": 5.661238050460815,
|
|
"epoch": 1.6277672757823987,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00047362007741635816,
|
|
"loss": 5.3692,
|
|
"mean_token_accuracy": 0.17138148248195648,
|
|
"num_tokens": 35727076.0,
|
|
"step": 19375
|
|
},
|
|
{
|
|
"entropy": 5.655786752700806,
|
|
"epoch": 1.6281873555975634,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004736060058128333,
|
|
"loss": 5.4598,
|
|
"mean_token_accuracy": 0.1673205927014351,
|
|
"num_tokens": 35736316.0,
|
|
"step": 19380
|
|
},
|
|
{
|
|
"entropy": 5.689300918579102,
|
|
"epoch": 1.6286074354127285,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047359193069110533,
|
|
"loss": 5.4293,
|
|
"mean_token_accuracy": 0.17298100590705873,
|
|
"num_tokens": 35745747.0,
|
|
"step": 19385
|
|
},
|
|
{
|
|
"entropy": 5.791736125946045,
|
|
"epoch": 1.6290275152278935,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047357785205142354,
|
|
"loss": 5.3922,
|
|
"mean_token_accuracy": 0.17255930006504058,
|
|
"num_tokens": 35754825.0,
|
|
"step": 19390
|
|
},
|
|
{
|
|
"entropy": 5.630894136428833,
|
|
"epoch": 1.6294475950430583,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004735637698940374,
|
|
"loss": 5.3536,
|
|
"mean_token_accuracy": 0.17112387716770172,
|
|
"num_tokens": 35764504.0,
|
|
"step": 19395
|
|
},
|
|
{
|
|
"entropy": 5.721408700942993,
|
|
"epoch": 1.629867674858223,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004735496842191963,
|
|
"loss": 5.4416,
|
|
"mean_token_accuracy": 0.17230593860149385,
|
|
"num_tokens": 35774195.0,
|
|
"step": 19400
|
|
},
|
|
{
|
|
"entropy": 5.609949207305908,
|
|
"epoch": 1.6302877546733878,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047353559502714976,
|
|
"loss": 5.3104,
|
|
"mean_token_accuracy": 0.1747656896710396,
|
|
"num_tokens": 35783721.0,
|
|
"step": 19405
|
|
},
|
|
{
|
|
"entropy": 5.641864967346192,
|
|
"epoch": 1.6307078344885528,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004735215023181474,
|
|
"loss": 5.3991,
|
|
"mean_token_accuracy": 0.16826074570417404,
|
|
"num_tokens": 35792821.0,
|
|
"step": 19410
|
|
},
|
|
{
|
|
"entropy": 5.676604318618774,
|
|
"epoch": 1.6311279143037178,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047350740609243883,
|
|
"loss": 5.4285,
|
|
"mean_token_accuracy": 0.1649575188755989,
|
|
"num_tokens": 35802746.0,
|
|
"step": 19415
|
|
},
|
|
{
|
|
"entropy": 5.721334552764892,
|
|
"epoch": 1.6315479941188826,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004734933063502738,
|
|
"loss": 5.421,
|
|
"mean_token_accuracy": 0.17509810924530028,
|
|
"num_tokens": 35811196.0,
|
|
"step": 19420
|
|
},
|
|
{
|
|
"entropy": 5.818255996704101,
|
|
"epoch": 1.6319680739340474,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00047347920309190203,
|
|
"loss": 5.4471,
|
|
"mean_token_accuracy": 0.16493862569332124,
|
|
"num_tokens": 35820787.0,
|
|
"step": 19425
|
|
},
|
|
{
|
|
"entropy": 5.703247213363648,
|
|
"epoch": 1.6323881537492122,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004734650963175734,
|
|
"loss": 5.4246,
|
|
"mean_token_accuracy": 0.16639426350593567,
|
|
"num_tokens": 35831247.0,
|
|
"step": 19430
|
|
},
|
|
{
|
|
"entropy": 5.648799848556519,
|
|
"epoch": 1.6328082335643772,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00047345098602753777,
|
|
"loss": 5.4563,
|
|
"mean_token_accuracy": 0.16505984961986542,
|
|
"num_tokens": 35840759.0,
|
|
"step": 19435
|
|
},
|
|
{
|
|
"entropy": 5.622422122955323,
|
|
"epoch": 1.6332283133795422,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004734368722220451,
|
|
"loss": 5.41,
|
|
"mean_token_accuracy": 0.16521313637495041,
|
|
"num_tokens": 35850137.0,
|
|
"step": 19440
|
|
},
|
|
{
|
|
"entropy": 5.628439140319824,
|
|
"epoch": 1.633648393194707,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004734227549013455,
|
|
"loss": 5.2396,
|
|
"mean_token_accuracy": 0.1794390082359314,
|
|
"num_tokens": 35858412.0,
|
|
"step": 19445
|
|
},
|
|
{
|
|
"entropy": 5.655402612686157,
|
|
"epoch": 1.6340684730098718,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004734086340656889,
|
|
"loss": 5.3312,
|
|
"mean_token_accuracy": 0.1723542883992195,
|
|
"num_tokens": 35868202.0,
|
|
"step": 19450
|
|
},
|
|
{
|
|
"entropy": 5.646328258514404,
|
|
"epoch": 1.6344885528250368,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004733945097153255,
|
|
"loss": 5.4003,
|
|
"mean_token_accuracy": 0.17372321784496308,
|
|
"num_tokens": 35877237.0,
|
|
"step": 19455
|
|
},
|
|
{
|
|
"entropy": 5.608543586730957,
|
|
"epoch": 1.6349086326402016,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004733803818505055,
|
|
"loss": 5.2715,
|
|
"mean_token_accuracy": 0.1802636206150055,
|
|
"num_tokens": 35887016.0,
|
|
"step": 19460
|
|
},
|
|
{
|
|
"entropy": 5.677346563339233,
|
|
"epoch": 1.6353287124553666,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047336625047147924,
|
|
"loss": 5.3485,
|
|
"mean_token_accuracy": 0.17663054317235946,
|
|
"num_tokens": 35896393.0,
|
|
"step": 19465
|
|
},
|
|
{
|
|
"entropy": 5.643209791183471,
|
|
"epoch": 1.6357487922705314,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047335211557849693,
|
|
"loss": 5.3902,
|
|
"mean_token_accuracy": 0.16930769830942155,
|
|
"num_tokens": 35905237.0,
|
|
"step": 19470
|
|
},
|
|
{
|
|
"entropy": 5.671267795562744,
|
|
"epoch": 1.6361688720856962,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004733379771718092,
|
|
"loss": 5.4229,
|
|
"mean_token_accuracy": 0.17023178488016127,
|
|
"num_tokens": 35914352.0,
|
|
"step": 19475
|
|
},
|
|
{
|
|
"entropy": 5.692772483825683,
|
|
"epoch": 1.6365889519008612,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004733238352516661,
|
|
"loss": 5.4805,
|
|
"mean_token_accuracy": 0.16938166916370392,
|
|
"num_tokens": 35923785.0,
|
|
"step": 19480
|
|
},
|
|
{
|
|
"entropy": 5.761615133285522,
|
|
"epoch": 1.6370090317160262,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047330968981831856,
|
|
"loss": 5.3858,
|
|
"mean_token_accuracy": 0.16777340024709703,
|
|
"num_tokens": 35932495.0,
|
|
"step": 19485
|
|
},
|
|
{
|
|
"entropy": 5.69402379989624,
|
|
"epoch": 1.637429111531191,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00047329554087201687,
|
|
"loss": 5.351,
|
|
"mean_token_accuracy": 0.17982448786497116,
|
|
"num_tokens": 35941745.0,
|
|
"step": 19490
|
|
},
|
|
{
|
|
"entropy": 5.660278224945069,
|
|
"epoch": 1.6378491913463558,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00047328138841301186,
|
|
"loss": 5.4418,
|
|
"mean_token_accuracy": 0.16807905286550523,
|
|
"num_tokens": 35950281.0,
|
|
"step": 19495
|
|
},
|
|
{
|
|
"entropy": 5.653802061080933,
|
|
"epoch": 1.6382692711615205,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004732672324415541,
|
|
"loss": 5.372,
|
|
"mean_token_accuracy": 0.1754430741071701,
|
|
"num_tokens": 35959531.0,
|
|
"step": 19500
|
|
},
|
|
{
|
|
"entropy": 5.73360242843628,
|
|
"epoch": 1.6386893509766856,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004732530729578945,
|
|
"loss": 5.4361,
|
|
"mean_token_accuracy": 0.17509964853525162,
|
|
"num_tokens": 35969462.0,
|
|
"step": 19505
|
|
},
|
|
{
|
|
"entropy": 5.659942388534546,
|
|
"epoch": 1.6391094307918506,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004732389099622837,
|
|
"loss": 5.411,
|
|
"mean_token_accuracy": 0.16947837471961974,
|
|
"num_tokens": 35978022.0,
|
|
"step": 19510
|
|
},
|
|
{
|
|
"entropy": 5.7105179786682125,
|
|
"epoch": 1.6395295106070154,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047322474345497267,
|
|
"loss": 5.4246,
|
|
"mean_token_accuracy": 0.16419751197099686,
|
|
"num_tokens": 35988193.0,
|
|
"step": 19515
|
|
},
|
|
{
|
|
"entropy": 5.762126207351685,
|
|
"epoch": 1.6399495904221801,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047321057343621247,
|
|
"loss": 5.4216,
|
|
"mean_token_accuracy": 0.16807464212179185,
|
|
"num_tokens": 35997404.0,
|
|
"step": 19520
|
|
},
|
|
{
|
|
"entropy": 5.6027778625488285,
|
|
"epoch": 1.6403696702373451,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047319639990625395,
|
|
"loss": 5.3067,
|
|
"mean_token_accuracy": 0.1780134305357933,
|
|
"num_tokens": 36005356.0,
|
|
"step": 19525
|
|
},
|
|
{
|
|
"entropy": 5.74001407623291,
|
|
"epoch": 1.64078975005251,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00047318222286534824,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.16051921397447586,
|
|
"num_tokens": 36015305.0,
|
|
"step": 19530
|
|
},
|
|
{
|
|
"entropy": 5.77122483253479,
|
|
"epoch": 1.641209829867675,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00047316804231374663,
|
|
"loss": 5.4209,
|
|
"mean_token_accuracy": 0.1640459731221199,
|
|
"num_tokens": 36024278.0,
|
|
"step": 19535
|
|
},
|
|
{
|
|
"entropy": 5.6274620532989506,
|
|
"epoch": 1.6416299096828397,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004731538582517001,
|
|
"loss": 5.2479,
|
|
"mean_token_accuracy": 0.17768406867980957,
|
|
"num_tokens": 36032870.0,
|
|
"step": 19540
|
|
},
|
|
{
|
|
"entropy": 5.569975423812866,
|
|
"epoch": 1.6420499894980045,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00047313967067945996,
|
|
"loss": 5.2931,
|
|
"mean_token_accuracy": 0.17938766926527022,
|
|
"num_tokens": 36041725.0,
|
|
"step": 19545
|
|
},
|
|
{
|
|
"entropy": 5.649091005325317,
|
|
"epoch": 1.6424700693131695,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004731254795972777,
|
|
"loss": 5.423,
|
|
"mean_token_accuracy": 0.16832873672246934,
|
|
"num_tokens": 36050929.0,
|
|
"step": 19550
|
|
},
|
|
{
|
|
"entropy": 5.709831714630127,
|
|
"epoch": 1.6428901491283345,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004731112850054045,
|
|
"loss": 5.4119,
|
|
"mean_token_accuracy": 0.16599251627922057,
|
|
"num_tokens": 36060059.0,
|
|
"step": 19555
|
|
},
|
|
{
|
|
"entropy": 5.649776840209961,
|
|
"epoch": 1.6433102289434993,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004730970869040919,
|
|
"loss": 5.3525,
|
|
"mean_token_accuracy": 0.18190265446901321,
|
|
"num_tokens": 36069445.0,
|
|
"step": 19560
|
|
},
|
|
{
|
|
"entropy": 5.696929168701172,
|
|
"epoch": 1.643730308758664,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00047308288529359147,
|
|
"loss": 5.4943,
|
|
"mean_token_accuracy": 0.16712310314178466,
|
|
"num_tokens": 36079129.0,
|
|
"step": 19565
|
|
},
|
|
{
|
|
"entropy": 5.7188207626342775,
|
|
"epoch": 1.644150388573829,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004730686801741547,
|
|
"loss": 5.3679,
|
|
"mean_token_accuracy": 0.17080006003379822,
|
|
"num_tokens": 36088320.0,
|
|
"step": 19570
|
|
},
|
|
{
|
|
"entropy": 5.674493503570557,
|
|
"epoch": 1.644570468388994,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004730544715460332,
|
|
"loss": 5.4237,
|
|
"mean_token_accuracy": 0.17072638422250747,
|
|
"num_tokens": 36097728.0,
|
|
"step": 19575
|
|
},
|
|
{
|
|
"entropy": 5.724712228775024,
|
|
"epoch": 1.644990548204159,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00047304025940947875,
|
|
"loss": 5.4189,
|
|
"mean_token_accuracy": 0.1723160296678543,
|
|
"num_tokens": 36106566.0,
|
|
"step": 19580
|
|
},
|
|
{
|
|
"entropy": 5.699596214294433,
|
|
"epoch": 1.6454106280193237,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00047302604376474306,
|
|
"loss": 5.3691,
|
|
"mean_token_accuracy": 0.16786410212516784,
|
|
"num_tokens": 36115475.0,
|
|
"step": 19585
|
|
},
|
|
{
|
|
"entropy": 5.62215142250061,
|
|
"epoch": 1.6458307078344885,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047301182461207807,
|
|
"loss": 5.4812,
|
|
"mean_token_accuracy": 0.17268287092447282,
|
|
"num_tokens": 36124404.0,
|
|
"step": 19590
|
|
},
|
|
{
|
|
"entropy": 5.670156955718994,
|
|
"epoch": 1.6462507876496533,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00047299760195173554,
|
|
"loss": 5.3278,
|
|
"mean_token_accuracy": 0.1758397027850151,
|
|
"num_tokens": 36132987.0,
|
|
"step": 19595
|
|
},
|
|
{
|
|
"entropy": 5.701921844482422,
|
|
"epoch": 1.6466708674648183,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004729833757839673,
|
|
"loss": 5.4756,
|
|
"mean_token_accuracy": 0.17378847897052765,
|
|
"num_tokens": 36142163.0,
|
|
"step": 19600
|
|
},
|
|
{
|
|
"entropy": 5.707473468780518,
|
|
"epoch": 1.6470909472799833,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047296914610902565,
|
|
"loss": 5.4488,
|
|
"mean_token_accuracy": 0.16369751691818238,
|
|
"num_tokens": 36152561.0,
|
|
"step": 19605
|
|
},
|
|
{
|
|
"entropy": 5.710807847976684,
|
|
"epoch": 1.647511027095148,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00047295491292716245,
|
|
"loss": 5.363,
|
|
"mean_token_accuracy": 0.16720346361398697,
|
|
"num_tokens": 36161877.0,
|
|
"step": 19610
|
|
},
|
|
{
|
|
"entropy": 5.670904731750488,
|
|
"epoch": 1.6479311069103129,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00047294067623862996,
|
|
"loss": 5.3954,
|
|
"mean_token_accuracy": 0.164234559237957,
|
|
"num_tokens": 36171523.0,
|
|
"step": 19615
|
|
},
|
|
{
|
|
"entropy": 5.612199401855468,
|
|
"epoch": 1.6483511867254779,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00047292643604368025,
|
|
"loss": 5.3371,
|
|
"mean_token_accuracy": 0.1748445972800255,
|
|
"num_tokens": 36180339.0,
|
|
"step": 19620
|
|
},
|
|
{
|
|
"entropy": 5.7124796390533445,
|
|
"epoch": 1.6487712665406429,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004729121923425657,
|
|
"loss": 5.4309,
|
|
"mean_token_accuracy": 0.1659110963344574,
|
|
"num_tokens": 36191584.0,
|
|
"step": 19625
|
|
},
|
|
{
|
|
"entropy": 5.788698005676269,
|
|
"epoch": 1.6491913463558077,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004728979451355385,
|
|
"loss": 5.4677,
|
|
"mean_token_accuracy": 0.16967541128396987,
|
|
"num_tokens": 36200738.0,
|
|
"step": 19630
|
|
},
|
|
{
|
|
"entropy": 5.621402883529663,
|
|
"epoch": 1.6496114261709725,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00047288369442285115,
|
|
"loss": 5.2805,
|
|
"mean_token_accuracy": 0.18398987352848054,
|
|
"num_tokens": 36209394.0,
|
|
"step": 19635
|
|
},
|
|
{
|
|
"entropy": 5.628550434112549,
|
|
"epoch": 1.6500315059861372,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00047286944020475606,
|
|
"loss": 5.4013,
|
|
"mean_token_accuracy": 0.17032790631055833,
|
|
"num_tokens": 36218268.0,
|
|
"step": 19640
|
|
},
|
|
{
|
|
"entropy": 5.638523435592651,
|
|
"epoch": 1.6504515858013022,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004728551824815057,
|
|
"loss": 5.3451,
|
|
"mean_token_accuracy": 0.17313553392887115,
|
|
"num_tokens": 36226974.0,
|
|
"step": 19645
|
|
},
|
|
{
|
|
"entropy": 5.580386114120484,
|
|
"epoch": 1.6508716656164673,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047284092125335277,
|
|
"loss": 5.3191,
|
|
"mean_token_accuracy": 0.1764894738793373,
|
|
"num_tokens": 36235892.0,
|
|
"step": 19650
|
|
},
|
|
{
|
|
"entropy": 5.583609628677368,
|
|
"epoch": 1.651291745431632,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004728266565205497,
|
|
"loss": 5.3286,
|
|
"mean_token_accuracy": 0.17261691987514496,
|
|
"num_tokens": 36244750.0,
|
|
"step": 19655
|
|
},
|
|
{
|
|
"entropy": 5.665705299377441,
|
|
"epoch": 1.6517118252467968,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047281238828334924,
|
|
"loss": 5.3737,
|
|
"mean_token_accuracy": 0.17210416346788407,
|
|
"num_tokens": 36254902.0,
|
|
"step": 19660
|
|
},
|
|
{
|
|
"entropy": 5.684027051925659,
|
|
"epoch": 1.6521319050619616,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004727981165420042,
|
|
"loss": 5.4264,
|
|
"mean_token_accuracy": 0.16854705959558486,
|
|
"num_tokens": 36265546.0,
|
|
"step": 19665
|
|
},
|
|
{
|
|
"entropy": 5.635334634780884,
|
|
"epoch": 1.6525519848771266,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004727838412967674,
|
|
"loss": 5.3356,
|
|
"mean_token_accuracy": 0.1739551231265068,
|
|
"num_tokens": 36273978.0,
|
|
"step": 19670
|
|
},
|
|
{
|
|
"entropy": 5.694224214553833,
|
|
"epoch": 1.6529720646922916,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004727695625478917,
|
|
"loss": 5.3725,
|
|
"mean_token_accuracy": 0.16794622987508773,
|
|
"num_tokens": 36283117.0,
|
|
"step": 19675
|
|
},
|
|
{
|
|
"entropy": 5.7062092304229735,
|
|
"epoch": 1.6533921445074564,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00047275528029562996,
|
|
"loss": 5.37,
|
|
"mean_token_accuracy": 0.16468877643346785,
|
|
"num_tokens": 36293031.0,
|
|
"step": 19680
|
|
},
|
|
{
|
|
"entropy": 5.597905492782592,
|
|
"epoch": 1.6538122243226212,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00047274099454023535,
|
|
"loss": 5.3618,
|
|
"mean_token_accuracy": 0.1748396039009094,
|
|
"num_tokens": 36302080.0,
|
|
"step": 19685
|
|
},
|
|
{
|
|
"entropy": 5.6517712593078615,
|
|
"epoch": 1.6542323041377862,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047272670528196084,
|
|
"loss": 5.389,
|
|
"mean_token_accuracy": 0.1675845429301262,
|
|
"num_tokens": 36311077.0,
|
|
"step": 19690
|
|
},
|
|
{
|
|
"entropy": 5.637048244476318,
|
|
"epoch": 1.6546523839529512,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004727124125210595,
|
|
"loss": 5.3213,
|
|
"mean_token_accuracy": 0.1745500758290291,
|
|
"num_tokens": 36320300.0,
|
|
"step": 19695
|
|
},
|
|
{
|
|
"entropy": 5.641404485702514,
|
|
"epoch": 1.655072463768116,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047269811625778456,
|
|
"loss": 5.3872,
|
|
"mean_token_accuracy": 0.17139033675193788,
|
|
"num_tokens": 36330184.0,
|
|
"step": 19700
|
|
},
|
|
{
|
|
"entropy": 5.538795757293701,
|
|
"epoch": 1.6554925435832808,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004726838164923893,
|
|
"loss": 5.3895,
|
|
"mean_token_accuracy": 0.16786455661058425,
|
|
"num_tokens": 36339526.0,
|
|
"step": 19705
|
|
},
|
|
{
|
|
"entropy": 5.6508077621459964,
|
|
"epoch": 1.6559126233984456,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00047266951322512716,
|
|
"loss": 5.3813,
|
|
"mean_token_accuracy": 0.1695254623889923,
|
|
"num_tokens": 36348849.0,
|
|
"step": 19710
|
|
},
|
|
{
|
|
"entropy": 5.727986001968384,
|
|
"epoch": 1.6563327032136106,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00047265520645625123,
|
|
"loss": 5.3911,
|
|
"mean_token_accuracy": 0.1646333172917366,
|
|
"num_tokens": 36358924.0,
|
|
"step": 19715
|
|
},
|
|
{
|
|
"entropy": 5.7471997261047365,
|
|
"epoch": 1.6567527830287756,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00047264089618601513,
|
|
"loss": 5.422,
|
|
"mean_token_accuracy": 0.17060866355895996,
|
|
"num_tokens": 36367130.0,
|
|
"step": 19720
|
|
},
|
|
{
|
|
"entropy": 5.6405291080474855,
|
|
"epoch": 1.6571728628439404,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004726265824146724,
|
|
"loss": 5.3726,
|
|
"mean_token_accuracy": 0.16610245555639266,
|
|
"num_tokens": 36376575.0,
|
|
"step": 19725
|
|
},
|
|
{
|
|
"entropy": 5.561537742614746,
|
|
"epoch": 1.6575929426591052,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004726122651424764,
|
|
"loss": 5.3,
|
|
"mean_token_accuracy": 0.1740986868739128,
|
|
"num_tokens": 36385010.0,
|
|
"step": 19730
|
|
},
|
|
{
|
|
"entropy": 5.581302356719971,
|
|
"epoch": 1.65801302247427,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000472597944369681,
|
|
"loss": 5.1033,
|
|
"mean_token_accuracy": 0.18641779869794844,
|
|
"num_tokens": 36393574.0,
|
|
"step": 19735
|
|
},
|
|
{
|
|
"entropy": 5.634199094772339,
|
|
"epoch": 1.658433102289435,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00047258362009653965,
|
|
"loss": 5.3236,
|
|
"mean_token_accuracy": 0.17412642389535904,
|
|
"num_tokens": 36401992.0,
|
|
"step": 19740
|
|
},
|
|
{
|
|
"entropy": 5.673167896270752,
|
|
"epoch": 1.6588531821046,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00047256929232330624,
|
|
"loss": 5.463,
|
|
"mean_token_accuracy": 0.160048608481884,
|
|
"num_tokens": 36411712.0,
|
|
"step": 19745
|
|
},
|
|
{
|
|
"entropy": 5.579254055023194,
|
|
"epoch": 1.6592732619197648,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004725549610502346,
|
|
"loss": 5.2837,
|
|
"mean_token_accuracy": 0.17299832701683043,
|
|
"num_tokens": 36420240.0,
|
|
"step": 19750
|
|
},
|
|
{
|
|
"entropy": 5.632542705535888,
|
|
"epoch": 1.6596933417349296,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047254062627757854,
|
|
"loss": 5.4063,
|
|
"mean_token_accuracy": 0.17789214998483657,
|
|
"num_tokens": 36430068.0,
|
|
"step": 19755
|
|
},
|
|
{
|
|
"entropy": 5.690257835388183,
|
|
"epoch": 1.6601134215500946,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.000472526288005592,
|
|
"loss": 5.4355,
|
|
"mean_token_accuracy": 0.16823179572820662,
|
|
"num_tokens": 36439808.0,
|
|
"step": 19760
|
|
},
|
|
{
|
|
"entropy": 5.611015462875367,
|
|
"epoch": 1.6605335013652593,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000472511946234529,
|
|
"loss": 5.3956,
|
|
"mean_token_accuracy": 0.17020961195230483,
|
|
"num_tokens": 36449609.0,
|
|
"step": 19765
|
|
},
|
|
{
|
|
"entropy": 5.7615532875061035,
|
|
"epoch": 1.6609535811804244,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004724976009646435,
|
|
"loss": 5.3424,
|
|
"mean_token_accuracy": 0.17360990196466447,
|
|
"num_tokens": 36457700.0,
|
|
"step": 19770
|
|
},
|
|
{
|
|
"entropy": 5.669061231613159,
|
|
"epoch": 1.6613736609955891,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004724832521961897,
|
|
"loss": 5.4023,
|
|
"mean_token_accuracy": 0.17211264073848725,
|
|
"num_tokens": 36466881.0,
|
|
"step": 19775
|
|
},
|
|
{
|
|
"entropy": 5.711100006103516,
|
|
"epoch": 1.661793740810754,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047246889992942187,
|
|
"loss": 5.495,
|
|
"mean_token_accuracy": 0.16188012808561325,
|
|
"num_tokens": 36475433.0,
|
|
"step": 19780
|
|
},
|
|
{
|
|
"entropy": 5.68057951927185,
|
|
"epoch": 1.662213820625919,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004724545441645941,
|
|
"loss": 5.4116,
|
|
"mean_token_accuracy": 0.16782844066619873,
|
|
"num_tokens": 36484232.0,
|
|
"step": 19785
|
|
},
|
|
{
|
|
"entropy": 5.754859256744385,
|
|
"epoch": 1.662633900441084,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004724401849019608,
|
|
"loss": 5.5269,
|
|
"mean_token_accuracy": 0.1602175533771515,
|
|
"num_tokens": 36493588.0,
|
|
"step": 19790
|
|
},
|
|
{
|
|
"entropy": 5.669810009002686,
|
|
"epoch": 1.6630539802562487,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047242582214177616,
|
|
"loss": 5.3045,
|
|
"mean_token_accuracy": 0.1697609916329384,
|
|
"num_tokens": 36502289.0,
|
|
"step": 19795
|
|
},
|
|
{
|
|
"entropy": 5.724186754226684,
|
|
"epoch": 1.6634740600714135,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047241145588429483,
|
|
"loss": 5.4492,
|
|
"mean_token_accuracy": 0.1644959807395935,
|
|
"num_tokens": 36511978.0,
|
|
"step": 19800
|
|
},
|
|
{
|
|
"entropy": 5.680912446975708,
|
|
"epoch": 1.6638941398865783,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004723970861297712,
|
|
"loss": 5.4175,
|
|
"mean_token_accuracy": 0.17128399163484573,
|
|
"num_tokens": 36520378.0,
|
|
"step": 19805
|
|
},
|
|
{
|
|
"entropy": 5.655539083480835,
|
|
"epoch": 1.6643142197017433,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004723827128784599,
|
|
"loss": 5.4029,
|
|
"mean_token_accuracy": 0.16915369629859925,
|
|
"num_tokens": 36529965.0,
|
|
"step": 19810
|
|
},
|
|
{
|
|
"entropy": 5.836799001693725,
|
|
"epoch": 1.6647342995169083,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00047236833613061534,
|
|
"loss": 5.4194,
|
|
"mean_token_accuracy": 0.16969371736049652,
|
|
"num_tokens": 36539394.0,
|
|
"step": 19815
|
|
},
|
|
{
|
|
"entropy": 5.667885828018188,
|
|
"epoch": 1.665154379332073,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004723539558864925,
|
|
"loss": 5.4697,
|
|
"mean_token_accuracy": 0.17036117166280745,
|
|
"num_tokens": 36548608.0,
|
|
"step": 19820
|
|
},
|
|
{
|
|
"entropy": 5.670717477798462,
|
|
"epoch": 1.665574459147238,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004723395721463459,
|
|
"loss": 5.3393,
|
|
"mean_token_accuracy": 0.1704514279961586,
|
|
"num_tokens": 36557736.0,
|
|
"step": 19825
|
|
},
|
|
{
|
|
"entropy": 5.6675090312957765,
|
|
"epoch": 1.665994538962403,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004723251849104303,
|
|
"loss": 5.3703,
|
|
"mean_token_accuracy": 0.16267035156488419,
|
|
"num_tokens": 36566745.0,
|
|
"step": 19830
|
|
},
|
|
{
|
|
"entropy": 5.575802850723266,
|
|
"epoch": 1.6664146187775677,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00047231079417900076,
|
|
"loss": 5.3086,
|
|
"mean_token_accuracy": 0.1693269893527031,
|
|
"num_tokens": 36575956.0,
|
|
"step": 19835
|
|
},
|
|
{
|
|
"entropy": 5.638355350494384,
|
|
"epoch": 1.6668346985927327,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000472296399952312,
|
|
"loss": 5.3651,
|
|
"mean_token_accuracy": 0.17209307253360748,
|
|
"num_tokens": 36584673.0,
|
|
"step": 19840
|
|
},
|
|
{
|
|
"entropy": 5.703708839416504,
|
|
"epoch": 1.6672547784078975,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004722820022306192,
|
|
"loss": 5.422,
|
|
"mean_token_accuracy": 0.17276596128940583,
|
|
"num_tokens": 36593758.0,
|
|
"step": 19845
|
|
},
|
|
{
|
|
"entropy": 5.591260862350464,
|
|
"epoch": 1.6676748582230623,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004722676010141773,
|
|
"loss": 5.2767,
|
|
"mean_token_accuracy": 0.16923788189888,
|
|
"num_tokens": 36603722.0,
|
|
"step": 19850
|
|
},
|
|
{
|
|
"entropy": 5.6357824325561525,
|
|
"epoch": 1.6680949380382273,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047225319630324136,
|
|
"loss": 5.3335,
|
|
"mean_token_accuracy": 0.17396993786096573,
|
|
"num_tokens": 36612478.0,
|
|
"step": 19855
|
|
},
|
|
{
|
|
"entropy": 5.656694173812866,
|
|
"epoch": 1.6685150178533923,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004722387880980667,
|
|
"loss": 5.535,
|
|
"mean_token_accuracy": 0.16138018071651458,
|
|
"num_tokens": 36622399.0,
|
|
"step": 19860
|
|
},
|
|
{
|
|
"entropy": 5.709191513061524,
|
|
"epoch": 1.668935097668557,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047222437639890844,
|
|
"loss": 5.3687,
|
|
"mean_token_accuracy": 0.17041545510292053,
|
|
"num_tokens": 36631798.0,
|
|
"step": 19865
|
|
},
|
|
{
|
|
"entropy": 5.570785617828369,
|
|
"epoch": 1.6693551774837219,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.00047220996120602197,
|
|
"loss": 5.3879,
|
|
"mean_token_accuracy": 0.1724646970629692,
|
|
"num_tokens": 36640405.0,
|
|
"step": 19870
|
|
},
|
|
{
|
|
"entropy": 5.717275476455688,
|
|
"epoch": 1.6697752572988867,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047219554251966246,
|
|
"loss": 5.5201,
|
|
"mean_token_accuracy": 0.1616477571427822,
|
|
"num_tokens": 36650209.0,
|
|
"step": 19875
|
|
},
|
|
{
|
|
"entropy": 5.780755186080933,
|
|
"epoch": 1.6701953371140517,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004721811203400855,
|
|
"loss": 5.4614,
|
|
"mean_token_accuracy": 0.16350688189268112,
|
|
"num_tokens": 36660248.0,
|
|
"step": 19880
|
|
},
|
|
{
|
|
"entropy": 5.65017991065979,
|
|
"epoch": 1.6706154169292167,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047216669466754657,
|
|
"loss": 5.3575,
|
|
"mean_token_accuracy": 0.17064955681562424,
|
|
"num_tokens": 36669938.0,
|
|
"step": 19885
|
|
},
|
|
{
|
|
"entropy": 5.548468828201294,
|
|
"epoch": 1.6710354967443815,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004721522655023012,
|
|
"loss": 5.375,
|
|
"mean_token_accuracy": 0.17481788247823715,
|
|
"num_tokens": 36679903.0,
|
|
"step": 19890
|
|
},
|
|
{
|
|
"entropy": 5.753811597824097,
|
|
"epoch": 1.6714555765595462,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004721378328446049,
|
|
"loss": 5.4404,
|
|
"mean_token_accuracy": 0.17175495326519014,
|
|
"num_tokens": 36688424.0,
|
|
"step": 19895
|
|
},
|
|
{
|
|
"entropy": 5.7923060894012455,
|
|
"epoch": 1.6718756563747112,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004721233966947134,
|
|
"loss": 5.5084,
|
|
"mean_token_accuracy": 0.16471525579690932,
|
|
"num_tokens": 36698715.0,
|
|
"step": 19900
|
|
},
|
|
{
|
|
"entropy": 5.536203193664551,
|
|
"epoch": 1.672295736189876,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00047210895705288237,
|
|
"loss": 5.3675,
|
|
"mean_token_accuracy": 0.18398713916540146,
|
|
"num_tokens": 36708456.0,
|
|
"step": 19905
|
|
},
|
|
{
|
|
"entropy": 5.648340320587158,
|
|
"epoch": 1.672715816005041,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004720945139193678,
|
|
"loss": 5.3991,
|
|
"mean_token_accuracy": 0.1710827425122261,
|
|
"num_tokens": 36717596.0,
|
|
"step": 19910
|
|
},
|
|
{
|
|
"entropy": 5.751449108123779,
|
|
"epoch": 1.6731358958202058,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004720800672944253,
|
|
"loss": 5.483,
|
|
"mean_token_accuracy": 0.16295073330402374,
|
|
"num_tokens": 36727092.0,
|
|
"step": 19915
|
|
},
|
|
{
|
|
"entropy": 5.658854913711548,
|
|
"epoch": 1.6735559756353706,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004720656171783109,
|
|
"loss": 5.2087,
|
|
"mean_token_accuracy": 0.18139244765043258,
|
|
"num_tokens": 36735910.0,
|
|
"step": 19920
|
|
},
|
|
{
|
|
"entropy": 5.5950675964355465,
|
|
"epoch": 1.6739760554505356,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004720511635712806,
|
|
"loss": 5.3288,
|
|
"mean_token_accuracy": 0.17601545453071593,
|
|
"num_tokens": 36745237.0,
|
|
"step": 19925
|
|
},
|
|
{
|
|
"entropy": 5.65685772895813,
|
|
"epoch": 1.6743961352657006,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047203670647359035,
|
|
"loss": 5.466,
|
|
"mean_token_accuracy": 0.16994206011295318,
|
|
"num_tokens": 36753603.0,
|
|
"step": 19930
|
|
},
|
|
{
|
|
"entropy": 5.777238512039185,
|
|
"epoch": 1.6748162150808654,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004720222458854964,
|
|
"loss": 5.4552,
|
|
"mean_token_accuracy": 0.16490527987480164,
|
|
"num_tokens": 36763010.0,
|
|
"step": 19935
|
|
},
|
|
{
|
|
"entropy": 5.711502265930176,
|
|
"epoch": 1.6752362948960302,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047200778180725477,
|
|
"loss": 5.384,
|
|
"mean_token_accuracy": 0.17073629200458526,
|
|
"num_tokens": 36772156.0,
|
|
"step": 19940
|
|
},
|
|
{
|
|
"entropy": 5.617578077316284,
|
|
"epoch": 1.675656374711195,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00047199331423912174,
|
|
"loss": 5.2788,
|
|
"mean_token_accuracy": 0.175381575524807,
|
|
"num_tokens": 36781386.0,
|
|
"step": 19945
|
|
},
|
|
{
|
|
"entropy": 5.661901426315308,
|
|
"epoch": 1.67607645452636,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004719788431813536,
|
|
"loss": 5.4426,
|
|
"mean_token_accuracy": 0.1666231006383896,
|
|
"num_tokens": 36790754.0,
|
|
"step": 19950
|
|
},
|
|
{
|
|
"entropy": 5.7037333965301515,
|
|
"epoch": 1.676496534341525,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004719643686342066,
|
|
"loss": 5.411,
|
|
"mean_token_accuracy": 0.1669971838593483,
|
|
"num_tokens": 36799623.0,
|
|
"step": 19955
|
|
},
|
|
{
|
|
"entropy": 5.560089445114135,
|
|
"epoch": 1.6769166141566898,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004719498905979373,
|
|
"loss": 5.2094,
|
|
"mean_token_accuracy": 0.18330834209918975,
|
|
"num_tokens": 36808662.0,
|
|
"step": 19960
|
|
},
|
|
{
|
|
"entropy": 5.662607908248901,
|
|
"epoch": 1.6773366939718546,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004719354090728021,
|
|
"loss": 5.3575,
|
|
"mean_token_accuracy": 0.1706179365515709,
|
|
"num_tokens": 36817730.0,
|
|
"step": 19965
|
|
},
|
|
{
|
|
"entropy": 5.664571619033813,
|
|
"epoch": 1.6777567737870194,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00047192092405905743,
|
|
"loss": 5.3373,
|
|
"mean_token_accuracy": 0.1712536782026291,
|
|
"num_tokens": 36827203.0,
|
|
"step": 19970
|
|
},
|
|
{
|
|
"entropy": 5.685041427612305,
|
|
"epoch": 1.6781768536021844,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004719064355569601,
|
|
"loss": 5.5026,
|
|
"mean_token_accuracy": 0.1671118676662445,
|
|
"num_tokens": 36836145.0,
|
|
"step": 19975
|
|
},
|
|
{
|
|
"entropy": 5.684621858596802,
|
|
"epoch": 1.6785969334173494,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00047189194356676666,
|
|
"loss": 5.4991,
|
|
"mean_token_accuracy": 0.16675533056259156,
|
|
"num_tokens": 36845609.0,
|
|
"step": 19980
|
|
},
|
|
{
|
|
"entropy": 5.687040328979492,
|
|
"epoch": 1.6790170132325142,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047187744808873386,
|
|
"loss": 5.5006,
|
|
"mean_token_accuracy": 0.16970218122005462,
|
|
"num_tokens": 36855367.0,
|
|
"step": 19985
|
|
},
|
|
{
|
|
"entropy": 5.6927672863006595,
|
|
"epoch": 1.679437093047679,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.00047186294912311835,
|
|
"loss": 5.4542,
|
|
"mean_token_accuracy": 0.16267849504947662,
|
|
"num_tokens": 36864808.0,
|
|
"step": 19990
|
|
},
|
|
{
|
|
"entropy": 5.654680156707764,
|
|
"epoch": 1.679857172862844,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047184844667017705,
|
|
"loss": 5.3155,
|
|
"mean_token_accuracy": 0.1753552258014679,
|
|
"num_tokens": 36873651.0,
|
|
"step": 19995
|
|
},
|
|
{
|
|
"entropy": 5.636945676803589,
|
|
"epoch": 1.680277252678009,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00047183394073016695,
|
|
"loss": 5.4605,
|
|
"mean_token_accuracy": 0.1641372784972191,
|
|
"num_tokens": 36883227.0,
|
|
"step": 20000
|
|
},
|
|
{
|
|
"entropy": 5.602861928939819,
|
|
"epoch": 1.6806973324931738,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00047181943130334493,
|
|
"loss": 5.2416,
|
|
"mean_token_accuracy": 0.1794225737452507,
|
|
"num_tokens": 36891628.0,
|
|
"step": 20005
|
|
},
|
|
{
|
|
"entropy": 5.613242959976196,
|
|
"epoch": 1.6811174123083386,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000471804918389968,
|
|
"loss": 5.3644,
|
|
"mean_token_accuracy": 0.16712576299905776,
|
|
"num_tokens": 36901819.0,
|
|
"step": 20010
|
|
},
|
|
{
|
|
"entropy": 5.6311595916748045,
|
|
"epoch": 1.6815374921235033,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004717904019902933,
|
|
"loss": 5.4003,
|
|
"mean_token_accuracy": 0.17059791535139085,
|
|
"num_tokens": 36911206.0,
|
|
"step": 20015
|
|
},
|
|
{
|
|
"entropy": 5.639288139343262,
|
|
"epoch": 1.6819575719386683,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000471775882104578,
|
|
"loss": 5.3459,
|
|
"mean_token_accuracy": 0.17099965065717698,
|
|
"num_tokens": 36920830.0,
|
|
"step": 20020
|
|
},
|
|
{
|
|
"entropy": 5.543249130249023,
|
|
"epoch": 1.6823776517538334,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00047176135873307917,
|
|
"loss": 5.2633,
|
|
"mean_token_accuracy": 0.17037912011146544,
|
|
"num_tokens": 36929702.0,
|
|
"step": 20025
|
|
},
|
|
{
|
|
"entropy": 5.723000860214233,
|
|
"epoch": 1.6827977315689981,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004717468318760543,
|
|
"loss": 5.4725,
|
|
"mean_token_accuracy": 0.16794218271970748,
|
|
"num_tokens": 36938423.0,
|
|
"step": 20030
|
|
},
|
|
{
|
|
"entropy": 5.695086097717285,
|
|
"epoch": 1.683217811384163,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047173230153376057,
|
|
"loss": 5.3934,
|
|
"mean_token_accuracy": 0.16773395538330077,
|
|
"num_tokens": 36947198.0,
|
|
"step": 20035
|
|
},
|
|
{
|
|
"entropy": 5.658504676818848,
|
|
"epoch": 1.6836378911993277,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004717177677064554,
|
|
"loss": 5.3724,
|
|
"mean_token_accuracy": 0.17398134768009185,
|
|
"num_tokens": 36955636.0,
|
|
"step": 20040
|
|
},
|
|
{
|
|
"entropy": 5.590145826339722,
|
|
"epoch": 1.6840579710144927,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00047170323039439634,
|
|
"loss": 5.3286,
|
|
"mean_token_accuracy": 0.17025046944618225,
|
|
"num_tokens": 36964463.0,
|
|
"step": 20045
|
|
},
|
|
{
|
|
"entropy": 5.70718035697937,
|
|
"epoch": 1.6844780508296577,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004716886895978408,
|
|
"loss": 5.4353,
|
|
"mean_token_accuracy": 0.1722966879606247,
|
|
"num_tokens": 36974043.0,
|
|
"step": 20050
|
|
},
|
|
{
|
|
"entropy": 5.650777006149292,
|
|
"epoch": 1.6848981306448225,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00047167414531704637,
|
|
"loss": 5.3406,
|
|
"mean_token_accuracy": 0.17258572578430176,
|
|
"num_tokens": 36983856.0,
|
|
"step": 20055
|
|
},
|
|
{
|
|
"entropy": 5.618655967712402,
|
|
"epoch": 1.6853182104599873,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047165959755227077,
|
|
"loss": 5.3678,
|
|
"mean_token_accuracy": 0.17598632574081421,
|
|
"num_tokens": 36992664.0,
|
|
"step": 20060
|
|
},
|
|
{
|
|
"entropy": 5.616749095916748,
|
|
"epoch": 1.6857382902751523,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047164504630377166,
|
|
"loss": 5.4167,
|
|
"mean_token_accuracy": 0.17754430770874025,
|
|
"num_tokens": 37001826.0,
|
|
"step": 20065
|
|
},
|
|
{
|
|
"entropy": 5.744650173187256,
|
|
"epoch": 1.6861583700903173,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00047163049157180676,
|
|
"loss": 5.4431,
|
|
"mean_token_accuracy": 0.16668420433998107,
|
|
"num_tokens": 37010821.0,
|
|
"step": 20070
|
|
},
|
|
{
|
|
"entropy": 5.711326599121094,
|
|
"epoch": 1.6865784499054821,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000471615933356634,
|
|
"loss": 5.5325,
|
|
"mean_token_accuracy": 0.15652497559785844,
|
|
"num_tokens": 37021293.0,
|
|
"step": 20075
|
|
},
|
|
{
|
|
"entropy": 5.657518434524536,
|
|
"epoch": 1.686998529720647,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004716013716585112,
|
|
"loss": 5.3066,
|
|
"mean_token_accuracy": 0.17586107850074767,
|
|
"num_tokens": 37031063.0,
|
|
"step": 20080
|
|
},
|
|
{
|
|
"entropy": 5.612600946426392,
|
|
"epoch": 1.6874186095358117,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004715868064776964,
|
|
"loss": 5.3682,
|
|
"mean_token_accuracy": 0.17950290441513062,
|
|
"num_tokens": 37040879.0,
|
|
"step": 20085
|
|
},
|
|
{
|
|
"entropy": 5.600485229492188,
|
|
"epoch": 1.6878386893509767,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004715722378144474,
|
|
"loss": 5.2522,
|
|
"mean_token_accuracy": 0.17968133985996246,
|
|
"num_tokens": 37049452.0,
|
|
"step": 20090
|
|
},
|
|
{
|
|
"entropy": 5.524720573425293,
|
|
"epoch": 1.6882587691661417,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004715576656690225,
|
|
"loss": 5.2317,
|
|
"mean_token_accuracy": 0.17775061279535292,
|
|
"num_tokens": 37058010.0,
|
|
"step": 20095
|
|
},
|
|
{
|
|
"entropy": 5.663621473312378,
|
|
"epoch": 1.6886788489813065,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00047154309004167984,
|
|
"loss": 5.4581,
|
|
"mean_token_accuracy": 0.1619523733854294,
|
|
"num_tokens": 37067580.0,
|
|
"step": 20100
|
|
},
|
|
{
|
|
"entropy": 5.626581048965454,
|
|
"epoch": 1.6890989287964713,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047152851093267744,
|
|
"loss": 5.3434,
|
|
"mean_token_accuracy": 0.17342365384101868,
|
|
"num_tokens": 37076584.0,
|
|
"step": 20105
|
|
},
|
|
{
|
|
"entropy": 5.6316078186035154,
|
|
"epoch": 1.689519008611636,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004715139283422737,
|
|
"loss": 5.3632,
|
|
"mean_token_accuracy": 0.16921617537736894,
|
|
"num_tokens": 37086330.0,
|
|
"step": 20110
|
|
},
|
|
{
|
|
"entropy": 5.696176671981812,
|
|
"epoch": 1.689939088426801,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000471499342270727,
|
|
"loss": 5.4194,
|
|
"mean_token_accuracy": 0.16318628638982774,
|
|
"num_tokens": 37096323.0,
|
|
"step": 20115
|
|
},
|
|
{
|
|
"entropy": 5.619508266448975,
|
|
"epoch": 1.690359168241966,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00047148475271829556,
|
|
"loss": 5.3484,
|
|
"mean_token_accuracy": 0.1682300463318825,
|
|
"num_tokens": 37106281.0,
|
|
"step": 20120
|
|
},
|
|
{
|
|
"entropy": 5.5695782661437985,
|
|
"epoch": 1.6907792480571309,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004714701596852379,
|
|
"loss": 5.3293,
|
|
"mean_token_accuracy": 0.1787579908967018,
|
|
"num_tokens": 37116002.0,
|
|
"step": 20125
|
|
},
|
|
{
|
|
"entropy": 5.626379442214966,
|
|
"epoch": 1.6911993278722957,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004714555631718125,
|
|
"loss": 5.4004,
|
|
"mean_token_accuracy": 0.17309577763080597,
|
|
"num_tokens": 37125125.0,
|
|
"step": 20130
|
|
},
|
|
{
|
|
"entropy": 5.601164245605469,
|
|
"epoch": 1.6916194076874607,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000471440963178278,
|
|
"loss": 5.2532,
|
|
"mean_token_accuracy": 0.18026716858148575,
|
|
"num_tokens": 37134358.0,
|
|
"step": 20135
|
|
},
|
|
{
|
|
"entropy": 5.707911014556885,
|
|
"epoch": 1.6920394875026254,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047142635970489293,
|
|
"loss": 5.4198,
|
|
"mean_token_accuracy": 0.16907861083745956,
|
|
"num_tokens": 37143732.0,
|
|
"step": 20140
|
|
},
|
|
{
|
|
"entropy": 5.634232664108277,
|
|
"epoch": 1.6924595673177905,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004714117527519161,
|
|
"loss": 5.3242,
|
|
"mean_token_accuracy": 0.17292115837335587,
|
|
"num_tokens": 37153809.0,
|
|
"step": 20145
|
|
},
|
|
{
|
|
"entropy": 5.632751035690307,
|
|
"epoch": 1.6928796471329552,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047139714231960616,
|
|
"loss": 5.3578,
|
|
"mean_token_accuracy": 0.16431571841239928,
|
|
"num_tokens": 37163272.0,
|
|
"step": 20150
|
|
},
|
|
{
|
|
"entropy": 5.591974878311158,
|
|
"epoch": 1.69329972694812,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000471382528408222,
|
|
"loss": 5.2814,
|
|
"mean_token_accuracy": 0.17594963163137436,
|
|
"num_tokens": 37172323.0,
|
|
"step": 20155
|
|
},
|
|
{
|
|
"entropy": 5.712676620483398,
|
|
"epoch": 1.693719806763285,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004713679110180225,
|
|
"loss": 5.4905,
|
|
"mean_token_accuracy": 0.16899462938308715,
|
|
"num_tokens": 37181262.0,
|
|
"step": 20160
|
|
},
|
|
{
|
|
"entropy": 5.6906595706939695,
|
|
"epoch": 1.69413988657845,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004713532901492666,
|
|
"loss": 5.4426,
|
|
"mean_token_accuracy": 0.17433411180973052,
|
|
"num_tokens": 37189576.0,
|
|
"step": 20165
|
|
},
|
|
{
|
|
"entropy": 5.718492841720581,
|
|
"epoch": 1.6945599663936148,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004713386658022132,
|
|
"loss": 5.4397,
|
|
"mean_token_accuracy": 0.16342198550701142,
|
|
"num_tokens": 37199502.0,
|
|
"step": 20170
|
|
},
|
|
{
|
|
"entropy": 5.702072095870972,
|
|
"epoch": 1.6949800462087796,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004713240379771214,
|
|
"loss": 5.3477,
|
|
"mean_token_accuracy": 0.16661544740200043,
|
|
"num_tokens": 37209028.0,
|
|
"step": 20175
|
|
},
|
|
{
|
|
"entropy": 5.704798460006714,
|
|
"epoch": 1.6954001260239444,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004713094066742505,
|
|
"loss": 5.4943,
|
|
"mean_token_accuracy": 0.16960543841123582,
|
|
"num_tokens": 37218087.0,
|
|
"step": 20180
|
|
},
|
|
{
|
|
"entropy": 5.675905656814575,
|
|
"epoch": 1.6958202058391094,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00047129477189385946,
|
|
"loss": 5.4475,
|
|
"mean_token_accuracy": 0.16832420825958253,
|
|
"num_tokens": 37227345.0,
|
|
"step": 20185
|
|
},
|
|
{
|
|
"entropy": 5.73945164680481,
|
|
"epoch": 1.6962402856542744,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004712801336362076,
|
|
"loss": 5.3736,
|
|
"mean_token_accuracy": 0.16931984573602676,
|
|
"num_tokens": 37236011.0,
|
|
"step": 20190
|
|
},
|
|
{
|
|
"entropy": 5.620118522644043,
|
|
"epoch": 1.6966603654694392,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004712654919015543,
|
|
"loss": 5.3576,
|
|
"mean_token_accuracy": 0.17278312891721725,
|
|
"num_tokens": 37244613.0,
|
|
"step": 20195
|
|
},
|
|
{
|
|
"entropy": 5.61281909942627,
|
|
"epoch": 1.697080445284604,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004712508466901588,
|
|
"loss": 5.3743,
|
|
"mean_token_accuracy": 0.1720852240920067,
|
|
"num_tokens": 37253768.0,
|
|
"step": 20200
|
|
},
|
|
{
|
|
"entropy": 5.713197374343872,
|
|
"epoch": 1.697500525099769,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047123619800228057,
|
|
"loss": 5.4486,
|
|
"mean_token_accuracy": 0.1585058517754078,
|
|
"num_tokens": 37263230.0,
|
|
"step": 20205
|
|
},
|
|
{
|
|
"entropy": 5.66543140411377,
|
|
"epoch": 1.6979206049149338,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004712215458381792,
|
|
"loss": 5.344,
|
|
"mean_token_accuracy": 0.1704501375555992,
|
|
"num_tokens": 37272752.0,
|
|
"step": 20210
|
|
},
|
|
{
|
|
"entropy": 5.69918270111084,
|
|
"epoch": 1.6983406847300988,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004712068901981142,
|
|
"loss": 5.3909,
|
|
"mean_token_accuracy": 0.17387653589248658,
|
|
"num_tokens": 37281465.0,
|
|
"step": 20215
|
|
},
|
|
{
|
|
"entropy": 5.658880043029785,
|
|
"epoch": 1.6987607645452636,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004711922310823452,
|
|
"loss": 5.3859,
|
|
"mean_token_accuracy": 0.16725497990846633,
|
|
"num_tokens": 37290408.0,
|
|
"step": 20220
|
|
},
|
|
{
|
|
"entropy": 5.644626569747925,
|
|
"epoch": 1.6991808443604284,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004711775684911318,
|
|
"loss": 5.3498,
|
|
"mean_token_accuracy": 0.1716018721461296,
|
|
"num_tokens": 37298890.0,
|
|
"step": 20225
|
|
},
|
|
{
|
|
"entropy": 5.60590615272522,
|
|
"epoch": 1.6996009241755934,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00047116290242473375,
|
|
"loss": 5.3494,
|
|
"mean_token_accuracy": 0.16820138245820998,
|
|
"num_tokens": 37307720.0,
|
|
"step": 20230
|
|
},
|
|
{
|
|
"entropy": 5.641182088851929,
|
|
"epoch": 1.7000210039907584,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.000471148232883411,
|
|
"loss": 5.3946,
|
|
"mean_token_accuracy": 0.16923058927059173,
|
|
"num_tokens": 37317145.0,
|
|
"step": 20235
|
|
},
|
|
{
|
|
"entropy": 5.6542257308959964,
|
|
"epoch": 1.7004410838059232,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00047113355986742325,
|
|
"loss": 5.329,
|
|
"mean_token_accuracy": 0.17771051228046417,
|
|
"num_tokens": 37326579.0,
|
|
"step": 20240
|
|
},
|
|
{
|
|
"entropy": 5.677807474136353,
|
|
"epoch": 1.700861163621088,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00047111888337703046,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.17049338668584824,
|
|
"num_tokens": 37336065.0,
|
|
"step": 20245
|
|
},
|
|
{
|
|
"entropy": 5.56732497215271,
|
|
"epoch": 1.7012812434362528,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004711042034124926,
|
|
"loss": 5.2807,
|
|
"mean_token_accuracy": 0.17862701117992402,
|
|
"num_tokens": 37345297.0,
|
|
"step": 20250
|
|
},
|
|
{
|
|
"entropy": 5.668249082565308,
|
|
"epoch": 1.7017013232514178,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004710895199740698,
|
|
"loss": 5.42,
|
|
"mean_token_accuracy": 0.16874612122774124,
|
|
"num_tokens": 37354942.0,
|
|
"step": 20255
|
|
},
|
|
{
|
|
"entropy": 5.729604482650757,
|
|
"epoch": 1.7021214030665828,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004710748330620222,
|
|
"loss": 5.3187,
|
|
"mean_token_accuracy": 0.17622058391571044,
|
|
"num_tokens": 37364068.0,
|
|
"step": 20260
|
|
},
|
|
{
|
|
"entropy": 5.6129645824432375,
|
|
"epoch": 1.7025414828817476,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004710601426766098,
|
|
"loss": 5.4302,
|
|
"mean_token_accuracy": 0.16786112040281295,
|
|
"num_tokens": 37373256.0,
|
|
"step": 20265
|
|
},
|
|
{
|
|
"entropy": 5.576197624206543,
|
|
"epoch": 1.7029615626969123,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00047104544881809295,
|
|
"loss": 5.2813,
|
|
"mean_token_accuracy": 0.17993906289339065,
|
|
"num_tokens": 37382098.0,
|
|
"step": 20270
|
|
},
|
|
{
|
|
"entropy": 5.573770999908447,
|
|
"epoch": 1.7033816425120771,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004710307514867319,
|
|
"loss": 5.2724,
|
|
"mean_token_accuracy": 0.17502158433198928,
|
|
"num_tokens": 37390844.0,
|
|
"step": 20275
|
|
},
|
|
{
|
|
"entropy": 5.67983660697937,
|
|
"epoch": 1.7038017223272421,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004710160506827871,
|
|
"loss": 5.3478,
|
|
"mean_token_accuracy": 0.16562999337911605,
|
|
"num_tokens": 37399617.0,
|
|
"step": 20280
|
|
},
|
|
{
|
|
"entropy": 5.7143641948699955,
|
|
"epoch": 1.7042218021424071,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004710013464065189,
|
|
"loss": 5.4787,
|
|
"mean_token_accuracy": 0.16709637641906738,
|
|
"num_tokens": 37409368.0,
|
|
"step": 20285
|
|
},
|
|
{
|
|
"entropy": 5.6330140113830565,
|
|
"epoch": 1.704641881957572,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004709866386581877,
|
|
"loss": 5.2808,
|
|
"mean_token_accuracy": 0.1773850664496422,
|
|
"num_tokens": 37418026.0,
|
|
"step": 20290
|
|
},
|
|
{
|
|
"entropy": 5.621044492721557,
|
|
"epoch": 1.7050619617727367,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00047097192743805413,
|
|
"loss": 5.3021,
|
|
"mean_token_accuracy": 0.1740890622138977,
|
|
"num_tokens": 37426850.0,
|
|
"step": 20295
|
|
},
|
|
{
|
|
"entropy": 5.63762059211731,
|
|
"epoch": 1.7054820415879017,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004709572127463788,
|
|
"loss": 5.3505,
|
|
"mean_token_accuracy": 0.1763610526919365,
|
|
"num_tokens": 37436631.0,
|
|
"step": 20300
|
|
},
|
|
{
|
|
"entropy": 5.673188161849976,
|
|
"epoch": 1.7059021214030667,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004709424945834223,
|
|
"loss": 5.3697,
|
|
"mean_token_accuracy": 0.1696738511323929,
|
|
"num_tokens": 37445619.0,
|
|
"step": 20305
|
|
},
|
|
{
|
|
"entropy": 5.609205055236816,
|
|
"epoch": 1.7063222012182315,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047092777294944544,
|
|
"loss": 5.3223,
|
|
"mean_token_accuracy": 0.17436521351337433,
|
|
"num_tokens": 37454205.0,
|
|
"step": 20310
|
|
},
|
|
{
|
|
"entropy": 5.672186851501465,
|
|
"epoch": 1.7067422810333963,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.000470913047844709,
|
|
"loss": 5.4272,
|
|
"mean_token_accuracy": 0.17115625292062758,
|
|
"num_tokens": 37463301.0,
|
|
"step": 20315
|
|
},
|
|
{
|
|
"entropy": 5.664550542831421,
|
|
"epoch": 1.707162360848561,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047089831926947374,
|
|
"loss": 5.4153,
|
|
"mean_token_accuracy": 0.1740603879094124,
|
|
"num_tokens": 37471937.0,
|
|
"step": 20320
|
|
},
|
|
{
|
|
"entropy": 5.715552902221679,
|
|
"epoch": 1.707582440663726,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004708835872240007,
|
|
"loss": 5.378,
|
|
"mean_token_accuracy": 0.17142789512872697,
|
|
"num_tokens": 37480779.0,
|
|
"step": 20325
|
|
},
|
|
{
|
|
"entropy": 5.700094079971313,
|
|
"epoch": 1.7080025204788911,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00047086885170855074,
|
|
"loss": 5.4218,
|
|
"mean_token_accuracy": 0.16403729021549224,
|
|
"num_tokens": 37491053.0,
|
|
"step": 20330
|
|
},
|
|
{
|
|
"entropy": 5.68527364730835,
|
|
"epoch": 1.708422600294056,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000470854112723385,
|
|
"loss": 5.3663,
|
|
"mean_token_accuracy": 0.17164998948574067,
|
|
"num_tokens": 37499091.0,
|
|
"step": 20335
|
|
},
|
|
{
|
|
"entropy": 5.639491558074951,
|
|
"epoch": 1.7088426801092207,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004708393702687644,
|
|
"loss": 5.4264,
|
|
"mean_token_accuracy": 0.1666134625673294,
|
|
"num_tokens": 37507882.0,
|
|
"step": 20340
|
|
},
|
|
{
|
|
"entropy": 5.662171506881714,
|
|
"epoch": 1.7092627599243855,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00047082462434495015,
|
|
"loss": 5.3894,
|
|
"mean_token_accuracy": 0.17504524290561677,
|
|
"num_tokens": 37517048.0,
|
|
"step": 20345
|
|
},
|
|
{
|
|
"entropy": 5.745312738418579,
|
|
"epoch": 1.7096828397395505,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004708098749522036,
|
|
"loss": 5.4333,
|
|
"mean_token_accuracy": 0.16021379381418227,
|
|
"num_tokens": 37526355.0,
|
|
"step": 20350
|
|
},
|
|
{
|
|
"entropy": 5.697979307174682,
|
|
"epoch": 1.7101029195547155,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004707951220907859,
|
|
"loss": 5.4629,
|
|
"mean_token_accuracy": 0.1664559945464134,
|
|
"num_tokens": 37535746.0,
|
|
"step": 20355
|
|
},
|
|
{
|
|
"entropy": 5.711132001876831,
|
|
"epoch": 1.7105229993698803,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004707803657609585,
|
|
"loss": 5.4243,
|
|
"mean_token_accuracy": 0.16239014863967896,
|
|
"num_tokens": 37546479.0,
|
|
"step": 20360
|
|
},
|
|
{
|
|
"entropy": 5.745557022094727,
|
|
"epoch": 1.710943079185045,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00047076560596298275,
|
|
"loss": 5.4748,
|
|
"mean_token_accuracy": 0.1672067642211914,
|
|
"num_tokens": 37556805.0,
|
|
"step": 20365
|
|
},
|
|
{
|
|
"entropy": 5.74319806098938,
|
|
"epoch": 1.71136315900021,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00047075084269712,
|
|
"loss": 5.4602,
|
|
"mean_token_accuracy": 0.1733308419585228,
|
|
"num_tokens": 37564748.0,
|
|
"step": 20370
|
|
},
|
|
{
|
|
"entropy": 5.585902261734009,
|
|
"epoch": 1.711783238815375,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004707360759636319,
|
|
"loss": 5.2641,
|
|
"mean_token_accuracy": 0.1821661874651909,
|
|
"num_tokens": 37574674.0,
|
|
"step": 20375
|
|
},
|
|
{
|
|
"entropy": 5.665689754486084,
|
|
"epoch": 1.7122033186305399,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047072130576278,
|
|
"loss": 5.3839,
|
|
"mean_token_accuracy": 0.17048663049936294,
|
|
"num_tokens": 37584459.0,
|
|
"step": 20380
|
|
},
|
|
{
|
|
"entropy": 5.67416934967041,
|
|
"epoch": 1.7126233984457047,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004707065320948259,
|
|
"loss": 5.4119,
|
|
"mean_token_accuracy": 0.17284180521965026,
|
|
"num_tokens": 37593570.0,
|
|
"step": 20385
|
|
},
|
|
{
|
|
"entropy": 5.661170578002929,
|
|
"epoch": 1.7130434782608694,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00047069175496003147,
|
|
"loss": 5.4147,
|
|
"mean_token_accuracy": 0.16955641210079192,
|
|
"num_tokens": 37603032.0,
|
|
"step": 20390
|
|
},
|
|
{
|
|
"entropy": 5.6446874141693115,
|
|
"epoch": 1.7134635580760345,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004706769743586583,
|
|
"loss": 5.3464,
|
|
"mean_token_accuracy": 0.1723109945654869,
|
|
"num_tokens": 37612404.0,
|
|
"step": 20395
|
|
},
|
|
{
|
|
"entropy": 5.636924123764038,
|
|
"epoch": 1.7138836378911995,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00047066219029096837,
|
|
"loss": 5.3658,
|
|
"mean_token_accuracy": 0.1704767942428589,
|
|
"num_tokens": 37621933.0,
|
|
"step": 20400
|
|
},
|
|
{
|
|
"entropy": 5.732251310348511,
|
|
"epoch": 1.7143037177063642,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004706474027572234,
|
|
"loss": 5.3965,
|
|
"mean_token_accuracy": 0.17179838567972183,
|
|
"num_tokens": 37632078.0,
|
|
"step": 20405
|
|
},
|
|
{
|
|
"entropy": 5.536679124832153,
|
|
"epoch": 1.714723797521529,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00047063261175768543,
|
|
"loss": 5.3053,
|
|
"mean_token_accuracy": 0.17315014004707335,
|
|
"num_tokens": 37641665.0,
|
|
"step": 20410
|
|
},
|
|
{
|
|
"entropy": 5.708039617538452,
|
|
"epoch": 1.7151438773366938,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00047061781729261656,
|
|
"loss": 5.3721,
|
|
"mean_token_accuracy": 0.1656670242547989,
|
|
"num_tokens": 37650751.0,
|
|
"step": 20415
|
|
},
|
|
{
|
|
"entropy": 5.628295135498047,
|
|
"epoch": 1.7155639571518588,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00047060301936227865,
|
|
"loss": 5.3617,
|
|
"mean_token_accuracy": 0.17506831139326096,
|
|
"num_tokens": 37659165.0,
|
|
"step": 20420
|
|
},
|
|
{
|
|
"entropy": 5.646256732940674,
|
|
"epoch": 1.7159840369670238,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004705882179669341,
|
|
"loss": 5.3544,
|
|
"mean_token_accuracy": 0.17101034224033357,
|
|
"num_tokens": 37668057.0,
|
|
"step": 20425
|
|
},
|
|
{
|
|
"entropy": 5.685383367538452,
|
|
"epoch": 1.7164041167821886,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004705734131068449,
|
|
"loss": 5.348,
|
|
"mean_token_accuracy": 0.16941767185926437,
|
|
"num_tokens": 37677674.0,
|
|
"step": 20430
|
|
},
|
|
{
|
|
"entropy": 5.602097034454346,
|
|
"epoch": 1.7168241965973534,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004705586047822734,
|
|
"loss": 5.3825,
|
|
"mean_token_accuracy": 0.17536012828350067,
|
|
"num_tokens": 37687009.0,
|
|
"step": 20435
|
|
},
|
|
{
|
|
"entropy": 5.629279613494873,
|
|
"epoch": 1.7172442764125184,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00047054379299348194,
|
|
"loss": 5.2569,
|
|
"mean_token_accuracy": 0.17184604406356813,
|
|
"num_tokens": 37696723.0,
|
|
"step": 20440
|
|
},
|
|
{
|
|
"entropy": 5.608349704742432,
|
|
"epoch": 1.7176643562276832,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00047052897774073295,
|
|
"loss": 5.3778,
|
|
"mean_token_accuracy": 0.17021260857582093,
|
|
"num_tokens": 37706560.0,
|
|
"step": 20445
|
|
},
|
|
{
|
|
"entropy": 5.6732789993286135,
|
|
"epoch": 1.7180844360428482,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00047051415902428875,
|
|
"loss": 5.3945,
|
|
"mean_token_accuracy": 0.1690693438053131,
|
|
"num_tokens": 37715176.0,
|
|
"step": 20450
|
|
},
|
|
{
|
|
"entropy": 5.639693117141723,
|
|
"epoch": 1.718504515858013,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004704993368444119,
|
|
"loss": 5.3816,
|
|
"mean_token_accuracy": 0.16992994248867035,
|
|
"num_tokens": 37723956.0,
|
|
"step": 20455
|
|
},
|
|
{
|
|
"entropy": 5.684892559051514,
|
|
"epoch": 1.7189245956731778,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004704845112013649,
|
|
"loss": 5.3845,
|
|
"mean_token_accuracy": 0.17116268277168273,
|
|
"num_tokens": 37733236.0,
|
|
"step": 20460
|
|
},
|
|
{
|
|
"entropy": 5.705689287185669,
|
|
"epoch": 1.7193446754883428,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004704696820954105,
|
|
"loss": 5.441,
|
|
"mean_token_accuracy": 0.16739957481622697,
|
|
"num_tokens": 37742626.0,
|
|
"step": 20465
|
|
},
|
|
{
|
|
"entropy": 5.627845668792725,
|
|
"epoch": 1.7197647553035078,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004704548495268113,
|
|
"loss": 5.3161,
|
|
"mean_token_accuracy": 0.1832030311226845,
|
|
"num_tokens": 37751854.0,
|
|
"step": 20470
|
|
},
|
|
{
|
|
"entropy": 5.622863864898681,
|
|
"epoch": 1.7201848351186726,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00047044001349583,
|
|
"loss": 5.3535,
|
|
"mean_token_accuracy": 0.16801756620407104,
|
|
"num_tokens": 37760993.0,
|
|
"step": 20475
|
|
},
|
|
{
|
|
"entropy": 5.647179698944091,
|
|
"epoch": 1.7206049149338374,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00047042517400272966,
|
|
"loss": 5.4368,
|
|
"mean_token_accuracy": 0.17291858792304993,
|
|
"num_tokens": 37771714.0,
|
|
"step": 20480
|
|
},
|
|
{
|
|
"entropy": 5.68183217048645,
|
|
"epoch": 1.7210249947490022,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004704103310477729,
|
|
"loss": 5.3595,
|
|
"mean_token_accuracy": 0.17705655097961426,
|
|
"num_tokens": 37780653.0,
|
|
"step": 20485
|
|
},
|
|
{
|
|
"entropy": 5.679630851745605,
|
|
"epoch": 1.7214450745641672,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004703954846312228,
|
|
"loss": 5.4293,
|
|
"mean_token_accuracy": 0.1708232581615448,
|
|
"num_tokens": 37790450.0,
|
|
"step": 20490
|
|
},
|
|
{
|
|
"entropy": 5.649170446395874,
|
|
"epoch": 1.7218651543793322,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004703806347533423,
|
|
"loss": 5.4189,
|
|
"mean_token_accuracy": 0.16525555849075318,
|
|
"num_tokens": 37800450.0,
|
|
"step": 20495
|
|
},
|
|
{
|
|
"entropy": 5.679272603988648,
|
|
"epoch": 1.722285234194497,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004703657814143945,
|
|
"loss": 5.4314,
|
|
"mean_token_accuracy": 0.16290275305509566,
|
|
"num_tokens": 37809261.0,
|
|
"step": 20500
|
|
},
|
|
{
|
|
"entropy": 5.642959403991699,
|
|
"epoch": 1.7227053140096618,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004703509246146424,
|
|
"loss": 5.2126,
|
|
"mean_token_accuracy": 0.17658686637878418,
|
|
"num_tokens": 37818244.0,
|
|
"step": 20505
|
|
},
|
|
{
|
|
"entropy": 5.628348398208618,
|
|
"epoch": 1.7231253938248268,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004703360643543493,
|
|
"loss": 5.3409,
|
|
"mean_token_accuracy": 0.17722394019365312,
|
|
"num_tokens": 37828555.0,
|
|
"step": 20510
|
|
},
|
|
{
|
|
"entropy": 5.563862991333008,
|
|
"epoch": 1.7235454736399916,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00047032120063377836,
|
|
"loss": 5.3109,
|
|
"mean_token_accuracy": 0.17544028162956238,
|
|
"num_tokens": 37837840.0,
|
|
"step": 20515
|
|
},
|
|
{
|
|
"entropy": 5.6779731750488285,
|
|
"epoch": 1.7239655534551566,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00047030633345319293,
|
|
"loss": 5.3616,
|
|
"mean_token_accuracy": 0.16884265542030336,
|
|
"num_tokens": 37846910.0,
|
|
"step": 20520
|
|
},
|
|
{
|
|
"entropy": 5.50715069770813,
|
|
"epoch": 1.7243856332703213,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00047029146281285647,
|
|
"loss": 5.2011,
|
|
"mean_token_accuracy": 0.1903439998626709,
|
|
"num_tokens": 37855642.0,
|
|
"step": 20525
|
|
},
|
|
{
|
|
"entropy": 5.667270755767822,
|
|
"epoch": 1.7248057130854861,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004702765887130322,
|
|
"loss": 5.3833,
|
|
"mean_token_accuracy": 0.17016415446996688,
|
|
"num_tokens": 37864439.0,
|
|
"step": 20530
|
|
},
|
|
{
|
|
"entropy": 5.749380970001221,
|
|
"epoch": 1.7252257929006511,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047026171115398377,
|
|
"loss": 5.4523,
|
|
"mean_token_accuracy": 0.1656784437596798,
|
|
"num_tokens": 37873801.0,
|
|
"step": 20535
|
|
},
|
|
{
|
|
"entropy": 5.5730626583099365,
|
|
"epoch": 1.7256458727158162,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0004702468301359746,
|
|
"loss": 5.3311,
|
|
"mean_token_accuracy": 0.17896921038627625,
|
|
"num_tokens": 37883915.0,
|
|
"step": 20540
|
|
},
|
|
{
|
|
"entropy": 5.662897348403931,
|
|
"epoch": 1.726065952530981,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004702319456592684,
|
|
"loss": 5.4436,
|
|
"mean_token_accuracy": 0.16838170140981673,
|
|
"num_tokens": 37894083.0,
|
|
"step": 20545
|
|
},
|
|
{
|
|
"entropy": 5.6904213428497314,
|
|
"epoch": 1.7264860323461457,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00047021705772412885,
|
|
"loss": 5.4377,
|
|
"mean_token_accuracy": 0.16888969093561174,
|
|
"num_tokens": 37902264.0,
|
|
"step": 20550
|
|
},
|
|
{
|
|
"entropy": 5.643442630767822,
|
|
"epoch": 1.7269061121613105,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00047020216633081964,
|
|
"loss": 5.341,
|
|
"mean_token_accuracy": 0.17365592420101167,
|
|
"num_tokens": 37911071.0,
|
|
"step": 20555
|
|
},
|
|
{
|
|
"entropy": 5.628277730941773,
|
|
"epoch": 1.7273261919764755,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00047018727147960453,
|
|
"loss": 5.4221,
|
|
"mean_token_accuracy": 0.16485991030931474,
|
|
"num_tokens": 37920048.0,
|
|
"step": 20560
|
|
},
|
|
{
|
|
"entropy": 5.698462057113647,
|
|
"epoch": 1.7277462717916405,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00047017237317074743,
|
|
"loss": 5.3894,
|
|
"mean_token_accuracy": 0.17410711497068404,
|
|
"num_tokens": 37928877.0,
|
|
"step": 20565
|
|
},
|
|
{
|
|
"entropy": 5.709046506881714,
|
|
"epoch": 1.7281663516068053,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004701574714045123,
|
|
"loss": 5.4051,
|
|
"mean_token_accuracy": 0.16448906511068345,
|
|
"num_tokens": 37937860.0,
|
|
"step": 20570
|
|
},
|
|
{
|
|
"entropy": 5.6509918689727785,
|
|
"epoch": 1.72858643142197,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00047014256618116304,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.1658877193927765,
|
|
"num_tokens": 37947588.0,
|
|
"step": 20575
|
|
},
|
|
{
|
|
"entropy": 5.638443422317505,
|
|
"epoch": 1.729006511237135,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00047012765750096365,
|
|
"loss": 5.3205,
|
|
"mean_token_accuracy": 0.1806677833199501,
|
|
"num_tokens": 37957598.0,
|
|
"step": 20580
|
|
},
|
|
{
|
|
"entropy": 5.641557359695435,
|
|
"epoch": 1.7294265910523,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00047011274536417827,
|
|
"loss": 5.3013,
|
|
"mean_token_accuracy": 0.17997593879699708,
|
|
"num_tokens": 37965294.0,
|
|
"step": 20585
|
|
},
|
|
{
|
|
"entropy": 5.590178346633911,
|
|
"epoch": 1.729846670867465,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00047009782977107113,
|
|
"loss": 5.3207,
|
|
"mean_token_accuracy": 0.1827242076396942,
|
|
"num_tokens": 37973977.0,
|
|
"step": 20590
|
|
},
|
|
{
|
|
"entropy": 5.771245050430298,
|
|
"epoch": 1.7302667506826297,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00047008291072190634,
|
|
"loss": 5.4798,
|
|
"mean_token_accuracy": 0.1616050750017166,
|
|
"num_tokens": 37984492.0,
|
|
"step": 20595
|
|
},
|
|
{
|
|
"entropy": 5.745875120162964,
|
|
"epoch": 1.7306868304977945,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004700679882169482,
|
|
"loss": 5.3922,
|
|
"mean_token_accuracy": 0.17045068442821504,
|
|
"num_tokens": 37994404.0,
|
|
"step": 20600
|
|
},
|
|
{
|
|
"entropy": 5.543208265304566,
|
|
"epoch": 1.7311069103129595,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004700530622564613,
|
|
"loss": 5.3057,
|
|
"mean_token_accuracy": 0.18024921864271165,
|
|
"num_tokens": 38002659.0,
|
|
"step": 20605
|
|
},
|
|
{
|
|
"entropy": 5.619626903533936,
|
|
"epoch": 1.7315269901281245,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004700381328407096,
|
|
"loss": 5.2932,
|
|
"mean_token_accuracy": 0.17747585326433182,
|
|
"num_tokens": 38012290.0,
|
|
"step": 20610
|
|
},
|
|
{
|
|
"entropy": 5.699101209640503,
|
|
"epoch": 1.7319470699432893,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004700231999699579,
|
|
"loss": 5.4263,
|
|
"mean_token_accuracy": 0.16802889853715897,
|
|
"num_tokens": 38022163.0,
|
|
"step": 20615
|
|
},
|
|
{
|
|
"entropy": 5.6378819942474365,
|
|
"epoch": 1.732367149758454,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004700082636444706,
|
|
"loss": 5.3703,
|
|
"mean_token_accuracy": 0.16259206235408782,
|
|
"num_tokens": 38031051.0,
|
|
"step": 20620
|
|
},
|
|
{
|
|
"entropy": 5.6816980838775635,
|
|
"epoch": 1.7327872295736189,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00046999332386451245,
|
|
"loss": 5.4231,
|
|
"mean_token_accuracy": 0.16787817180156708,
|
|
"num_tokens": 38040474.0,
|
|
"step": 20625
|
|
},
|
|
{
|
|
"entropy": 5.6875709056854244,
|
|
"epoch": 1.7332073093887839,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046997838063034784,
|
|
"loss": 5.3934,
|
|
"mean_token_accuracy": 0.1709348142147064,
|
|
"num_tokens": 38049620.0,
|
|
"step": 20630
|
|
},
|
|
{
|
|
"entropy": 5.58522481918335,
|
|
"epoch": 1.7336273892039489,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00046996343394224173,
|
|
"loss": 5.3489,
|
|
"mean_token_accuracy": 0.17310173362493514,
|
|
"num_tokens": 38059866.0,
|
|
"step": 20635
|
|
},
|
|
{
|
|
"entropy": 5.6092894077301025,
|
|
"epoch": 1.7340474690191137,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00046994848380045866,
|
|
"loss": 5.3227,
|
|
"mean_token_accuracy": 0.16931509375572204,
|
|
"num_tokens": 38068948.0,
|
|
"step": 20640
|
|
},
|
|
{
|
|
"entropy": 5.730233001708984,
|
|
"epoch": 1.7344675488342784,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00046993353020526366,
|
|
"loss": 5.5148,
|
|
"mean_token_accuracy": 0.17122802436351775,
|
|
"num_tokens": 38079239.0,
|
|
"step": 20645
|
|
},
|
|
{
|
|
"entropy": 5.666778707504273,
|
|
"epoch": 1.7348876286494432,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004699185731569215,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.17148027569055557,
|
|
"num_tokens": 38087999.0,
|
|
"step": 20650
|
|
},
|
|
{
|
|
"entropy": 5.669428873062134,
|
|
"epoch": 1.7353077084646082,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004699036126556972,
|
|
"loss": 5.3704,
|
|
"mean_token_accuracy": 0.17108169794082642,
|
|
"num_tokens": 38096586.0,
|
|
"step": 20655
|
|
},
|
|
{
|
|
"entropy": 5.577715730667114,
|
|
"epoch": 1.7357277882797733,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004698886487018558,
|
|
"loss": 5.3346,
|
|
"mean_token_accuracy": 0.1717136487364769,
|
|
"num_tokens": 38104766.0,
|
|
"step": 20660
|
|
},
|
|
{
|
|
"entropy": 5.62361216545105,
|
|
"epoch": 1.736147868094938,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004698736812956623,
|
|
"loss": 5.3684,
|
|
"mean_token_accuracy": 0.17202869206666946,
|
|
"num_tokens": 38113574.0,
|
|
"step": 20665
|
|
},
|
|
{
|
|
"entropy": 5.6415934562683105,
|
|
"epoch": 1.7365679479101028,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004698587104373819,
|
|
"loss": 5.3325,
|
|
"mean_token_accuracy": 0.16672066748142242,
|
|
"num_tokens": 38122513.0,
|
|
"step": 20670
|
|
},
|
|
{
|
|
"entropy": 5.526204442977905,
|
|
"epoch": 1.7369880277252678,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046984373612727975,
|
|
"loss": 5.3066,
|
|
"mean_token_accuracy": 0.16567323356866837,
|
|
"num_tokens": 38131105.0,
|
|
"step": 20675
|
|
},
|
|
{
|
|
"entropy": 5.6237061500549315,
|
|
"epoch": 1.7374081075404328,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00046982875836562116,
|
|
"loss": 5.3868,
|
|
"mean_token_accuracy": 0.16423740088939667,
|
|
"num_tokens": 38140106.0,
|
|
"step": 20680
|
|
},
|
|
{
|
|
"entropy": 5.661822700500489,
|
|
"epoch": 1.7378281873555976,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00046981377715267145,
|
|
"loss": 5.3491,
|
|
"mean_token_accuracy": 0.17514974921941756,
|
|
"num_tokens": 38149215.0,
|
|
"step": 20685
|
|
},
|
|
{
|
|
"entropy": 5.637057638168335,
|
|
"epoch": 1.7382482671707624,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000469798792488696,
|
|
"loss": 5.2793,
|
|
"mean_token_accuracy": 0.17926838994026184,
|
|
"num_tokens": 38157591.0,
|
|
"step": 20690
|
|
},
|
|
{
|
|
"entropy": 5.580015373229981,
|
|
"epoch": 1.7386683469859272,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004697838043739602,
|
|
"loss": 5.4022,
|
|
"mean_token_accuracy": 0.16713829338550568,
|
|
"num_tokens": 38167673.0,
|
|
"step": 20695
|
|
},
|
|
{
|
|
"entropy": 5.708221006393432,
|
|
"epoch": 1.7390884268010922,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00046976881280872974,
|
|
"loss": 5.368,
|
|
"mean_token_accuracy": 0.1714918613433838,
|
|
"num_tokens": 38177586.0,
|
|
"step": 20700
|
|
},
|
|
{
|
|
"entropy": 5.71192569732666,
|
|
"epoch": 1.7395085066162572,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004697538177932699,
|
|
"loss": 5.3698,
|
|
"mean_token_accuracy": 0.16908372268080712,
|
|
"num_tokens": 38187020.0,
|
|
"step": 20705
|
|
},
|
|
{
|
|
"entropy": 5.527950620651245,
|
|
"epoch": 1.739928586431422,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004697388193278465,
|
|
"loss": 5.1499,
|
|
"mean_token_accuracy": 0.1834670916199684,
|
|
"num_tokens": 38195705.0,
|
|
"step": 20710
|
|
},
|
|
{
|
|
"entropy": 5.576827144622802,
|
|
"epoch": 1.7403486662465868,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004697238174127252,
|
|
"loss": 5.2747,
|
|
"mean_token_accuracy": 0.1754479631781578,
|
|
"num_tokens": 38204726.0,
|
|
"step": 20715
|
|
},
|
|
{
|
|
"entropy": 5.591728734970093,
|
|
"epoch": 1.7407687460617516,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004697088120481717,
|
|
"loss": 5.3875,
|
|
"mean_token_accuracy": 0.16983902752399443,
|
|
"num_tokens": 38214376.0,
|
|
"step": 20720
|
|
},
|
|
{
|
|
"entropy": 5.610480928421021,
|
|
"epoch": 1.7411888258769166,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004696938032344519,
|
|
"loss": 5.298,
|
|
"mean_token_accuracy": 0.17367589026689528,
|
|
"num_tokens": 38223631.0,
|
|
"step": 20725
|
|
},
|
|
{
|
|
"entropy": 5.627554512023925,
|
|
"epoch": 1.7416089056920816,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004696787909718317,
|
|
"loss": 5.3183,
|
|
"mean_token_accuracy": 0.18182352632284166,
|
|
"num_tokens": 38233519.0,
|
|
"step": 20730
|
|
},
|
|
{
|
|
"entropy": 5.636379337310791,
|
|
"epoch": 1.7420289855072464,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00046966377526057686,
|
|
"loss": 5.2841,
|
|
"mean_token_accuracy": 0.1782074749469757,
|
|
"num_tokens": 38242340.0,
|
|
"step": 20735
|
|
},
|
|
{
|
|
"entropy": 5.582876539230346,
|
|
"epoch": 1.7424490653224112,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004696487561009535,
|
|
"loss": 5.2942,
|
|
"mean_token_accuracy": 0.17328224033117295,
|
|
"num_tokens": 38251194.0,
|
|
"step": 20740
|
|
},
|
|
{
|
|
"entropy": 5.637811088562012,
|
|
"epoch": 1.7428691451375762,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004696337334932277,
|
|
"loss": 5.3531,
|
|
"mean_token_accuracy": 0.17145794332027436,
|
|
"num_tokens": 38259938.0,
|
|
"step": 20745
|
|
},
|
|
{
|
|
"entropy": 5.654774141311646,
|
|
"epoch": 1.743289224952741,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00046961870743766546,
|
|
"loss": 5.3472,
|
|
"mean_token_accuracy": 0.17386607378721236,
|
|
"num_tokens": 38268073.0,
|
|
"step": 20750
|
|
},
|
|
{
|
|
"entropy": 5.666212892532348,
|
|
"epoch": 1.743709304767906,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00046960367793453313,
|
|
"loss": 5.4556,
|
|
"mean_token_accuracy": 0.16973720118403435,
|
|
"num_tokens": 38277667.0,
|
|
"step": 20755
|
|
},
|
|
{
|
|
"entropy": 5.710540676116944,
|
|
"epoch": 1.7441293845830708,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00046958864498409673,
|
|
"loss": 5.4055,
|
|
"mean_token_accuracy": 0.17234568446874618,
|
|
"num_tokens": 38287142.0,
|
|
"step": 20760
|
|
},
|
|
{
|
|
"entropy": 5.692324304580689,
|
|
"epoch": 1.7445494643982355,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00046957360858662276,
|
|
"loss": 5.3783,
|
|
"mean_token_accuracy": 0.17638524919748305,
|
|
"num_tokens": 38296199.0,
|
|
"step": 20765
|
|
},
|
|
{
|
|
"entropy": 5.645661878585815,
|
|
"epoch": 1.7449695442134006,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004695585687423775,
|
|
"loss": 5.3891,
|
|
"mean_token_accuracy": 0.17083698213100434,
|
|
"num_tokens": 38305412.0,
|
|
"step": 20770
|
|
},
|
|
{
|
|
"entropy": 5.592067527770996,
|
|
"epoch": 1.7453896240285656,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004695435254516273,
|
|
"loss": 5.3152,
|
|
"mean_token_accuracy": 0.18210890293121337,
|
|
"num_tokens": 38313898.0,
|
|
"step": 20775
|
|
},
|
|
{
|
|
"entropy": 5.671021890640259,
|
|
"epoch": 1.7458097038437304,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004695284787146388,
|
|
"loss": 5.4325,
|
|
"mean_token_accuracy": 0.1672999680042267,
|
|
"num_tokens": 38322835.0,
|
|
"step": 20780
|
|
},
|
|
{
|
|
"entropy": 5.610225439071655,
|
|
"epoch": 1.7462297836588951,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004695134285316784,
|
|
"loss": 5.2361,
|
|
"mean_token_accuracy": 0.18298912942409515,
|
|
"num_tokens": 38331448.0,
|
|
"step": 20785
|
|
},
|
|
{
|
|
"entropy": 5.620502758026123,
|
|
"epoch": 1.74664986347406,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00046949837490301293,
|
|
"loss": 5.3828,
|
|
"mean_token_accuracy": 0.16921331137418746,
|
|
"num_tokens": 38340837.0,
|
|
"step": 20790
|
|
},
|
|
{
|
|
"entropy": 5.626954984664917,
|
|
"epoch": 1.747069943289225,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004694833178289088,
|
|
"loss": 5.3406,
|
|
"mean_token_accuracy": 0.1766287937760353,
|
|
"num_tokens": 38349363.0,
|
|
"step": 20795
|
|
},
|
|
{
|
|
"entropy": 5.631927633285523,
|
|
"epoch": 1.74749002310439,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004694682573096328,
|
|
"loss": 5.376,
|
|
"mean_token_accuracy": 0.17368592023849488,
|
|
"num_tokens": 38358017.0,
|
|
"step": 20800
|
|
},
|
|
{
|
|
"entropy": 5.6352317333221436,
|
|
"epoch": 1.7479101029195547,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00046945319334545184,
|
|
"loss": 5.3588,
|
|
"mean_token_accuracy": 0.17234770804643632,
|
|
"num_tokens": 38367256.0,
|
|
"step": 20805
|
|
},
|
|
{
|
|
"entropy": 5.618623685836792,
|
|
"epoch": 1.7483301827347195,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004694381259366327,
|
|
"loss": 5.3577,
|
|
"mean_token_accuracy": 0.17468070536851882,
|
|
"num_tokens": 38376169.0,
|
|
"step": 20810
|
|
},
|
|
{
|
|
"entropy": 5.641800165176392,
|
|
"epoch": 1.7487502625498845,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046942305508344216,
|
|
"loss": 5.3273,
|
|
"mean_token_accuracy": 0.17379536628723144,
|
|
"num_tokens": 38385379.0,
|
|
"step": 20815
|
|
},
|
|
{
|
|
"entropy": 5.693554830551148,
|
|
"epoch": 1.7491703423650493,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004694079807861473,
|
|
"loss": 5.4342,
|
|
"mean_token_accuracy": 0.1681118994951248,
|
|
"num_tokens": 38395217.0,
|
|
"step": 20820
|
|
},
|
|
{
|
|
"entropy": 5.636894845962525,
|
|
"epoch": 1.7495904221802143,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004693929030450153,
|
|
"loss": 5.3247,
|
|
"mean_token_accuracy": 0.17704234570264815,
|
|
"num_tokens": 38404347.0,
|
|
"step": 20825
|
|
},
|
|
{
|
|
"entropy": 5.6810730457305905,
|
|
"epoch": 1.750010501995379,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00046937782186031303,
|
|
"loss": 5.3081,
|
|
"mean_token_accuracy": 0.1747249722480774,
|
|
"num_tokens": 38413394.0,
|
|
"step": 20830
|
|
},
|
|
{
|
|
"entropy": 5.676941013336181,
|
|
"epoch": 1.750430581810544,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004693627372323078,
|
|
"loss": 5.3446,
|
|
"mean_token_accuracy": 0.17214433401823043,
|
|
"num_tokens": 38422043.0,
|
|
"step": 20835
|
|
},
|
|
{
|
|
"entropy": 5.753418397903443,
|
|
"epoch": 1.750850661625709,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004693476491612667,
|
|
"loss": 5.5131,
|
|
"mean_token_accuracy": 0.1660939335823059,
|
|
"num_tokens": 38430792.0,
|
|
"step": 20840
|
|
},
|
|
{
|
|
"entropy": 5.56128044128418,
|
|
"epoch": 1.751270741440874,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004693325576474571,
|
|
"loss": 5.299,
|
|
"mean_token_accuracy": 0.17610928416252136,
|
|
"num_tokens": 38439105.0,
|
|
"step": 20845
|
|
},
|
|
{
|
|
"entropy": 5.644918298721313,
|
|
"epoch": 1.7516908212560387,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004693174626911463,
|
|
"loss": 5.3261,
|
|
"mean_token_accuracy": 0.1766454264521599,
|
|
"num_tokens": 38447944.0,
|
|
"step": 20850
|
|
},
|
|
{
|
|
"entropy": 5.628182983398437,
|
|
"epoch": 1.7521109010712035,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00046930236429260173,
|
|
"loss": 5.3694,
|
|
"mean_token_accuracy": 0.16761911809444427,
|
|
"num_tokens": 38457206.0,
|
|
"step": 20855
|
|
},
|
|
{
|
|
"entropy": 5.685393190383911,
|
|
"epoch": 1.7525309808863683,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004692872624520908,
|
|
"loss": 5.446,
|
|
"mean_token_accuracy": 0.16450470089912414,
|
|
"num_tokens": 38467085.0,
|
|
"step": 20860
|
|
},
|
|
{
|
|
"entropy": 5.687595844268799,
|
|
"epoch": 1.7529510607015333,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000469272157169881,
|
|
"loss": 5.2827,
|
|
"mean_token_accuracy": 0.17452918142080306,
|
|
"num_tokens": 38475970.0,
|
|
"step": 20865
|
|
},
|
|
{
|
|
"entropy": 5.637504386901855,
|
|
"epoch": 1.7533711405166983,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004692570484462401,
|
|
"loss": 5.4291,
|
|
"mean_token_accuracy": 0.17007501125335694,
|
|
"num_tokens": 38484579.0,
|
|
"step": 20870
|
|
},
|
|
{
|
|
"entropy": 5.683751344680786,
|
|
"epoch": 1.753791220331863,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046924193628143554,
|
|
"loss": 5.4706,
|
|
"mean_token_accuracy": 0.16491821259260178,
|
|
"num_tokens": 38495107.0,
|
|
"step": 20875
|
|
},
|
|
{
|
|
"entropy": 5.733100080490113,
|
|
"epoch": 1.7542113001470279,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00046922682067573516,
|
|
"loss": 5.455,
|
|
"mean_token_accuracy": 0.1720812901854515,
|
|
"num_tokens": 38505731.0,
|
|
"step": 20880
|
|
},
|
|
{
|
|
"entropy": 5.629334449768066,
|
|
"epoch": 1.7546313799621929,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00046921170162940657,
|
|
"loss": 5.3422,
|
|
"mean_token_accuracy": 0.1781423345208168,
|
|
"num_tokens": 38514483.0,
|
|
"step": 20885
|
|
},
|
|
{
|
|
"entropy": 5.6288830757141115,
|
|
"epoch": 1.7550514597773577,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00046919657914271774,
|
|
"loss": 5.2621,
|
|
"mean_token_accuracy": 0.18058374375104905,
|
|
"num_tokens": 38522953.0,
|
|
"step": 20890
|
|
},
|
|
{
|
|
"entropy": 5.567493963241577,
|
|
"epoch": 1.7554715395925227,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 0.0004691814532159365,
|
|
"loss": 5.2562,
|
|
"mean_token_accuracy": 0.18670934140682222,
|
|
"num_tokens": 38531891.0,
|
|
"step": 20895
|
|
},
|
|
{
|
|
"entropy": 5.650929737091064,
|
|
"epoch": 1.7558916194076875,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004691663238493308,
|
|
"loss": 5.431,
|
|
"mean_token_accuracy": 0.1708792820572853,
|
|
"num_tokens": 38541609.0,
|
|
"step": 20900
|
|
},
|
|
{
|
|
"entropy": 5.714797496795654,
|
|
"epoch": 1.7563116992228522,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004691511910431686,
|
|
"loss": 5.4352,
|
|
"mean_token_accuracy": 0.17311373427510263,
|
|
"num_tokens": 38550348.0,
|
|
"step": 20905
|
|
},
|
|
{
|
|
"entropy": 5.609110689163208,
|
|
"epoch": 1.7567317790380172,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004691360547977181,
|
|
"loss": 5.2661,
|
|
"mean_token_accuracy": 0.17832353860139846,
|
|
"num_tokens": 38559493.0,
|
|
"step": 20910
|
|
},
|
|
{
|
|
"entropy": 5.621959161758423,
|
|
"epoch": 1.7571518588531823,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004691209151132474,
|
|
"loss": 5.3231,
|
|
"mean_token_accuracy": 0.1581482857465744,
|
|
"num_tokens": 38567888.0,
|
|
"step": 20915
|
|
},
|
|
{
|
|
"entropy": 5.6945287704467775,
|
|
"epoch": 1.757571938668347,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004691057719900246,
|
|
"loss": 5.3927,
|
|
"mean_token_accuracy": 0.17266636341810226,
|
|
"num_tokens": 38577216.0,
|
|
"step": 20920
|
|
},
|
|
{
|
|
"entropy": 5.6431300163269045,
|
|
"epoch": 1.7579920184835118,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00046909062542831794,
|
|
"loss": 5.34,
|
|
"mean_token_accuracy": 0.17715939432382583,
|
|
"num_tokens": 38586258.0,
|
|
"step": 20925
|
|
},
|
|
{
|
|
"entropy": 5.642459106445313,
|
|
"epoch": 1.7584120982986766,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004690754754283959,
|
|
"loss": 5.2895,
|
|
"mean_token_accuracy": 0.17726175487041473,
|
|
"num_tokens": 38594900.0,
|
|
"step": 20930
|
|
},
|
|
{
|
|
"entropy": 5.594657468795776,
|
|
"epoch": 1.7588321781138416,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004690603219905266,
|
|
"loss": 5.3709,
|
|
"mean_token_accuracy": 0.171932390332222,
|
|
"num_tokens": 38603980.0,
|
|
"step": 20935
|
|
},
|
|
{
|
|
"entropy": 5.678670597076416,
|
|
"epoch": 1.7592522579290066,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00046904516511497873,
|
|
"loss": 5.4647,
|
|
"mean_token_accuracy": 0.16452773064374923,
|
|
"num_tokens": 38613888.0,
|
|
"step": 20940
|
|
},
|
|
{
|
|
"entropy": 5.754366111755371,
|
|
"epoch": 1.7596723377441714,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046903000480202065,
|
|
"loss": 5.3917,
|
|
"mean_token_accuracy": 0.1681995779275894,
|
|
"num_tokens": 38623969.0,
|
|
"step": 20945
|
|
},
|
|
{
|
|
"entropy": 5.62518458366394,
|
|
"epoch": 1.7600924175593362,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00046901484105192094,
|
|
"loss": 5.3453,
|
|
"mean_token_accuracy": 0.17296512126922609,
|
|
"num_tokens": 38633387.0,
|
|
"step": 20950
|
|
},
|
|
{
|
|
"entropy": 5.6430340766906735,
|
|
"epoch": 1.760512497374501,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046899967386494816,
|
|
"loss": 5.4,
|
|
"mean_token_accuracy": 0.16604579389095306,
|
|
"num_tokens": 38642481.0,
|
|
"step": 20955
|
|
},
|
|
{
|
|
"entropy": 5.687196922302246,
|
|
"epoch": 1.760932577189666,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004689845032413712,
|
|
"loss": 5.3981,
|
|
"mean_token_accuracy": 0.1664348542690277,
|
|
"num_tokens": 38652345.0,
|
|
"step": 20960
|
|
},
|
|
{
|
|
"entropy": 5.732553148269654,
|
|
"epoch": 1.761352657004831,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004689693291814586,
|
|
"loss": 5.4189,
|
|
"mean_token_accuracy": 0.16699230074882507,
|
|
"num_tokens": 38661529.0,
|
|
"step": 20965
|
|
},
|
|
{
|
|
"entropy": 5.602785253524781,
|
|
"epoch": 1.7617727368199958,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004689541516854791,
|
|
"loss": 5.3202,
|
|
"mean_token_accuracy": 0.17832910716533662,
|
|
"num_tokens": 38670191.0,
|
|
"step": 20970
|
|
},
|
|
{
|
|
"entropy": 5.621751117706299,
|
|
"epoch": 1.7621928166351606,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004689389707537018,
|
|
"loss": 5.4132,
|
|
"mean_token_accuracy": 0.16694632470607756,
|
|
"num_tokens": 38679089.0,
|
|
"step": 20975
|
|
},
|
|
{
|
|
"entropy": 5.660399055480957,
|
|
"epoch": 1.7626128964503256,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046892378638639545,
|
|
"loss": 5.3529,
|
|
"mean_token_accuracy": 0.1763218879699707,
|
|
"num_tokens": 38688821.0,
|
|
"step": 20980
|
|
},
|
|
{
|
|
"entropy": 5.709231901168823,
|
|
"epoch": 1.7630329762654906,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046890859858382913,
|
|
"loss": 5.4325,
|
|
"mean_token_accuracy": 0.16355552822351455,
|
|
"num_tokens": 38698232.0,
|
|
"step": 20985
|
|
},
|
|
{
|
|
"entropy": 5.778678321838379,
|
|
"epoch": 1.7634530560806554,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004688934073462718,
|
|
"loss": 5.5,
|
|
"mean_token_accuracy": 0.15904544815421104,
|
|
"num_tokens": 38708090.0,
|
|
"step": 20990
|
|
},
|
|
{
|
|
"entropy": 5.694181299209594,
|
|
"epoch": 1.7638731358958202,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00046887821267399256,
|
|
"loss": 5.4005,
|
|
"mean_token_accuracy": 0.17715791165828704,
|
|
"num_tokens": 38717370.0,
|
|
"step": 20995
|
|
},
|
|
{
|
|
"entropy": 5.667404508590698,
|
|
"epoch": 1.764293215710985,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004688630145672607,
|
|
"loss": 5.3688,
|
|
"mean_token_accuracy": 0.17490747272968293,
|
|
"num_tokens": 38726758.0,
|
|
"step": 21000
|
|
},
|
|
{
|
|
"epoch": 1.764293215710985,
|
|
"eval_entropy": 5.447259841907512,
|
|
"eval_loss": 5.429024696350098,
|
|
"eval_mean_token_accuracy": 0.17760649738501136,
|
|
"eval_num_tokens": 38726758.0,
|
|
"eval_runtime": 27.2768,
|
|
"eval_samples_per_second": 1369.882,
|
|
"eval_steps_per_second": 171.244,
|
|
"step": 21000
|
|
},
|
|
{
|
|
"entropy": 5.620334959030151,
|
|
"epoch": 1.76471329552615,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004688478130263453,
|
|
"loss": 5.3613,
|
|
"mean_token_accuracy": 0.1682727813720703,
|
|
"num_tokens": 38736180.0,
|
|
"step": 21005
|
|
},
|
|
{
|
|
"entropy": 5.655771541595459,
|
|
"epoch": 1.765133375341315,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004688326080515157,
|
|
"loss": 5.3121,
|
|
"mean_token_accuracy": 0.17605517357587813,
|
|
"num_tokens": 38744529.0,
|
|
"step": 21010
|
|
},
|
|
{
|
|
"entropy": 5.517810726165772,
|
|
"epoch": 1.7655534551564798,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00046881739964304127,
|
|
"loss": 5.2272,
|
|
"mean_token_accuracy": 0.18033822625875473,
|
|
"num_tokens": 38753434.0,
|
|
"step": 21015
|
|
},
|
|
{
|
|
"entropy": 5.597821426391602,
|
|
"epoch": 1.7659735349716446,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00046880218780119136,
|
|
"loss": 5.3471,
|
|
"mean_token_accuracy": 0.17827894389629365,
|
|
"num_tokens": 38762021.0,
|
|
"step": 21020
|
|
},
|
|
{
|
|
"entropy": 5.703983736038208,
|
|
"epoch": 1.7663936147868093,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004687869725262356,
|
|
"loss": 5.4687,
|
|
"mean_token_accuracy": 0.1671355977654457,
|
|
"num_tokens": 38771373.0,
|
|
"step": 21025
|
|
},
|
|
{
|
|
"entropy": 5.684408521652221,
|
|
"epoch": 1.7668136946019743,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004687717538184433,
|
|
"loss": 5.427,
|
|
"mean_token_accuracy": 0.1724289759993553,
|
|
"num_tokens": 38780388.0,
|
|
"step": 21030
|
|
},
|
|
{
|
|
"entropy": 5.61069803237915,
|
|
"epoch": 1.7672337744171394,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00046875653167808423,
|
|
"loss": 5.26,
|
|
"mean_token_accuracy": 0.1805383160710335,
|
|
"num_tokens": 38789285.0,
|
|
"step": 21035
|
|
},
|
|
{
|
|
"entropy": 5.5840356826782225,
|
|
"epoch": 1.7676538542323041,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00046874130610542796,
|
|
"loss": 5.3548,
|
|
"mean_token_accuracy": 0.17195963561534883,
|
|
"num_tokens": 38799321.0,
|
|
"step": 21040
|
|
},
|
|
{
|
|
"entropy": 5.69988784790039,
|
|
"epoch": 1.768073934047469,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004687260771007442,
|
|
"loss": 5.3414,
|
|
"mean_token_accuracy": 0.16847853660583495,
|
|
"num_tokens": 38808515.0,
|
|
"step": 21045
|
|
},
|
|
{
|
|
"entropy": 5.594510459899903,
|
|
"epoch": 1.768494013862634,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004687108446643027,
|
|
"loss": 5.3048,
|
|
"mean_token_accuracy": 0.17457255125045776,
|
|
"num_tokens": 38817634.0,
|
|
"step": 21050
|
|
},
|
|
{
|
|
"entropy": 5.740410614013672,
|
|
"epoch": 1.7689140936777987,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004686956087963734,
|
|
"loss": 5.5311,
|
|
"mean_token_accuracy": 0.1675797998905182,
|
|
"num_tokens": 38826766.0,
|
|
"step": 21055
|
|
},
|
|
{
|
|
"entropy": 5.64251217842102,
|
|
"epoch": 1.7693341734929637,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004686803694972261,
|
|
"loss": 5.2846,
|
|
"mean_token_accuracy": 0.17146496325731278,
|
|
"num_tokens": 38835942.0,
|
|
"step": 21060
|
|
},
|
|
{
|
|
"entropy": 5.637530994415283,
|
|
"epoch": 1.7697542533081285,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00046866512676713075,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.16483051627874373,
|
|
"num_tokens": 38845691.0,
|
|
"step": 21065
|
|
},
|
|
{
|
|
"entropy": 5.6026856899261475,
|
|
"epoch": 1.7701743331232933,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00046864988060635744,
|
|
"loss": 5.3686,
|
|
"mean_token_accuracy": 0.16854404360055925,
|
|
"num_tokens": 38855737.0,
|
|
"step": 21070
|
|
},
|
|
{
|
|
"entropy": 5.634773826599121,
|
|
"epoch": 1.7705944129384583,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004686346310151762,
|
|
"loss": 5.3817,
|
|
"mean_token_accuracy": 0.17483728677034377,
|
|
"num_tokens": 38864887.0,
|
|
"step": 21075
|
|
},
|
|
{
|
|
"entropy": 5.710461950302124,
|
|
"epoch": 1.7710144927536233,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046861937799385717,
|
|
"loss": 5.3603,
|
|
"mean_token_accuracy": 0.1777254104614258,
|
|
"num_tokens": 38873924.0,
|
|
"step": 21080
|
|
},
|
|
{
|
|
"entropy": 5.648996734619141,
|
|
"epoch": 1.7714345725687881,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004686041215426706,
|
|
"loss": 5.4071,
|
|
"mean_token_accuracy": 0.1716112896800041,
|
|
"num_tokens": 38883447.0,
|
|
"step": 21085
|
|
},
|
|
{
|
|
"entropy": 5.647649192810059,
|
|
"epoch": 1.771854652383953,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004685888616618867,
|
|
"loss": 5.393,
|
|
"mean_token_accuracy": 0.17345526367425917,
|
|
"num_tokens": 38892389.0,
|
|
"step": 21090
|
|
},
|
|
{
|
|
"entropy": 5.688521671295166,
|
|
"epoch": 1.7722747321991177,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046857359835177575,
|
|
"loss": 5.4408,
|
|
"mean_token_accuracy": 0.16651444435119628,
|
|
"num_tokens": 38901574.0,
|
|
"step": 21095
|
|
},
|
|
{
|
|
"entropy": 5.710891914367676,
|
|
"epoch": 1.7726948120142827,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00046855833161260825,
|
|
"loss": 5.4205,
|
|
"mean_token_accuracy": 0.1721094399690628,
|
|
"num_tokens": 38910070.0,
|
|
"step": 21100
|
|
},
|
|
{
|
|
"entropy": 5.6420543670654295,
|
|
"epoch": 1.7731148918294477,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004685430614446545,
|
|
"loss": 5.3168,
|
|
"mean_token_accuracy": 0.17222830057144164,
|
|
"num_tokens": 38919868.0,
|
|
"step": 21105
|
|
},
|
|
{
|
|
"entropy": 5.66776967048645,
|
|
"epoch": 1.7735349716446125,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004685277878481852,
|
|
"loss": 5.3784,
|
|
"mean_token_accuracy": 0.16582091450691222,
|
|
"num_tokens": 38928840.0,
|
|
"step": 21110
|
|
},
|
|
{
|
|
"entropy": 5.662716102600098,
|
|
"epoch": 1.7739550514597773,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046851251082347063,
|
|
"loss": 5.455,
|
|
"mean_token_accuracy": 0.166241654753685,
|
|
"num_tokens": 38938112.0,
|
|
"step": 21115
|
|
},
|
|
{
|
|
"entropy": 5.7086883068084715,
|
|
"epoch": 1.7743751312749423,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004684972303707816,
|
|
"loss": 5.3755,
|
|
"mean_token_accuracy": 0.1721594288945198,
|
|
"num_tokens": 38947463.0,
|
|
"step": 21120
|
|
},
|
|
{
|
|
"entropy": 5.71903281211853,
|
|
"epoch": 1.774795211090107,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004684819464903888,
|
|
"loss": 5.5394,
|
|
"mean_token_accuracy": 0.16309396475553511,
|
|
"num_tokens": 38957221.0,
|
|
"step": 21125
|
|
},
|
|
{
|
|
"entropy": 5.59781403541565,
|
|
"epoch": 1.775215290905272,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000468466659182563,
|
|
"loss": 5.2735,
|
|
"mean_token_accuracy": 0.1779392898082733,
|
|
"num_tokens": 38966656.0,
|
|
"step": 21130
|
|
},
|
|
{
|
|
"entropy": 5.600368213653565,
|
|
"epoch": 1.7756353707204369,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004684513684475749,
|
|
"loss": 5.28,
|
|
"mean_token_accuracy": 0.17913274914026261,
|
|
"num_tokens": 38975281.0,
|
|
"step": 21135
|
|
},
|
|
{
|
|
"entropy": 5.711512088775635,
|
|
"epoch": 1.7760554505356017,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046843607428569546,
|
|
"loss": 5.4295,
|
|
"mean_token_accuracy": 0.17240019291639327,
|
|
"num_tokens": 38985147.0,
|
|
"step": 21140
|
|
},
|
|
{
|
|
"entropy": 5.63455982208252,
|
|
"epoch": 1.7764755303507667,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00046842077669719554,
|
|
"loss": 5.2079,
|
|
"mean_token_accuracy": 0.1831870675086975,
|
|
"num_tokens": 38994104.0,
|
|
"step": 21145
|
|
},
|
|
{
|
|
"entropy": 5.631388187408447,
|
|
"epoch": 1.7768956101659317,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046840547568234613,
|
|
"loss": 5.4063,
|
|
"mean_token_accuracy": 0.1688321650028229,
|
|
"num_tokens": 39003983.0,
|
|
"step": 21150
|
|
},
|
|
{
|
|
"entropy": 5.6240592956542965,
|
|
"epoch": 1.7773156899810965,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046839017124141835,
|
|
"loss": 5.3136,
|
|
"mean_token_accuracy": 0.17558915317058563,
|
|
"num_tokens": 39012636.0,
|
|
"step": 21155
|
|
},
|
|
{
|
|
"entropy": 5.648619031906128,
|
|
"epoch": 1.7777357697962612,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00046837486337468335,
|
|
"loss": 5.4367,
|
|
"mean_token_accuracy": 0.16739535629749297,
|
|
"num_tokens": 39022173.0,
|
|
"step": 21160
|
|
},
|
|
{
|
|
"entropy": 5.746690368652343,
|
|
"epoch": 1.778155849611426,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000468359552082412,
|
|
"loss": 5.4438,
|
|
"mean_token_accuracy": 0.16601206958293915,
|
|
"num_tokens": 39032651.0,
|
|
"step": 21165
|
|
},
|
|
{
|
|
"entropy": 5.650369501113891,
|
|
"epoch": 1.778575929426591,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004683442373648759,
|
|
"loss": 5.3624,
|
|
"mean_token_accuracy": 0.16653727144002914,
|
|
"num_tokens": 39041543.0,
|
|
"step": 21170
|
|
},
|
|
{
|
|
"entropy": 5.610308504104614,
|
|
"epoch": 1.778996009241756,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004683289192223462,
|
|
"loss": 5.336,
|
|
"mean_token_accuracy": 0.17248573154211044,
|
|
"num_tokens": 39050467.0,
|
|
"step": 21175
|
|
},
|
|
{
|
|
"entropy": 5.684257221221924,
|
|
"epoch": 1.7794160890569208,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00046831359765509424,
|
|
"loss": 5.3996,
|
|
"mean_token_accuracy": 0.16482697874307634,
|
|
"num_tokens": 39059224.0,
|
|
"step": 21180
|
|
},
|
|
{
|
|
"entropy": 5.65388126373291,
|
|
"epoch": 1.7798361688720856,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046829827266339134,
|
|
"loss": 5.4226,
|
|
"mean_token_accuracy": 0.17068626284599303,
|
|
"num_tokens": 39067884.0,
|
|
"step": 21185
|
|
},
|
|
{
|
|
"entropy": 5.682791662216187,
|
|
"epoch": 1.7802562486872506,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00046828294424750916,
|
|
"loss": 5.3776,
|
|
"mean_token_accuracy": 0.1663289338350296,
|
|
"num_tokens": 39076774.0,
|
|
"step": 21190
|
|
},
|
|
{
|
|
"entropy": 5.6760657787322994,
|
|
"epoch": 1.7806763285024154,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004682676124077192,
|
|
"loss": 5.312,
|
|
"mean_token_accuracy": 0.17417764961719512,
|
|
"num_tokens": 39086021.0,
|
|
"step": 21195
|
|
},
|
|
{
|
|
"entropy": 5.687941169738769,
|
|
"epoch": 1.7810964083175804,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046825227714429287,
|
|
"loss": 5.3043,
|
|
"mean_token_accuracy": 0.17446549832820893,
|
|
"num_tokens": 39095682.0,
|
|
"step": 21200
|
|
},
|
|
{
|
|
"entropy": 5.591732406616211,
|
|
"epoch": 1.7815164881327452,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046823693845750205,
|
|
"loss": 5.3381,
|
|
"mean_token_accuracy": 0.17597149461507797,
|
|
"num_tokens": 39104904.0,
|
|
"step": 21205
|
|
},
|
|
{
|
|
"entropy": 5.669663190841675,
|
|
"epoch": 1.78193656794791,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00046822159634761837,
|
|
"loss": 5.4867,
|
|
"mean_token_accuracy": 0.16566276848316192,
|
|
"num_tokens": 39113128.0,
|
|
"step": 21210
|
|
},
|
|
{
|
|
"entropy": 5.5798241138458256,
|
|
"epoch": 1.782356647763075,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004682062508149136,
|
|
"loss": 5.3373,
|
|
"mean_token_accuracy": 0.17040848433971406,
|
|
"num_tokens": 39122503.0,
|
|
"step": 21215
|
|
},
|
|
{
|
|
"entropy": 5.65609712600708,
|
|
"epoch": 1.78277672757824,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004681909018596595,
|
|
"loss": 5.3367,
|
|
"mean_token_accuracy": 0.17275859266519547,
|
|
"num_tokens": 39132020.0,
|
|
"step": 21220
|
|
},
|
|
{
|
|
"entropy": 5.674847936630249,
|
|
"epoch": 1.7831968073934048,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00046817554948212813,
|
|
"loss": 5.3719,
|
|
"mean_token_accuracy": 0.17056983560323716,
|
|
"num_tokens": 39141542.0,
|
|
"step": 21225
|
|
},
|
|
{
|
|
"entropy": 5.6901304721832275,
|
|
"epoch": 1.7836168872085696,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00046816019368259136,
|
|
"loss": 5.3959,
|
|
"mean_token_accuracy": 0.1733367383480072,
|
|
"num_tokens": 39151573.0,
|
|
"step": 21230
|
|
},
|
|
{
|
|
"entropy": 5.597964191436768,
|
|
"epoch": 1.7840369670237344,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004681448344613212,
|
|
"loss": 5.3772,
|
|
"mean_token_accuracy": 0.18744425475597382,
|
|
"num_tokens": 39160023.0,
|
|
"step": 21235
|
|
},
|
|
{
|
|
"entropy": 5.585873651504516,
|
|
"epoch": 1.7844570468388994,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00046812947181858986,
|
|
"loss": 5.3522,
|
|
"mean_token_accuracy": 0.17375268936157226,
|
|
"num_tokens": 39169335.0,
|
|
"step": 21240
|
|
},
|
|
{
|
|
"entropy": 5.7382103443145756,
|
|
"epoch": 1.7848771266540644,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004681141057546693,
|
|
"loss": 5.4522,
|
|
"mean_token_accuracy": 0.1610276386141777,
|
|
"num_tokens": 39177953.0,
|
|
"step": 21245
|
|
},
|
|
{
|
|
"entropy": 5.6841898441314695,
|
|
"epoch": 1.7852972064692292,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00046809873626983174,
|
|
"loss": 5.3873,
|
|
"mean_token_accuracy": 0.16958157420158387,
|
|
"num_tokens": 39188984.0,
|
|
"step": 21250
|
|
},
|
|
{
|
|
"entropy": 5.650718355178833,
|
|
"epoch": 1.785717286284394,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00046808336336434946,
|
|
"loss": 5.354,
|
|
"mean_token_accuracy": 0.1693144455552101,
|
|
"num_tokens": 39198033.0,
|
|
"step": 21255
|
|
},
|
|
{
|
|
"entropy": 5.640344429016113,
|
|
"epoch": 1.7861373660995588,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046806798703849495,
|
|
"loss": 5.3114,
|
|
"mean_token_accuracy": 0.17812950164079666,
|
|
"num_tokens": 39207429.0,
|
|
"step": 21260
|
|
},
|
|
{
|
|
"entropy": 5.668394279479981,
|
|
"epoch": 1.7865574459147238,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004680526072925404,
|
|
"loss": 5.3638,
|
|
"mean_token_accuracy": 0.17503189891576768,
|
|
"num_tokens": 39216484.0,
|
|
"step": 21265
|
|
},
|
|
{
|
|
"entropy": 5.751850509643555,
|
|
"epoch": 1.7869775257298888,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046803722412675836,
|
|
"loss": 5.4421,
|
|
"mean_token_accuracy": 0.16722988039255143,
|
|
"num_tokens": 39226385.0,
|
|
"step": 21270
|
|
},
|
|
{
|
|
"entropy": 5.659515428543091,
|
|
"epoch": 1.7873976055450536,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00046802183754142125,
|
|
"loss": 5.355,
|
|
"mean_token_accuracy": 0.17532113194465637,
|
|
"num_tokens": 39235424.0,
|
|
"step": 21275
|
|
},
|
|
{
|
|
"entropy": 5.615523481369019,
|
|
"epoch": 1.7878176853602183,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004680064475368017,
|
|
"loss": 5.334,
|
|
"mean_token_accuracy": 0.17136083245277406,
|
|
"num_tokens": 39244109.0,
|
|
"step": 21280
|
|
},
|
|
{
|
|
"entropy": 5.638778781890869,
|
|
"epoch": 1.7882377651753834,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00046799105411317234,
|
|
"loss": 5.3672,
|
|
"mean_token_accuracy": 0.18030614107847215,
|
|
"num_tokens": 39253685.0,
|
|
"step": 21285
|
|
},
|
|
{
|
|
"entropy": 5.636345529556275,
|
|
"epoch": 1.7886578449905484,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00046797565727080585,
|
|
"loss": 5.3363,
|
|
"mean_token_accuracy": 0.1694641187787056,
|
|
"num_tokens": 39262743.0,
|
|
"step": 21290
|
|
},
|
|
{
|
|
"entropy": 5.595978879928589,
|
|
"epoch": 1.7890779248057131,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046796025700997484,
|
|
"loss": 5.2617,
|
|
"mean_token_accuracy": 0.1859144985675812,
|
|
"num_tokens": 39270962.0,
|
|
"step": 21295
|
|
},
|
|
{
|
|
"entropy": 5.629796028137207,
|
|
"epoch": 1.789498004620878,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004679448533309523,
|
|
"loss": 5.357,
|
|
"mean_token_accuracy": 0.1806061625480652,
|
|
"num_tokens": 39279994.0,
|
|
"step": 21300
|
|
},
|
|
{
|
|
"entropy": 5.648918485641479,
|
|
"epoch": 1.7899180844360427,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046792944623401107,
|
|
"loss": 5.3957,
|
|
"mean_token_accuracy": 0.17086594551801682,
|
|
"num_tokens": 39289481.0,
|
|
"step": 21305
|
|
},
|
|
{
|
|
"entropy": 5.726909589767456,
|
|
"epoch": 1.7903381642512077,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046791403571942405,
|
|
"loss": 5.4798,
|
|
"mean_token_accuracy": 0.16001774370670319,
|
|
"num_tokens": 39298383.0,
|
|
"step": 21310
|
|
},
|
|
{
|
|
"entropy": 5.628692245483398,
|
|
"epoch": 1.7907582440663727,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004678986217874642,
|
|
"loss": 5.3709,
|
|
"mean_token_accuracy": 0.17079650610685349,
|
|
"num_tokens": 39307809.0,
|
|
"step": 21315
|
|
},
|
|
{
|
|
"entropy": 5.601756286621094,
|
|
"epoch": 1.7911783238815375,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046788320443840457,
|
|
"loss": 5.2556,
|
|
"mean_token_accuracy": 0.18573263436555862,
|
|
"num_tokens": 39316332.0,
|
|
"step": 21320
|
|
},
|
|
{
|
|
"entropy": 5.617982006072998,
|
|
"epoch": 1.7915984036967023,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00046786778367251833,
|
|
"loss": 5.292,
|
|
"mean_token_accuracy": 0.17370064407587052,
|
|
"num_tokens": 39325672.0,
|
|
"step": 21325
|
|
},
|
|
{
|
|
"entropy": 5.591927242279053,
|
|
"epoch": 1.792018483511867,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00046785235949007854,
|
|
"loss": 5.3672,
|
|
"mean_token_accuracy": 0.1754762977361679,
|
|
"num_tokens": 39334478.0,
|
|
"step": 21330
|
|
},
|
|
{
|
|
"entropy": 5.4915365219116214,
|
|
"epoch": 1.792438563327032,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046783693189135863,
|
|
"loss": 5.2474,
|
|
"mean_token_accuracy": 0.17552462220191956,
|
|
"num_tokens": 39343573.0,
|
|
"step": 21335
|
|
},
|
|
{
|
|
"entropy": 5.642029523849487,
|
|
"epoch": 1.7928586431421971,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046782150087663167,
|
|
"loss": 5.3067,
|
|
"mean_token_accuracy": 0.18337966054677962,
|
|
"num_tokens": 39351956.0,
|
|
"step": 21340
|
|
},
|
|
{
|
|
"entropy": 5.699854373931885,
|
|
"epoch": 1.793278722957362,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004678060664461711,
|
|
"loss": 5.4656,
|
|
"mean_token_accuracy": 0.16409681141376495,
|
|
"num_tokens": 39361911.0,
|
|
"step": 21345
|
|
},
|
|
{
|
|
"entropy": 5.689766883850098,
|
|
"epoch": 1.7936988027725267,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004677906286002504,
|
|
"loss": 5.3918,
|
|
"mean_token_accuracy": 0.1700123593211174,
|
|
"num_tokens": 39370916.0,
|
|
"step": 21350
|
|
},
|
|
{
|
|
"entropy": 5.681654787063598,
|
|
"epoch": 1.7941188825876917,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004677751873391429,
|
|
"loss": 5.4125,
|
|
"mean_token_accuracy": 0.16728848665952684,
|
|
"num_tokens": 39380662.0,
|
|
"step": 21355
|
|
},
|
|
{
|
|
"entropy": 5.632915210723877,
|
|
"epoch": 1.7945389624028567,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046775974266312234,
|
|
"loss": 5.3231,
|
|
"mean_token_accuracy": 0.18093785941600798,
|
|
"num_tokens": 39389644.0,
|
|
"step": 21360
|
|
},
|
|
{
|
|
"entropy": 5.627950620651245,
|
|
"epoch": 1.7949590422180215,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00046774429457246215,
|
|
"loss": 5.317,
|
|
"mean_token_accuracy": 0.1713301122188568,
|
|
"num_tokens": 39398662.0,
|
|
"step": 21365
|
|
},
|
|
{
|
|
"entropy": 5.665298891067505,
|
|
"epoch": 1.7953791220331863,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000467728843067436,
|
|
"loss": 5.4089,
|
|
"mean_token_accuracy": 0.17280863374471664,
|
|
"num_tokens": 39408064.0,
|
|
"step": 21370
|
|
},
|
|
{
|
|
"entropy": 5.68812518119812,
|
|
"epoch": 1.795799201848351,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004677133881483177,
|
|
"loss": 5.4316,
|
|
"mean_token_accuracy": 0.16077583879232407,
|
|
"num_tokens": 39418991.0,
|
|
"step": 21375
|
|
},
|
|
{
|
|
"entropy": 5.6052967548370365,
|
|
"epoch": 1.796219281663516,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004676979298153809,
|
|
"loss": 5.2948,
|
|
"mean_token_accuracy": 0.17317767292261124,
|
|
"num_tokens": 39428707.0,
|
|
"step": 21380
|
|
},
|
|
{
|
|
"entropy": 5.718463802337647,
|
|
"epoch": 1.796639361478681,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004676824680688996,
|
|
"loss": 5.4489,
|
|
"mean_token_accuracy": 0.17518044412136077,
|
|
"num_tokens": 39437173.0,
|
|
"step": 21385
|
|
},
|
|
{
|
|
"entropy": 5.70597095489502,
|
|
"epoch": 1.7970594412938459,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046766700290914743,
|
|
"loss": 5.3734,
|
|
"mean_token_accuracy": 0.16496011465787888,
|
|
"num_tokens": 39446336.0,
|
|
"step": 21390
|
|
},
|
|
{
|
|
"entropy": 5.687495326995849,
|
|
"epoch": 1.7974795211090107,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046765153433639856,
|
|
"loss": 5.5444,
|
|
"mean_token_accuracy": 0.16359457075595857,
|
|
"num_tokens": 39456129.0,
|
|
"step": 21395
|
|
},
|
|
{
|
|
"entropy": 5.662699794769287,
|
|
"epoch": 1.7978996009241754,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046763606235092705,
|
|
"loss": 5.3918,
|
|
"mean_token_accuracy": 0.173219533264637,
|
|
"num_tokens": 39465386.0,
|
|
"step": 21400
|
|
},
|
|
{
|
|
"entropy": 5.6809672832489015,
|
|
"epoch": 1.7983196807393405,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004676205869530068,
|
|
"loss": 5.4419,
|
|
"mean_token_accuracy": 0.17025604397058486,
|
|
"num_tokens": 39475085.0,
|
|
"step": 21405
|
|
},
|
|
{
|
|
"entropy": 5.678685855865479,
|
|
"epoch": 1.7987397605545055,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00046760510814291206,
|
|
"loss": 5.4574,
|
|
"mean_token_accuracy": 0.16565362811088563,
|
|
"num_tokens": 39484500.0,
|
|
"step": 21410
|
|
},
|
|
{
|
|
"entropy": 5.675810527801514,
|
|
"epoch": 1.7991598403696702,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000467589625920917,
|
|
"loss": 5.3463,
|
|
"mean_token_accuracy": 0.17084126621484758,
|
|
"num_tokens": 39494049.0,
|
|
"step": 21415
|
|
},
|
|
{
|
|
"entropy": 5.617605352401734,
|
|
"epoch": 1.799579920184835,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000467574140287296,
|
|
"loss": 5.3515,
|
|
"mean_token_accuracy": 0.17164410948753356,
|
|
"num_tokens": 39502874.0,
|
|
"step": 21420
|
|
},
|
|
{
|
|
"entropy": 5.603321266174317,
|
|
"epoch": 1.8,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004675586512423231,
|
|
"loss": 5.3848,
|
|
"mean_token_accuracy": 0.16502818018198012,
|
|
"num_tokens": 39512371.0,
|
|
"step": 21425
|
|
},
|
|
{
|
|
"entropy": 5.67237401008606,
|
|
"epoch": 1.8004200798151648,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000467543158786273,
|
|
"loss": 5.4078,
|
|
"mean_token_accuracy": 0.1716100186109543,
|
|
"num_tokens": 39521477.0,
|
|
"step": 21430
|
|
},
|
|
{
|
|
"entropy": 5.677791595458984,
|
|
"epoch": 1.8008401596303298,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046752766291941985,
|
|
"loss": 5.418,
|
|
"mean_token_accuracy": 0.1607919916510582,
|
|
"num_tokens": 39530072.0,
|
|
"step": 21435
|
|
},
|
|
{
|
|
"entropy": 5.66340913772583,
|
|
"epoch": 1.8012602394454946,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004675121636420383,
|
|
"loss": 5.3903,
|
|
"mean_token_accuracy": 0.16702970415353774,
|
|
"num_tokens": 39540762.0,
|
|
"step": 21440
|
|
},
|
|
{
|
|
"entropy": 5.696033906936646,
|
|
"epoch": 1.8016803192606594,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000467496660954403,
|
|
"loss": 5.4174,
|
|
"mean_token_accuracy": 0.16265884339809417,
|
|
"num_tokens": 39549699.0,
|
|
"step": 21445
|
|
},
|
|
{
|
|
"entropy": 5.677773523330688,
|
|
"epoch": 1.8021003990758244,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00046748115485678837,
|
|
"loss": 5.4414,
|
|
"mean_token_accuracy": 0.1688990116119385,
|
|
"num_tokens": 39558725.0,
|
|
"step": 21450
|
|
},
|
|
{
|
|
"entropy": 5.60676121711731,
|
|
"epoch": 1.8025204788909894,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00046746564534946926,
|
|
"loss": 5.2994,
|
|
"mean_token_accuracy": 0.17619529366493225,
|
|
"num_tokens": 39567357.0,
|
|
"step": 21455
|
|
},
|
|
{
|
|
"entropy": 5.635344457626343,
|
|
"epoch": 1.8029405587061542,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004674501324327203,
|
|
"loss": 5.2869,
|
|
"mean_token_accuracy": 0.17789805233478545,
|
|
"num_tokens": 39576147.0,
|
|
"step": 21460
|
|
},
|
|
{
|
|
"entropy": 5.669049167633057,
|
|
"epoch": 1.803360638521319,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046743461610681636,
|
|
"loss": 5.4369,
|
|
"mean_token_accuracy": 0.17405525892972945,
|
|
"num_tokens": 39584963.0,
|
|
"step": 21465
|
|
},
|
|
{
|
|
"entropy": 5.590832757949829,
|
|
"epoch": 1.8037807183364838,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004674190963720323,
|
|
"loss": 5.2983,
|
|
"mean_token_accuracy": 0.17730980813503266,
|
|
"num_tokens": 39594420.0,
|
|
"step": 21470
|
|
},
|
|
{
|
|
"entropy": 5.597025918960571,
|
|
"epoch": 1.8042007981516488,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000467403573228643,
|
|
"loss": 5.394,
|
|
"mean_token_accuracy": 0.16461124569177626,
|
|
"num_tokens": 39603276.0,
|
|
"step": 21475
|
|
},
|
|
{
|
|
"entropy": 5.600082731246948,
|
|
"epoch": 1.8046208779668138,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004673880466769235,
|
|
"loss": 5.4545,
|
|
"mean_token_accuracy": 0.16378810703754426,
|
|
"num_tokens": 39613161.0,
|
|
"step": 21480
|
|
},
|
|
{
|
|
"entropy": 5.631666612625122,
|
|
"epoch": 1.8050409577819786,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00046737251671714886,
|
|
"loss": 5.3009,
|
|
"mean_token_accuracy": 0.17678849697113036,
|
|
"num_tokens": 39621889.0,
|
|
"step": 21485
|
|
},
|
|
{
|
|
"entropy": 5.751594495773316,
|
|
"epoch": 1.8054610375971434,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046735698334959407,
|
|
"loss": 5.4888,
|
|
"mean_token_accuracy": 0.17027620673179628,
|
|
"num_tokens": 39632009.0,
|
|
"step": 21490
|
|
},
|
|
{
|
|
"entropy": 5.7419802188873295,
|
|
"epoch": 1.8058811174123084,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00046734144657453443,
|
|
"loss": 5.3736,
|
|
"mean_token_accuracy": 0.17260289043188096,
|
|
"num_tokens": 39640639.0,
|
|
"step": 21495
|
|
},
|
|
{
|
|
"entropy": 5.6267815113067625,
|
|
"epoch": 1.8063011972274732,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00046732590639224505,
|
|
"loss": 5.394,
|
|
"mean_token_accuracy": 0.17710949927568437,
|
|
"num_tokens": 39649837.0,
|
|
"step": 21500
|
|
},
|
|
{
|
|
"entropy": 5.635099458694458,
|
|
"epoch": 1.8067212770426382,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046731036280300126,
|
|
"loss": 5.4226,
|
|
"mean_token_accuracy": 0.17313309758901596,
|
|
"num_tokens": 39659890.0,
|
|
"step": 21505
|
|
},
|
|
{
|
|
"entropy": 5.6682960987091064,
|
|
"epoch": 1.807141356857803,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00046729481580707846,
|
|
"loss": 5.3342,
|
|
"mean_token_accuracy": 0.17116763591766357,
|
|
"num_tokens": 39669550.0,
|
|
"step": 21510
|
|
},
|
|
{
|
|
"entropy": 5.6409660339355465,
|
|
"epoch": 1.8075614366729678,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00046727926540475207,
|
|
"loss": 5.3313,
|
|
"mean_token_accuracy": 0.16743680387735366,
|
|
"num_tokens": 39678471.0,
|
|
"step": 21515
|
|
},
|
|
{
|
|
"entropy": 5.514544820785522,
|
|
"epoch": 1.8079815164881328,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004672637115962974,
|
|
"loss": 5.2649,
|
|
"mean_token_accuracy": 0.17956244349479675,
|
|
"num_tokens": 39686600.0,
|
|
"step": 21520
|
|
},
|
|
{
|
|
"entropy": 5.597471857070923,
|
|
"epoch": 1.8084015963032978,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046724815438199007,
|
|
"loss": 5.3991,
|
|
"mean_token_accuracy": 0.1686672165989876,
|
|
"num_tokens": 39696848.0,
|
|
"step": 21525
|
|
},
|
|
{
|
|
"entropy": 5.61803035736084,
|
|
"epoch": 1.8088216761184626,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00046723259376210577,
|
|
"loss": 5.335,
|
|
"mean_token_accuracy": 0.17923670560121535,
|
|
"num_tokens": 39706051.0,
|
|
"step": 21530
|
|
},
|
|
{
|
|
"entropy": 5.691323709487915,
|
|
"epoch": 1.8092417559336273,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046721702973692,
|
|
"loss": 5.3996,
|
|
"mean_token_accuracy": 0.16498573273420333,
|
|
"num_tokens": 39716035.0,
|
|
"step": 21535
|
|
},
|
|
{
|
|
"entropy": 5.6498912334442135,
|
|
"epoch": 1.8096618357487921,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00046720146230670853,
|
|
"loss": 5.3763,
|
|
"mean_token_accuracy": 0.16898033916950225,
|
|
"num_tokens": 39725717.0,
|
|
"step": 21540
|
|
},
|
|
{
|
|
"entropy": 5.623174715042114,
|
|
"epoch": 1.8100819155639571,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004671858914717471,
|
|
"loss": 5.3948,
|
|
"mean_token_accuracy": 0.16846336126327516,
|
|
"num_tokens": 39734543.0,
|
|
"step": 21545
|
|
},
|
|
{
|
|
"entropy": 5.647709131240845,
|
|
"epoch": 1.8105019953791222,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.00046717031723231164,
|
|
"loss": 5.4131,
|
|
"mean_token_accuracy": 0.17280775755643846,
|
|
"num_tokens": 39744503.0,
|
|
"step": 21550
|
|
},
|
|
{
|
|
"entropy": 5.638943243026733,
|
|
"epoch": 1.810922075194287,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004671547395886779,
|
|
"loss": 5.3921,
|
|
"mean_token_accuracy": 0.16712662130594252,
|
|
"num_tokens": 39753484.0,
|
|
"step": 21555
|
|
},
|
|
{
|
|
"entropy": 5.610015249252319,
|
|
"epoch": 1.8113421550094517,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004671391585411219,
|
|
"loss": 5.3029,
|
|
"mean_token_accuracy": 0.1781844601035118,
|
|
"num_tokens": 39762673.0,
|
|
"step": 21560
|
|
},
|
|
{
|
|
"entropy": 5.645753812789917,
|
|
"epoch": 1.8117622348246165,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046712357408991965,
|
|
"loss": 5.4587,
|
|
"mean_token_accuracy": 0.16241314709186555,
|
|
"num_tokens": 39773138.0,
|
|
"step": 21565
|
|
},
|
|
{
|
|
"entropy": 5.722913217544556,
|
|
"epoch": 1.8121823146397815,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004671079862353472,
|
|
"loss": 5.4498,
|
|
"mean_token_accuracy": 0.168387970328331,
|
|
"num_tokens": 39782282.0,
|
|
"step": 21570
|
|
},
|
|
{
|
|
"entropy": 5.623279857635498,
|
|
"epoch": 1.8126023944549465,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046709239497768067,
|
|
"loss": 5.3519,
|
|
"mean_token_accuracy": 0.1776757076382637,
|
|
"num_tokens": 39792035.0,
|
|
"step": 21575
|
|
},
|
|
{
|
|
"entropy": 5.724744987487793,
|
|
"epoch": 1.8130224742701113,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046707680031719633,
|
|
"loss": 5.4498,
|
|
"mean_token_accuracy": 0.16576552540063857,
|
|
"num_tokens": 39801696.0,
|
|
"step": 21580
|
|
},
|
|
{
|
|
"entropy": 5.743741226196289,
|
|
"epoch": 1.813442554085276,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004670612022541705,
|
|
"loss": 5.4751,
|
|
"mean_token_accuracy": 0.16882607191801072,
|
|
"num_tokens": 39811449.0,
|
|
"step": 21585
|
|
},
|
|
{
|
|
"entropy": 5.6892838954925535,
|
|
"epoch": 1.8138626339004411,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004670456007888792,
|
|
"loss": 5.4313,
|
|
"mean_token_accuracy": 0.16952537894248962,
|
|
"num_tokens": 39820339.0,
|
|
"step": 21590
|
|
},
|
|
{
|
|
"entropy": 5.61853666305542,
|
|
"epoch": 1.8142827137156061,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004670299959215989,
|
|
"loss": 5.3599,
|
|
"mean_token_accuracy": 0.17586547285318374,
|
|
"num_tokens": 39829861.0,
|
|
"step": 21595
|
|
},
|
|
{
|
|
"entropy": 5.645865345001221,
|
|
"epoch": 1.814702793530771,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004670143876526062,
|
|
"loss": 5.3182,
|
|
"mean_token_accuracy": 0.17726973295211793,
|
|
"num_tokens": 39838568.0,
|
|
"step": 21600
|
|
},
|
|
{
|
|
"entropy": 5.625951051712036,
|
|
"epoch": 1.8151228733459357,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046699877598217754,
|
|
"loss": 5.3338,
|
|
"mean_token_accuracy": 0.1771585986018181,
|
|
"num_tokens": 39847705.0,
|
|
"step": 21605
|
|
},
|
|
{
|
|
"entropy": 5.651676988601684,
|
|
"epoch": 1.8155429531611005,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046698316091058946,
|
|
"loss": 5.4239,
|
|
"mean_token_accuracy": 0.16565542817115783,
|
|
"num_tokens": 39856700.0,
|
|
"step": 21610
|
|
},
|
|
{
|
|
"entropy": 5.672735357284546,
|
|
"epoch": 1.8159630329762655,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00046696754243811845,
|
|
"loss": 5.3138,
|
|
"mean_token_accuracy": 0.17889431715011597,
|
|
"num_tokens": 39865647.0,
|
|
"step": 21615
|
|
},
|
|
{
|
|
"entropy": 5.7311060428619385,
|
|
"epoch": 1.8163831127914305,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004669519205650413,
|
|
"loss": 5.4334,
|
|
"mean_token_accuracy": 0.1666131630539894,
|
|
"num_tokens": 39874705.0,
|
|
"step": 21620
|
|
},
|
|
{
|
|
"entropy": 5.6093682765960695,
|
|
"epoch": 1.8168031926065953,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00046693629529163467,
|
|
"loss": 5.2633,
|
|
"mean_token_accuracy": 0.17741246819496154,
|
|
"num_tokens": 39883795.0,
|
|
"step": 21625
|
|
},
|
|
{
|
|
"entropy": 5.648662471771241,
|
|
"epoch": 1.81722327242176,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004669206666181755,
|
|
"loss": 5.3502,
|
|
"mean_token_accuracy": 0.17730291932821274,
|
|
"num_tokens": 39893165.0,
|
|
"step": 21630
|
|
},
|
|
{
|
|
"entropy": 5.58931975364685,
|
|
"epoch": 1.8176433522369249,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004669050345449404,
|
|
"loss": 5.3901,
|
|
"mean_token_accuracy": 0.17008297443389891,
|
|
"num_tokens": 39902241.0,
|
|
"step": 21635
|
|
},
|
|
{
|
|
"entropy": 5.598491477966308,
|
|
"epoch": 1.8180634320520899,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004668893990722066,
|
|
"loss": 5.3486,
|
|
"mean_token_accuracy": 0.1675383910536766,
|
|
"num_tokens": 39911211.0,
|
|
"step": 21640
|
|
},
|
|
{
|
|
"entropy": 5.629481792449951,
|
|
"epoch": 1.8184835118672549,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004668737602002508,
|
|
"loss": 5.3409,
|
|
"mean_token_accuracy": 0.17022158205509186,
|
|
"num_tokens": 39920192.0,
|
|
"step": 21645
|
|
},
|
|
{
|
|
"entropy": 5.676052808761597,
|
|
"epoch": 1.8189035916824197,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046685811792935016,
|
|
"loss": 5.3769,
|
|
"mean_token_accuracy": 0.1712314024567604,
|
|
"num_tokens": 39929169.0,
|
|
"step": 21650
|
|
},
|
|
{
|
|
"entropy": 5.674107933044434,
|
|
"epoch": 1.8193236714975844,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046684247225978176,
|
|
"loss": 5.393,
|
|
"mean_token_accuracy": 0.1656157284975052,
|
|
"num_tokens": 39939333.0,
|
|
"step": 21655
|
|
},
|
|
{
|
|
"entropy": 5.604327821731568,
|
|
"epoch": 1.8197437513127495,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046682682319182275,
|
|
"loss": 5.3847,
|
|
"mean_token_accuracy": 0.17021397948265077,
|
|
"num_tokens": 39948042.0,
|
|
"step": 21660
|
|
},
|
|
{
|
|
"entropy": 5.622416305541992,
|
|
"epoch": 1.8201638311279145,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046681117072575035,
|
|
"loss": 5.3134,
|
|
"mean_token_accuracy": 0.1767050787806511,
|
|
"num_tokens": 39956847.0,
|
|
"step": 21665
|
|
},
|
|
{
|
|
"entropy": 5.790766382217408,
|
|
"epoch": 1.8205839109430793,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004667955148618418,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.15933856070041658,
|
|
"num_tokens": 39966598.0,
|
|
"step": 21670
|
|
},
|
|
{
|
|
"entropy": 5.597654056549072,
|
|
"epoch": 1.821003990758244,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004667798556003745,
|
|
"loss": 5.2301,
|
|
"mean_token_accuracy": 0.1689037188887596,
|
|
"num_tokens": 39975236.0,
|
|
"step": 21675
|
|
},
|
|
{
|
|
"entropy": 5.608970832824707,
|
|
"epoch": 1.8214240705734088,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004667641929416258,
|
|
"loss": 5.3879,
|
|
"mean_token_accuracy": 0.17176640927791595,
|
|
"num_tokens": 39984582.0,
|
|
"step": 21680
|
|
},
|
|
{
|
|
"entropy": 5.623990154266357,
|
|
"epoch": 1.8218441503885738,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004667485268858731,
|
|
"loss": 5.3783,
|
|
"mean_token_accuracy": 0.17252393662929535,
|
|
"num_tokens": 39993122.0,
|
|
"step": 21685
|
|
},
|
|
{
|
|
"entropy": 5.652155160903931,
|
|
"epoch": 1.8222642302037388,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00046673285743339406,
|
|
"loss": 5.3438,
|
|
"mean_token_accuracy": 0.1751272648572922,
|
|
"num_tokens": 40002974.0,
|
|
"step": 21690
|
|
},
|
|
{
|
|
"entropy": 5.646127367019654,
|
|
"epoch": 1.8226843100189036,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046671718458446616,
|
|
"loss": 5.3852,
|
|
"mean_token_accuracy": 0.17070560306310653,
|
|
"num_tokens": 40011790.0,
|
|
"step": 21695
|
|
},
|
|
{
|
|
"entropy": 5.713921976089478,
|
|
"epoch": 1.8231043898340684,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004667015083393671,
|
|
"loss": 5.3966,
|
|
"mean_token_accuracy": 0.17200501561164855,
|
|
"num_tokens": 40021327.0,
|
|
"step": 21700
|
|
},
|
|
{
|
|
"entropy": 5.636666631698608,
|
|
"epoch": 1.8235244696492332,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004666858286983744,
|
|
"loss": 5.3929,
|
|
"mean_token_accuracy": 0.17091110199689866,
|
|
"num_tokens": 40030471.0,
|
|
"step": 21705
|
|
},
|
|
{
|
|
"entropy": 5.674646282196045,
|
|
"epoch": 1.8239445494643982,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004666701456617661,
|
|
"loss": 5.3948,
|
|
"mean_token_accuracy": 0.1720045655965805,
|
|
"num_tokens": 40039305.0,
|
|
"step": 21710
|
|
},
|
|
{
|
|
"entropy": 5.640139770507813,
|
|
"epoch": 1.8243646292795632,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046665445922981975,
|
|
"loss": 5.3103,
|
|
"mean_token_accuracy": 0.17814622223377227,
|
|
"num_tokens": 40047389.0,
|
|
"step": 21715
|
|
},
|
|
{
|
|
"entropy": 5.697626256942749,
|
|
"epoch": 1.824784709094728,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004666387694028134,
|
|
"loss": 5.3839,
|
|
"mean_token_accuracy": 0.17282926440238952,
|
|
"num_tokens": 40057640.0,
|
|
"step": 21720
|
|
},
|
|
{
|
|
"entropy": 5.588255500793457,
|
|
"epoch": 1.8252047889098928,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004666230761810249,
|
|
"loss": 5.3463,
|
|
"mean_token_accuracy": 0.1746961608529091,
|
|
"num_tokens": 40066770.0,
|
|
"step": 21725
|
|
},
|
|
{
|
|
"entropy": 5.5960245609283445,
|
|
"epoch": 1.8256248687250578,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004666073795647323,
|
|
"loss": 5.3288,
|
|
"mean_token_accuracy": 0.17138356268405913,
|
|
"num_tokens": 40075902.0,
|
|
"step": 21730
|
|
},
|
|
{
|
|
"entropy": 5.595876836776734,
|
|
"epoch": 1.8260449485402226,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046659167955421366,
|
|
"loss": 5.367,
|
|
"mean_token_accuracy": 0.16742293983697892,
|
|
"num_tokens": 40084945.0,
|
|
"step": 21735
|
|
},
|
|
{
|
|
"entropy": 5.574261808395386,
|
|
"epoch": 1.8264650283553876,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000466575976149747,
|
|
"loss": 5.2742,
|
|
"mean_token_accuracy": 0.1774066463112831,
|
|
"num_tokens": 40095104.0,
|
|
"step": 21740
|
|
},
|
|
{
|
|
"entropy": 5.708527374267578,
|
|
"epoch": 1.8268851081705524,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004665602693516106,
|
|
"loss": 5.4188,
|
|
"mean_token_accuracy": 0.17146946042776107,
|
|
"num_tokens": 40105329.0,
|
|
"step": 21745
|
|
},
|
|
{
|
|
"entropy": 5.589442586898803,
|
|
"epoch": 1.8273051879857172,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004665445591600827,
|
|
"loss": 5.2376,
|
|
"mean_token_accuracy": 0.18168216943740845,
|
|
"num_tokens": 40114555.0,
|
|
"step": 21750
|
|
},
|
|
{
|
|
"entropy": 5.624778461456299,
|
|
"epoch": 1.8277252678008822,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004665288455754415,
|
|
"loss": 5.2822,
|
|
"mean_token_accuracy": 0.18185721337795258,
|
|
"num_tokens": 40123314.0,
|
|
"step": 21755
|
|
},
|
|
{
|
|
"entropy": 5.613637542724609,
|
|
"epoch": 1.8281453476160472,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004665131285979655,
|
|
"loss": 5.3253,
|
|
"mean_token_accuracy": 0.17175379693508147,
|
|
"num_tokens": 40132483.0,
|
|
"step": 21760
|
|
},
|
|
{
|
|
"entropy": 5.63437066078186,
|
|
"epoch": 1.828565427431212,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00046649740822793303,
|
|
"loss": 5.355,
|
|
"mean_token_accuracy": 0.17184757441282272,
|
|
"num_tokens": 40141800.0,
|
|
"step": 21765
|
|
},
|
|
{
|
|
"entropy": 5.688283252716064,
|
|
"epoch": 1.8289855072463768,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004664816844656225,
|
|
"loss": 5.3415,
|
|
"mean_token_accuracy": 0.18133000284433365,
|
|
"num_tokens": 40149892.0,
|
|
"step": 21770
|
|
},
|
|
{
|
|
"entropy": 5.625762462615967,
|
|
"epoch": 1.8294055870615415,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00046646595731131263,
|
|
"loss": 5.3221,
|
|
"mean_token_accuracy": 0.17174559831619263,
|
|
"num_tokens": 40159376.0,
|
|
"step": 21775
|
|
},
|
|
{
|
|
"entropy": 5.614609718322754,
|
|
"epoch": 1.8298256668767066,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004664502267652819,
|
|
"loss": 5.2827,
|
|
"mean_token_accuracy": 0.17843341827392578,
|
|
"num_tokens": 40168497.0,
|
|
"step": 21780
|
|
},
|
|
{
|
|
"entropy": 5.674923658370972,
|
|
"epoch": 1.8302457466918716,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046643449282780894,
|
|
"loss": 5.3782,
|
|
"mean_token_accuracy": 0.16659992337226867,
|
|
"num_tokens": 40177432.0,
|
|
"step": 21785
|
|
},
|
|
{
|
|
"entropy": 5.654786205291748,
|
|
"epoch": 1.8306658265070364,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004664187554991725,
|
|
"loss": 5.2698,
|
|
"mean_token_accuracy": 0.17321840226650237,
|
|
"num_tokens": 40186582.0,
|
|
"step": 21790
|
|
},
|
|
{
|
|
"entropy": 5.637910079956055,
|
|
"epoch": 1.8310859063222011,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004664030147796514,
|
|
"loss": 5.3276,
|
|
"mean_token_accuracy": 0.17326397448778152,
|
|
"num_tokens": 40196094.0,
|
|
"step": 21795
|
|
},
|
|
{
|
|
"entropy": 5.5886390686035154,
|
|
"epoch": 1.8315059861373661,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004663872706695244,
|
|
"loss": 5.3779,
|
|
"mean_token_accuracy": 0.17434979230165482,
|
|
"num_tokens": 40205239.0,
|
|
"step": 21800
|
|
},
|
|
{
|
|
"entropy": 5.641726875305176,
|
|
"epoch": 1.831926065952531,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004663715231690706,
|
|
"loss": 5.4406,
|
|
"mean_token_accuracy": 0.1751034140586853,
|
|
"num_tokens": 40213908.0,
|
|
"step": 21805
|
|
},
|
|
{
|
|
"entropy": 5.709264898300171,
|
|
"epoch": 1.832346145767696,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046635577227856873,
|
|
"loss": 5.4025,
|
|
"mean_token_accuracy": 0.17268626689910888,
|
|
"num_tokens": 40223370.0,
|
|
"step": 21810
|
|
},
|
|
{
|
|
"entropy": 5.7190502166748045,
|
|
"epoch": 1.8327662255828607,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004663400179982978,
|
|
"loss": 5.487,
|
|
"mean_token_accuracy": 0.1673346996307373,
|
|
"num_tokens": 40233934.0,
|
|
"step": 21815
|
|
},
|
|
{
|
|
"entropy": 5.70527868270874,
|
|
"epoch": 1.8331863053980255,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00046632426032853705,
|
|
"loss": 5.366,
|
|
"mean_token_accuracy": 0.16772017180919646,
|
|
"num_tokens": 40244335.0,
|
|
"step": 21820
|
|
},
|
|
{
|
|
"entropy": 5.570014429092407,
|
|
"epoch": 1.8336063852131905,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046630849926956555,
|
|
"loss": 5.3147,
|
|
"mean_token_accuracy": 0.1714258924126625,
|
|
"num_tokens": 40254354.0,
|
|
"step": 21825
|
|
},
|
|
{
|
|
"entropy": 5.6582074642181395,
|
|
"epoch": 1.8340264650283555,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046629273482166244,
|
|
"loss": 5.3156,
|
|
"mean_token_accuracy": 0.17588206827640535,
|
|
"num_tokens": 40262748.0,
|
|
"step": 21830
|
|
},
|
|
{
|
|
"entropy": 5.697437191009522,
|
|
"epoch": 1.8344465448435203,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00046627696698510706,
|
|
"loss": 5.4048,
|
|
"mean_token_accuracy": 0.17420812398195268,
|
|
"num_tokens": 40271818.0,
|
|
"step": 21835
|
|
},
|
|
{
|
|
"entropy": 5.632059001922608,
|
|
"epoch": 1.834866624658685,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004662611957601788,
|
|
"loss": 5.4213,
|
|
"mean_token_accuracy": 0.16834606230258942,
|
|
"num_tokens": 40280552.0,
|
|
"step": 21840
|
|
},
|
|
{
|
|
"entropy": 5.690567255020142,
|
|
"epoch": 1.83528670447385,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.00046624542114715687,
|
|
"loss": 5.3115,
|
|
"mean_token_accuracy": 0.1798562154173851,
|
|
"num_tokens": 40289368.0,
|
|
"step": 21845
|
|
},
|
|
{
|
|
"entropy": 5.7882728099823,
|
|
"epoch": 1.835706784289015,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004662296431463208,
|
|
"loss": 5.5121,
|
|
"mean_token_accuracy": 0.1584714248776436,
|
|
"num_tokens": 40298884.0,
|
|
"step": 21850
|
|
},
|
|
{
|
|
"entropy": 5.69461989402771,
|
|
"epoch": 1.83612686410418,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046621386175795,
|
|
"loss": 5.4196,
|
|
"mean_token_accuracy": 0.16526482701301576,
|
|
"num_tokens": 40307886.0,
|
|
"step": 21855
|
|
},
|
|
{
|
|
"entropy": 5.614387512207031,
|
|
"epoch": 1.8365469439193447,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00046619807698232413,
|
|
"loss": 5.3323,
|
|
"mean_token_accuracy": 0.16994198113679887,
|
|
"num_tokens": 40317688.0,
|
|
"step": 21860
|
|
},
|
|
{
|
|
"entropy": 5.691216659545899,
|
|
"epoch": 1.8369670237345095,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004661822888197228,
|
|
"loss": 5.391,
|
|
"mean_token_accuracy": 0.1661013074219227,
|
|
"num_tokens": 40327630.0,
|
|
"step": 21865
|
|
},
|
|
{
|
|
"entropy": 5.651998567581177,
|
|
"epoch": 1.8373871035496743,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00046616649727042564,
|
|
"loss": 5.3661,
|
|
"mean_token_accuracy": 0.17099616825580596,
|
|
"num_tokens": 40336613.0,
|
|
"step": 21870
|
|
},
|
|
{
|
|
"entropy": 5.646777057647705,
|
|
"epoch": 1.8378071833648393,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00046615070233471244,
|
|
"loss": 5.4562,
|
|
"mean_token_accuracy": 0.1672051414847374,
|
|
"num_tokens": 40346582.0,
|
|
"step": 21875
|
|
},
|
|
{
|
|
"entropy": 5.751259517669678,
|
|
"epoch": 1.8382272631800043,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046613490401286304,
|
|
"loss": 5.4752,
|
|
"mean_token_accuracy": 0.1641298934817314,
|
|
"num_tokens": 40355960.0,
|
|
"step": 21880
|
|
},
|
|
{
|
|
"entropy": 5.773221445083618,
|
|
"epoch": 1.838647342995169,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046611910230515716,
|
|
"loss": 5.3246,
|
|
"mean_token_accuracy": 0.17324539572000502,
|
|
"num_tokens": 40366043.0,
|
|
"step": 21885
|
|
},
|
|
{
|
|
"entropy": 5.6297935962677,
|
|
"epoch": 1.8390674228103339,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004661032972118748,
|
|
"loss": 5.3792,
|
|
"mean_token_accuracy": 0.1739889457821846,
|
|
"num_tokens": 40374919.0,
|
|
"step": 21890
|
|
},
|
|
{
|
|
"entropy": 5.586809396743774,
|
|
"epoch": 1.8394875026254989,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046608748873329587,
|
|
"loss": 5.3512,
|
|
"mean_token_accuracy": 0.17698893696069717,
|
|
"num_tokens": 40383415.0,
|
|
"step": 21895
|
|
},
|
|
{
|
|
"entropy": 5.741325616836548,
|
|
"epoch": 1.8399075824406639,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004660716768697005,
|
|
"loss": 5.3999,
|
|
"mean_token_accuracy": 0.16888994574546815,
|
|
"num_tokens": 40392252.0,
|
|
"step": 21900
|
|
},
|
|
{
|
|
"entropy": 5.614504766464234,
|
|
"epoch": 1.8403276622558287,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004660558616213689,
|
|
"loss": 5.2419,
|
|
"mean_token_accuracy": 0.1856852650642395,
|
|
"num_tokens": 40400717.0,
|
|
"step": 21905
|
|
},
|
|
{
|
|
"entropy": 5.57713942527771,
|
|
"epoch": 1.8407477420709935,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046604004298858093,
|
|
"loss": 5.2895,
|
|
"mean_token_accuracy": 0.18077120929956436,
|
|
"num_tokens": 40409236.0,
|
|
"step": 21910
|
|
},
|
|
{
|
|
"entropy": 5.577246189117432,
|
|
"epoch": 1.8411678218861582,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004660242209716171,
|
|
"loss": 5.3133,
|
|
"mean_token_accuracy": 0.17522037625312806,
|
|
"num_tokens": 40419073.0,
|
|
"step": 21915
|
|
},
|
|
{
|
|
"entropy": 5.727301597595215,
|
|
"epoch": 1.8415879017013232,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004660083955707575,
|
|
"loss": 5.4115,
|
|
"mean_token_accuracy": 0.1722704529762268,
|
|
"num_tokens": 40428427.0,
|
|
"step": 21920
|
|
},
|
|
{
|
|
"entropy": 5.678405237197876,
|
|
"epoch": 1.8420079815164883,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004659925667862825,
|
|
"loss": 5.3801,
|
|
"mean_token_accuracy": 0.17540597915649414,
|
|
"num_tokens": 40437350.0,
|
|
"step": 21925
|
|
},
|
|
{
|
|
"entropy": 5.644293546676636,
|
|
"epoch": 1.842428061331653,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004659767346184725,
|
|
"loss": 5.3908,
|
|
"mean_token_accuracy": 0.17217467427253724,
|
|
"num_tokens": 40446059.0,
|
|
"step": 21930
|
|
},
|
|
{
|
|
"entropy": 5.648314523696899,
|
|
"epoch": 1.8428481411468178,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00046596089906760803,
|
|
"loss": 5.3842,
|
|
"mean_token_accuracy": 0.17176232039928435,
|
|
"num_tokens": 40454959.0,
|
|
"step": 21935
|
|
},
|
|
{
|
|
"entropy": 5.662789678573608,
|
|
"epoch": 1.8432682209619826,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004659450601339696,
|
|
"loss": 5.3968,
|
|
"mean_token_accuracy": 0.17346233129501343,
|
|
"num_tokens": 40464202.0,
|
|
"step": 21940
|
|
},
|
|
{
|
|
"entropy": 5.638479089736938,
|
|
"epoch": 1.8436883007771476,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004659292178178377,
|
|
"loss": 5.3427,
|
|
"mean_token_accuracy": 0.1746865801513195,
|
|
"num_tokens": 40473331.0,
|
|
"step": 21945
|
|
},
|
|
{
|
|
"entropy": 5.65547399520874,
|
|
"epoch": 1.8441083805923126,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000465913372119493,
|
|
"loss": 5.3089,
|
|
"mean_token_accuracy": 0.17193447351455687,
|
|
"num_tokens": 40482098.0,
|
|
"step": 21950
|
|
},
|
|
{
|
|
"entropy": 5.674790859222412,
|
|
"epoch": 1.8445284604074774,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004658975230392162,
|
|
"loss": 5.3536,
|
|
"mean_token_accuracy": 0.178443942964077,
|
|
"num_tokens": 40491134.0,
|
|
"step": 21955
|
|
},
|
|
{
|
|
"entropy": 5.706801652908325,
|
|
"epoch": 1.8449485402226422,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004658816705772882,
|
|
"loss": 5.4789,
|
|
"mean_token_accuracy": 0.16973639875650406,
|
|
"num_tokens": 40501488.0,
|
|
"step": 21960
|
|
},
|
|
{
|
|
"entropy": 5.581787538528443,
|
|
"epoch": 1.8453686200378072,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004658658147339896,
|
|
"loss": 5.2266,
|
|
"mean_token_accuracy": 0.18129412233829498,
|
|
"num_tokens": 40510506.0,
|
|
"step": 21965
|
|
},
|
|
{
|
|
"entropy": 5.672901391983032,
|
|
"epoch": 1.8457886998529722,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046584995550960146,
|
|
"loss": 5.3732,
|
|
"mean_token_accuracy": 0.17865750938653946,
|
|
"num_tokens": 40520222.0,
|
|
"step": 21970
|
|
},
|
|
{
|
|
"entropy": 5.584681797027588,
|
|
"epoch": 1.846208779668137,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046583409290440453,
|
|
"loss": 5.2809,
|
|
"mean_token_accuracy": 0.17908318787813188,
|
|
"num_tokens": 40528824.0,
|
|
"step": 21975
|
|
},
|
|
{
|
|
"entropy": 5.5373616218566895,
|
|
"epoch": 1.8466288594833018,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004658182269186799,
|
|
"loss": 5.3659,
|
|
"mean_token_accuracy": 0.16913065910339356,
|
|
"num_tokens": 40538144.0,
|
|
"step": 21980
|
|
},
|
|
{
|
|
"entropy": 5.64822678565979,
|
|
"epoch": 1.8470489392984666,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004658023575527087,
|
|
"loss": 5.4214,
|
|
"mean_token_accuracy": 0.17093602418899537,
|
|
"num_tokens": 40547457.0,
|
|
"step": 21985
|
|
},
|
|
{
|
|
"entropy": 5.724355268478393,
|
|
"epoch": 1.8474690191136316,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000465786484806772,
|
|
"loss": 5.2834,
|
|
"mean_token_accuracy": 0.1817471370100975,
|
|
"num_tokens": 40556005.0,
|
|
"step": 21990
|
|
},
|
|
{
|
|
"entropy": 5.489060354232788,
|
|
"epoch": 1.8478890989287966,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046577060868115095,
|
|
"loss": 5.2522,
|
|
"mean_token_accuracy": 0.17731622010469436,
|
|
"num_tokens": 40565018.0,
|
|
"step": 21995
|
|
},
|
|
{
|
|
"entropy": 5.5660477638244625,
|
|
"epoch": 1.8483091787439614,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004657547291761268,
|
|
"loss": 5.328,
|
|
"mean_token_accuracy": 0.16542342603206633,
|
|
"num_tokens": 40574931.0,
|
|
"step": 22000
|
|
},
|
|
{
|
|
"entropy": 5.662542819976807,
|
|
"epoch": 1.8487292585591262,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046573884629198077,
|
|
"loss": 5.3064,
|
|
"mean_token_accuracy": 0.17560895532369614,
|
|
"num_tokens": 40584496.0,
|
|
"step": 22005
|
|
},
|
|
{
|
|
"entropy": 5.712861347198486,
|
|
"epoch": 1.849149338374291,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004657229600289944,
|
|
"loss": 5.4127,
|
|
"mean_token_accuracy": 0.16572435200214386,
|
|
"num_tokens": 40594363.0,
|
|
"step": 22010
|
|
},
|
|
{
|
|
"entropy": 5.641199684143066,
|
|
"epoch": 1.849569418189456,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004657070703874489,
|
|
"loss": 5.4345,
|
|
"mean_token_accuracy": 0.1685244232416153,
|
|
"num_tokens": 40603001.0,
|
|
"step": 22015
|
|
},
|
|
{
|
|
"entropy": 5.603077220916748,
|
|
"epoch": 1.849989498004621,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046569117736762597,
|
|
"loss": 5.3624,
|
|
"mean_token_accuracy": 0.1757808193564415,
|
|
"num_tokens": 40612660.0,
|
|
"step": 22020
|
|
},
|
|
{
|
|
"entropy": 5.599392080307007,
|
|
"epoch": 1.8504095778197858,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046567528096980686,
|
|
"loss": 5.2727,
|
|
"mean_token_accuracy": 0.17757227271795273,
|
|
"num_tokens": 40622209.0,
|
|
"step": 22025
|
|
},
|
|
{
|
|
"entropy": 5.689568948745728,
|
|
"epoch": 1.8508296576349506,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00046565938119427346,
|
|
"loss": 5.3844,
|
|
"mean_token_accuracy": 0.16621674001216888,
|
|
"num_tokens": 40632011.0,
|
|
"step": 22030
|
|
},
|
|
{
|
|
"entropy": 5.615991640090942,
|
|
"epoch": 1.8512497374501156,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004656434780413073,
|
|
"loss": 5.2703,
|
|
"mean_token_accuracy": 0.1767064481973648,
|
|
"num_tokens": 40641201.0,
|
|
"step": 22035
|
|
},
|
|
{
|
|
"entropy": 5.595028305053711,
|
|
"epoch": 1.8516698172652803,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00046562757151119,
|
|
"loss": 5.3252,
|
|
"mean_token_accuracy": 0.17203227579593658,
|
|
"num_tokens": 40650752.0,
|
|
"step": 22040
|
|
},
|
|
{
|
|
"entropy": 5.58217225074768,
|
|
"epoch": 1.8520898970804454,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004656116616042035,
|
|
"loss": 5.322,
|
|
"mean_token_accuracy": 0.17230453789234162,
|
|
"num_tokens": 40659975.0,
|
|
"step": 22045
|
|
},
|
|
{
|
|
"entropy": 5.567688846588135,
|
|
"epoch": 1.8525099768956101,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046559574832062955,
|
|
"loss": 5.3465,
|
|
"mean_token_accuracy": 0.17881006896495819,
|
|
"num_tokens": 40668944.0,
|
|
"step": 22050
|
|
},
|
|
{
|
|
"entropy": 5.735271883010864,
|
|
"epoch": 1.852930056710775,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00046557983166075,
|
|
"loss": 5.4974,
|
|
"mean_token_accuracy": 0.1705012798309326,
|
|
"num_tokens": 40678333.0,
|
|
"step": 22055
|
|
},
|
|
{
|
|
"entropy": 5.564062070846558,
|
|
"epoch": 1.85335013652594,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.00046556391162484696,
|
|
"loss": 5.2249,
|
|
"mean_token_accuracy": 0.17906277775764465,
|
|
"num_tokens": 40687781.0,
|
|
"step": 22060
|
|
},
|
|
{
|
|
"entropy": 5.588773012161255,
|
|
"epoch": 1.853770216341105,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004655479882132023,
|
|
"loss": 5.4058,
|
|
"mean_token_accuracy": 0.1747704863548279,
|
|
"num_tokens": 40697637.0,
|
|
"step": 22065
|
|
},
|
|
{
|
|
"entropy": 5.582477378845215,
|
|
"epoch": 1.8541902961562697,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004655320614260982,
|
|
"loss": 5.2545,
|
|
"mean_token_accuracy": 0.17724298536777497,
|
|
"num_tokens": 40707097.0,
|
|
"step": 22070
|
|
},
|
|
{
|
|
"entropy": 5.7074973583221436,
|
|
"epoch": 1.8546103759714345,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00046551613126381673,
|
|
"loss": 5.4568,
|
|
"mean_token_accuracy": 0.16699569970369338,
|
|
"num_tokens": 40716821.0,
|
|
"step": 22075
|
|
},
|
|
{
|
|
"entropy": 5.651368951797485,
|
|
"epoch": 1.8550304557865993,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004655001977266401,
|
|
"loss": 5.3392,
|
|
"mean_token_accuracy": 0.17080808728933333,
|
|
"num_tokens": 40726731.0,
|
|
"step": 22080
|
|
},
|
|
{
|
|
"entropy": 5.683496332168579,
|
|
"epoch": 1.8554505356017643,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046548426081485046,
|
|
"loss": 5.3118,
|
|
"mean_token_accuracy": 0.1734781190752983,
|
|
"num_tokens": 40736935.0,
|
|
"step": 22085
|
|
},
|
|
{
|
|
"entropy": 5.674536466598511,
|
|
"epoch": 1.8558706154169293,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00046546832052873026,
|
|
"loss": 5.4569,
|
|
"mean_token_accuracy": 0.16797720938920974,
|
|
"num_tokens": 40746643.0,
|
|
"step": 22090
|
|
},
|
|
{
|
|
"entropy": 5.773360300064087,
|
|
"epoch": 1.8562906952320941,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046545237686856195,
|
|
"loss": 5.5021,
|
|
"mean_token_accuracy": 0.16252224892377853,
|
|
"num_tokens": 40755713.0,
|
|
"step": 22095
|
|
},
|
|
{
|
|
"entropy": 5.745291662216187,
|
|
"epoch": 1.856710775047259,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00046543642983462775,
|
|
"loss": 5.4755,
|
|
"mean_token_accuracy": 0.17116216123104094,
|
|
"num_tokens": 40764878.0,
|
|
"step": 22100
|
|
},
|
|
{
|
|
"entropy": 5.70435905456543,
|
|
"epoch": 1.857130854862424,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046542047942721025,
|
|
"loss": 5.3763,
|
|
"mean_token_accuracy": 0.17294495701789855,
|
|
"num_tokens": 40774101.0,
|
|
"step": 22105
|
|
},
|
|
{
|
|
"entropy": 5.662673377990723,
|
|
"epoch": 1.8575509346775887,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000465404525646592,
|
|
"loss": 5.3546,
|
|
"mean_token_accuracy": 0.1730918511748314,
|
|
"num_tokens": 40783126.0,
|
|
"step": 22110
|
|
},
|
|
{
|
|
"entropy": 5.558742523193359,
|
|
"epoch": 1.8579710144927537,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004653885684930557,
|
|
"loss": 5.3408,
|
|
"mean_token_accuracy": 0.17543695122003555,
|
|
"num_tokens": 40792508.0,
|
|
"step": 22115
|
|
},
|
|
{
|
|
"entropy": 5.638851690292358,
|
|
"epoch": 1.8583910943079185,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004653726079668839,
|
|
"loss": 5.3736,
|
|
"mean_token_accuracy": 0.1744435727596283,
|
|
"num_tokens": 40802252.0,
|
|
"step": 22120
|
|
},
|
|
{
|
|
"entropy": 5.594110679626465,
|
|
"epoch": 1.8588111741230833,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004653566440683594,
|
|
"loss": 5.2069,
|
|
"mean_token_accuracy": 0.18530822545289993,
|
|
"num_tokens": 40811041.0,
|
|
"step": 22125
|
|
},
|
|
{
|
|
"entropy": 5.588617467880249,
|
|
"epoch": 1.8592312539382483,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000465340676797765,
|
|
"loss": 5.2472,
|
|
"mean_token_accuracy": 0.1726512759923935,
|
|
"num_tokens": 40819976.0,
|
|
"step": 22130
|
|
},
|
|
{
|
|
"entropy": 5.55437970161438,
|
|
"epoch": 1.8596513337534133,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046532470615538344,
|
|
"loss": 5.3,
|
|
"mean_token_accuracy": 0.17627189308404922,
|
|
"num_tokens": 40828544.0,
|
|
"step": 22135
|
|
},
|
|
{
|
|
"entropy": 5.619093751907348,
|
|
"epoch": 1.860071413568578,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046530873214149776,
|
|
"loss": 5.4031,
|
|
"mean_token_accuracy": 0.17888118773698808,
|
|
"num_tokens": 40838386.0,
|
|
"step": 22140
|
|
},
|
|
{
|
|
"entropy": 5.716306734085083,
|
|
"epoch": 1.8604914933837429,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004652927547563908,
|
|
"loss": 5.3773,
|
|
"mean_token_accuracy": 0.16736137866973877,
|
|
"num_tokens": 40847047.0,
|
|
"step": 22145
|
|
},
|
|
{
|
|
"entropy": 5.63306770324707,
|
|
"epoch": 1.8609115731989077,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004652767740003458,
|
|
"loss": 5.3992,
|
|
"mean_token_accuracy": 0.17356206327676774,
|
|
"num_tokens": 40856653.0,
|
|
"step": 22150
|
|
},
|
|
{
|
|
"entropy": 5.715381336212158,
|
|
"epoch": 1.8613316530140727,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046526078987364566,
|
|
"loss": 5.4628,
|
|
"mean_token_accuracy": 0.16745153367519378,
|
|
"num_tokens": 40865176.0,
|
|
"step": 22155
|
|
},
|
|
{
|
|
"entropy": 5.713296556472779,
|
|
"epoch": 1.8617517328292377,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004652448023765736,
|
|
"loss": 5.5087,
|
|
"mean_token_accuracy": 0.17654276341199876,
|
|
"num_tokens": 40874084.0,
|
|
"step": 22160
|
|
},
|
|
{
|
|
"entropy": 5.7095225811004635,
|
|
"epoch": 1.8621718126444025,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004652288115094129,
|
|
"loss": 5.3929,
|
|
"mean_token_accuracy": 0.1741943970322609,
|
|
"num_tokens": 40883704.0,
|
|
"step": 22165
|
|
},
|
|
{
|
|
"entropy": 5.669970941543579,
|
|
"epoch": 1.8625918924595672,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004652128172724466,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.16834318935871123,
|
|
"num_tokens": 40893232.0,
|
|
"step": 22170
|
|
},
|
|
{
|
|
"entropy": 5.623793125152588,
|
|
"epoch": 1.8630119722747323,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00046519681966595834,
|
|
"loss": 5.3053,
|
|
"mean_token_accuracy": 0.18128742128610612,
|
|
"num_tokens": 40902242.0,
|
|
"step": 22175
|
|
},
|
|
{
|
|
"entropy": 5.603348350524902,
|
|
"epoch": 1.863432052089897,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004651808186902313,
|
|
"loss": 5.3319,
|
|
"mean_token_accuracy": 0.17357497066259384,
|
|
"num_tokens": 40912349.0,
|
|
"step": 22180
|
|
},
|
|
{
|
|
"entropy": 5.63823561668396,
|
|
"epoch": 1.863852131905062,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000465164814345549,
|
|
"loss": 5.3295,
|
|
"mean_token_accuracy": 0.18075911849737167,
|
|
"num_tokens": 40922206.0,
|
|
"step": 22185
|
|
},
|
|
{
|
|
"entropy": 5.695439434051513,
|
|
"epoch": 1.8642722117202268,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00046514880663219493,
|
|
"loss": 5.3078,
|
|
"mean_token_accuracy": 0.17492891997098922,
|
|
"num_tokens": 40931145.0,
|
|
"step": 22190
|
|
},
|
|
{
|
|
"entropy": 5.581724739074707,
|
|
"epoch": 1.8646922915353916,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004651327955504526,
|
|
"loss": 5.227,
|
|
"mean_token_accuracy": 0.18555289059877395,
|
|
"num_tokens": 40939917.0,
|
|
"step": 22195
|
|
},
|
|
{
|
|
"entropy": 5.589807987213135,
|
|
"epoch": 1.8651123713505566,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004651167811006058,
|
|
"loss": 5.2953,
|
|
"mean_token_accuracy": 0.17969516515731812,
|
|
"num_tokens": 40947972.0,
|
|
"step": 22200
|
|
},
|
|
{
|
|
"entropy": 5.4864636898040775,
|
|
"epoch": 1.8655324511657216,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000465100763282938,
|
|
"loss": 5.2555,
|
|
"mean_token_accuracy": 0.1773657724261284,
|
|
"num_tokens": 40956999.0,
|
|
"step": 22205
|
|
},
|
|
{
|
|
"entropy": 5.5771567821502686,
|
|
"epoch": 1.8659525309808864,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004650847420977332,
|
|
"loss": 5.2257,
|
|
"mean_token_accuracy": 0.18388084918260575,
|
|
"num_tokens": 40965917.0,
|
|
"step": 22210
|
|
},
|
|
{
|
|
"entropy": 5.6036945343017575,
|
|
"epoch": 1.8663726107960512,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046506871754527495,
|
|
"loss": 5.3267,
|
|
"mean_token_accuracy": 0.17044262886047362,
|
|
"num_tokens": 40976545.0,
|
|
"step": 22215
|
|
},
|
|
{
|
|
"entropy": 5.667624187469483,
|
|
"epoch": 1.866792690611216,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046505268962584735,
|
|
"loss": 5.3608,
|
|
"mean_token_accuracy": 0.1779804602265358,
|
|
"num_tokens": 40985890.0,
|
|
"step": 22220
|
|
},
|
|
{
|
|
"entropy": 5.649150419235229,
|
|
"epoch": 1.867212770426381,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004650366583397342,
|
|
"loss": 5.3532,
|
|
"mean_token_accuracy": 0.1736404314637184,
|
|
"num_tokens": 40995255.0,
|
|
"step": 22225
|
|
},
|
|
{
|
|
"entropy": 5.658413934707641,
|
|
"epoch": 1.867632850241546,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004650206236872194,
|
|
"loss": 5.4099,
|
|
"mean_token_accuracy": 0.1688782885670662,
|
|
"num_tokens": 41004419.0,
|
|
"step": 22230
|
|
},
|
|
{
|
|
"entropy": 5.551290559768677,
|
|
"epoch": 1.8680529300567108,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 0.0004650045856685872,
|
|
"loss": 5.1684,
|
|
"mean_token_accuracy": 0.19379522502422333,
|
|
"num_tokens": 41013179.0,
|
|
"step": 22235
|
|
},
|
|
{
|
|
"entropy": 5.574545621871948,
|
|
"epoch": 1.8684730098718756,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046498854428412157,
|
|
"loss": 5.2586,
|
|
"mean_token_accuracy": 0.17610279172658921,
|
|
"num_tokens": 41022307.0,
|
|
"step": 22240
|
|
},
|
|
{
|
|
"entropy": 5.584465265274048,
|
|
"epoch": 1.8688930896870404,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046497249953410675,
|
|
"loss": 5.4041,
|
|
"mean_token_accuracy": 0.1725468173623085,
|
|
"num_tokens": 41032331.0,
|
|
"step": 22245
|
|
},
|
|
{
|
|
"entropy": 5.673091411590576,
|
|
"epoch": 1.8693131695022054,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004649564514188269,
|
|
"loss": 5.4283,
|
|
"mean_token_accuracy": 0.17023940831422807,
|
|
"num_tokens": 41041895.0,
|
|
"step": 22250
|
|
},
|
|
{
|
|
"entropy": 5.588696575164795,
|
|
"epoch": 1.8697332493173704,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004649403999385662,
|
|
"loss": 5.2562,
|
|
"mean_token_accuracy": 0.17987769544124604,
|
|
"num_tokens": 41051643.0,
|
|
"step": 22255
|
|
},
|
|
{
|
|
"entropy": 5.619039726257324,
|
|
"epoch": 1.8701533291325352,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004649243450936092,
|
|
"loss": 5.2732,
|
|
"mean_token_accuracy": 0.17971949875354767,
|
|
"num_tokens": 41060478.0,
|
|
"step": 22260
|
|
},
|
|
{
|
|
"entropy": 5.559855890274048,
|
|
"epoch": 1.8705734089477,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004649082868842403,
|
|
"loss": 5.342,
|
|
"mean_token_accuracy": 0.17189270853996277,
|
|
"num_tokens": 41069389.0,
|
|
"step": 22265
|
|
},
|
|
{
|
|
"entropy": 5.50737624168396,
|
|
"epoch": 1.870993488762865,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046489222531074376,
|
|
"loss": 5.2905,
|
|
"mean_token_accuracy": 0.1808660864830017,
|
|
"num_tokens": 41078529.0,
|
|
"step": 22270
|
|
},
|
|
{
|
|
"entropy": 5.698597478866577,
|
|
"epoch": 1.87141356857803,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00046487616037340436,
|
|
"loss": 5.3898,
|
|
"mean_token_accuracy": 0.17067276537418366,
|
|
"num_tokens": 41087593.0,
|
|
"step": 22275
|
|
},
|
|
{
|
|
"entropy": 5.727426338195801,
|
|
"epoch": 1.8718336483931948,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004648600920725065,
|
|
"loss": 5.3667,
|
|
"mean_token_accuracy": 0.1694189727306366,
|
|
"num_tokens": 41098317.0,
|
|
"step": 22280
|
|
},
|
|
{
|
|
"entropy": 5.639452648162842,
|
|
"epoch": 1.8722537282083596,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046484402040833486,
|
|
"loss": 5.3661,
|
|
"mean_token_accuracy": 0.1721063882112503,
|
|
"num_tokens": 41108659.0,
|
|
"step": 22285
|
|
},
|
|
{
|
|
"entropy": 5.701750898361206,
|
|
"epoch": 1.8726738080235243,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00046482794538117413,
|
|
"loss": 5.4564,
|
|
"mean_token_accuracy": 0.17068351805210114,
|
|
"num_tokens": 41117504.0,
|
|
"step": 22290
|
|
},
|
|
{
|
|
"entropy": 5.677620840072632,
|
|
"epoch": 1.8730938878386894,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046481186699130913,
|
|
"loss": 5.3154,
|
|
"mean_token_accuracy": 0.1742495611310005,
|
|
"num_tokens": 41126249.0,
|
|
"step": 22295
|
|
},
|
|
{
|
|
"entropy": 5.505521583557129,
|
|
"epoch": 1.8735139676538544,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004647957852390247,
|
|
"loss": 5.2023,
|
|
"mean_token_accuracy": 0.17846413403749467,
|
|
"num_tokens": 41134956.0,
|
|
"step": 22300
|
|
},
|
|
{
|
|
"entropy": 5.619848108291626,
|
|
"epoch": 1.8739340474690191,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00046477970012460555,
|
|
"loss": 5.3039,
|
|
"mean_token_accuracy": 0.17693169862031938,
|
|
"num_tokens": 41144340.0,
|
|
"step": 22305
|
|
},
|
|
{
|
|
"entropy": 5.574439334869385,
|
|
"epoch": 1.874354127284184,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004647636116483367,
|
|
"loss": 5.3211,
|
|
"mean_token_accuracy": 0.17290742844343185,
|
|
"num_tokens": 41152937.0,
|
|
"step": 22310
|
|
},
|
|
{
|
|
"entropy": 5.687628555297851,
|
|
"epoch": 1.8747742070993487,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00046474751981050334,
|
|
"loss": 5.4899,
|
|
"mean_token_accuracy": 0.16893676668405533,
|
|
"num_tokens": 41162361.0,
|
|
"step": 22315
|
|
},
|
|
{
|
|
"entropy": 5.744011449813843,
|
|
"epoch": 1.8751942869145137,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00046473142461139034,
|
|
"loss": 5.4738,
|
|
"mean_token_accuracy": 0.16351682245731353,
|
|
"num_tokens": 41171979.0,
|
|
"step": 22320
|
|
},
|
|
{
|
|
"entropy": 5.596357345581055,
|
|
"epoch": 1.8756143667296787,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004647153260512828,
|
|
"loss": 5.3187,
|
|
"mean_token_accuracy": 0.17824737578630448,
|
|
"num_tokens": 41182145.0,
|
|
"step": 22325
|
|
},
|
|
{
|
|
"entropy": 5.619386386871338,
|
|
"epoch": 1.8760344465448435,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004646992241304659,
|
|
"loss": 5.3443,
|
|
"mean_token_accuracy": 0.17120126634836197,
|
|
"num_tokens": 41191522.0,
|
|
"step": 22330
|
|
},
|
|
{
|
|
"entropy": 5.681074142456055,
|
|
"epoch": 1.8764545263600083,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000464683118849225,
|
|
"loss": 5.4375,
|
|
"mean_token_accuracy": 0.16661341339349747,
|
|
"num_tokens": 41201052.0,
|
|
"step": 22335
|
|
},
|
|
{
|
|
"entropy": 5.59029483795166,
|
|
"epoch": 1.8768746061751733,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004646670102078453,
|
|
"loss": 5.3007,
|
|
"mean_token_accuracy": 0.17739052772521974,
|
|
"num_tokens": 41210211.0,
|
|
"step": 22340
|
|
},
|
|
{
|
|
"entropy": 5.63611478805542,
|
|
"epoch": 1.8772946859903383,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004646508982066122,
|
|
"loss": 5.4579,
|
|
"mean_token_accuracy": 0.16615783870220185,
|
|
"num_tokens": 41219778.0,
|
|
"step": 22345
|
|
},
|
|
{
|
|
"entropy": 5.676466846466065,
|
|
"epoch": 1.8777147658055031,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00046463478284581114,
|
|
"loss": 5.4143,
|
|
"mean_token_accuracy": 0.17920258045196533,
|
|
"num_tokens": 41229550.0,
|
|
"step": 22350
|
|
},
|
|
{
|
|
"entropy": 5.6495026588439945,
|
|
"epoch": 1.878134845620668,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004646186641257275,
|
|
"loss": 5.2768,
|
|
"mean_token_accuracy": 0.18095650821924208,
|
|
"num_tokens": 41238130.0,
|
|
"step": 22355
|
|
},
|
|
{
|
|
"entropy": 5.608019781112671,
|
|
"epoch": 1.8785549254358327,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004646025420466468,
|
|
"loss": 5.2893,
|
|
"mean_token_accuracy": 0.17400604337453843,
|
|
"num_tokens": 41247324.0,
|
|
"step": 22360
|
|
},
|
|
{
|
|
"entropy": 5.572899055480957,
|
|
"epoch": 1.8789750052509977,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046458641660885474,
|
|
"loss": 5.3558,
|
|
"mean_token_accuracy": 0.17406723499298096,
|
|
"num_tokens": 41256131.0,
|
|
"step": 22365
|
|
},
|
|
{
|
|
"entropy": 5.612897682189941,
|
|
"epoch": 1.8793950850661627,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046457028781263693,
|
|
"loss": 5.3927,
|
|
"mean_token_accuracy": 0.16953571438789367,
|
|
"num_tokens": 41265225.0,
|
|
"step": 22370
|
|
},
|
|
{
|
|
"entropy": 5.715818929672241,
|
|
"epoch": 1.8798151648813275,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046455415565827907,
|
|
"loss": 5.3887,
|
|
"mean_token_accuracy": 0.1686493307352066,
|
|
"num_tokens": 41274023.0,
|
|
"step": 22375
|
|
},
|
|
{
|
|
"entropy": 5.70667200088501,
|
|
"epoch": 1.8802352446964923,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000464538020146067,
|
|
"loss": 5.4151,
|
|
"mean_token_accuracy": 0.16874268501996995,
|
|
"num_tokens": 41283030.0,
|
|
"step": 22380
|
|
},
|
|
{
|
|
"entropy": 5.729498672485351,
|
|
"epoch": 1.880655324511657,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004645218812762864,
|
|
"loss": 5.4709,
|
|
"mean_token_accuracy": 0.1674060821533203,
|
|
"num_tokens": 41292654.0,
|
|
"step": 22385
|
|
},
|
|
{
|
|
"entropy": 5.639959239959717,
|
|
"epoch": 1.881075404326822,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004645057390492234,
|
|
"loss": 5.2063,
|
|
"mean_token_accuracy": 0.18706902414560317,
|
|
"num_tokens": 41301838.0,
|
|
"step": 22390
|
|
},
|
|
{
|
|
"entropy": 5.597166204452515,
|
|
"epoch": 1.881495484141987,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004644895934651638,
|
|
"loss": 5.3138,
|
|
"mean_token_accuracy": 0.17899076640605927,
|
|
"num_tokens": 41311104.0,
|
|
"step": 22395
|
|
},
|
|
{
|
|
"entropy": 5.704216480255127,
|
|
"epoch": 1.8819155639571519,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046447344452439356,
|
|
"loss": 5.4198,
|
|
"mean_token_accuracy": 0.1649742141366005,
|
|
"num_tokens": 41320213.0,
|
|
"step": 22400
|
|
},
|
|
{
|
|
"entropy": 5.6293620586395265,
|
|
"epoch": 1.8823356437723167,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004644572922271988,
|
|
"loss": 5.2972,
|
|
"mean_token_accuracy": 0.17442207932472228,
|
|
"num_tokens": 41330027.0,
|
|
"step": 22405
|
|
},
|
|
{
|
|
"entropy": 5.657583141326905,
|
|
"epoch": 1.8827557235874817,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046444113657386567,
|
|
"loss": 5.4056,
|
|
"mean_token_accuracy": 0.16551758348941803,
|
|
"num_tokens": 41339481.0,
|
|
"step": 22410
|
|
},
|
|
{
|
|
"entropy": 5.704918766021729,
|
|
"epoch": 1.8831758034026465,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00046442497756468037,
|
|
"loss": 5.4275,
|
|
"mean_token_accuracy": 0.16912316530942917,
|
|
"num_tokens": 41348679.0,
|
|
"step": 22415
|
|
},
|
|
{
|
|
"entropy": 5.632734298706055,
|
|
"epoch": 1.8835958832178115,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046440881519992924,
|
|
"loss": 5.2812,
|
|
"mean_token_accuracy": 0.1797910824418068,
|
|
"num_tokens": 41358736.0,
|
|
"step": 22420
|
|
},
|
|
{
|
|
"entropy": 5.636936283111572,
|
|
"epoch": 1.8840159630329762,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004643926494798983,
|
|
"loss": 5.43,
|
|
"mean_token_accuracy": 0.16520747989416124,
|
|
"num_tokens": 41368284.0,
|
|
"step": 22425
|
|
},
|
|
{
|
|
"entropy": 5.653887033462524,
|
|
"epoch": 1.884436042848141,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046437648040487426,
|
|
"loss": 5.3153,
|
|
"mean_token_accuracy": 0.1689191997051239,
|
|
"num_tokens": 41377789.0,
|
|
"step": 22430
|
|
},
|
|
{
|
|
"entropy": 5.650376510620117,
|
|
"epoch": 1.884856122663306,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00046436030797514325,
|
|
"loss": 5.3333,
|
|
"mean_token_accuracy": 0.17308636307716369,
|
|
"num_tokens": 41386909.0,
|
|
"step": 22435
|
|
},
|
|
{
|
|
"entropy": 5.675967454910278,
|
|
"epoch": 1.885276202478471,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004643441321909919,
|
|
"loss": 5.3553,
|
|
"mean_token_accuracy": 0.17917974442243575,
|
|
"num_tokens": 41396693.0,
|
|
"step": 22440
|
|
},
|
|
{
|
|
"entropy": 5.677432060241699,
|
|
"epoch": 1.8856962822936358,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046432795305270674,
|
|
"loss": 5.4418,
|
|
"mean_token_accuracy": 0.16401530504226686,
|
|
"num_tokens": 41407193.0,
|
|
"step": 22445
|
|
},
|
|
{
|
|
"entropy": 5.679871845245361,
|
|
"epoch": 1.8861163621088006,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046431177056057446,
|
|
"loss": 5.394,
|
|
"mean_token_accuracy": 0.17553680688142775,
|
|
"num_tokens": 41416567.0,
|
|
"step": 22450
|
|
},
|
|
{
|
|
"entropy": 5.5496378421783445,
|
|
"epoch": 1.8865364419239654,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046429558471488164,
|
|
"loss": 5.2553,
|
|
"mean_token_accuracy": 0.1786068633198738,
|
|
"num_tokens": 41425328.0,
|
|
"step": 22455
|
|
},
|
|
{
|
|
"entropy": 5.659866619110107,
|
|
"epoch": 1.8869565217391304,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000464279395515915,
|
|
"loss": 5.4151,
|
|
"mean_token_accuracy": 0.1715554863214493,
|
|
"num_tokens": 41435229.0,
|
|
"step": 22460
|
|
},
|
|
{
|
|
"entropy": 5.62566819190979,
|
|
"epoch": 1.8873766015542954,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00046426320296396136,
|
|
"loss": 5.3374,
|
|
"mean_token_accuracy": 0.17149607092142105,
|
|
"num_tokens": 41445471.0,
|
|
"step": 22465
|
|
},
|
|
{
|
|
"entropy": 5.568106746673584,
|
|
"epoch": 1.8877966813694602,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046424700705930745,
|
|
"loss": 5.247,
|
|
"mean_token_accuracy": 0.18762259185314178,
|
|
"num_tokens": 41454654.0,
|
|
"step": 22470
|
|
},
|
|
{
|
|
"entropy": 5.585866975784302,
|
|
"epoch": 1.888216761184625,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004642308078022403,
|
|
"loss": 5.274,
|
|
"mean_token_accuracy": 0.17488398551940917,
|
|
"num_tokens": 41463341.0,
|
|
"step": 22475
|
|
},
|
|
{
|
|
"entropy": 5.630618524551392,
|
|
"epoch": 1.88863684099979,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046421460519304684,
|
|
"loss": 5.3228,
|
|
"mean_token_accuracy": 0.17458095848560334,
|
|
"num_tokens": 41472677.0,
|
|
"step": 22480
|
|
},
|
|
{
|
|
"entropy": 5.686393451690674,
|
|
"epoch": 1.8890569208149548,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000464198399232014,
|
|
"loss": 5.4609,
|
|
"mean_token_accuracy": 0.16629096865653992,
|
|
"num_tokens": 41482867.0,
|
|
"step": 22485
|
|
},
|
|
{
|
|
"entropy": 5.745574474334717,
|
|
"epoch": 1.8894770006301198,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004641821899194291,
|
|
"loss": 5.4098,
|
|
"mean_token_accuracy": 0.16831042617559433,
|
|
"num_tokens": 41493432.0,
|
|
"step": 22490
|
|
},
|
|
{
|
|
"entropy": 5.7386678695678714,
|
|
"epoch": 1.8898970804452846,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046416597725557903,
|
|
"loss": 5.4352,
|
|
"mean_token_accuracy": 0.16686583310365677,
|
|
"num_tokens": 41503807.0,
|
|
"step": 22495
|
|
},
|
|
{
|
|
"entropy": 5.621123218536377,
|
|
"epoch": 1.8903171602604494,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000464149761240751,
|
|
"loss": 5.3121,
|
|
"mean_token_accuracy": 0.18151101171970369,
|
|
"num_tokens": 41512524.0,
|
|
"step": 22500
|
|
},
|
|
{
|
|
"entropy": 5.663379192352295,
|
|
"epoch": 1.8907372400756144,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00046413354187523244,
|
|
"loss": 5.4507,
|
|
"mean_token_accuracy": 0.17097427397966386,
|
|
"num_tokens": 41521915.0,
|
|
"step": 22505
|
|
},
|
|
{
|
|
"entropy": 5.619107627868653,
|
|
"epoch": 1.8911573198907794,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004641173191593105,
|
|
"loss": 5.3515,
|
|
"mean_token_accuracy": 0.17258709371089936,
|
|
"num_tokens": 41530293.0,
|
|
"step": 22510
|
|
},
|
|
{
|
|
"entropy": 5.65953893661499,
|
|
"epoch": 1.8915773997059442,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046410109309327275,
|
|
"loss": 5.4098,
|
|
"mean_token_accuracy": 0.17016493827104567,
|
|
"num_tokens": 41538660.0,
|
|
"step": 22515
|
|
},
|
|
{
|
|
"entropy": 5.635321474075317,
|
|
"epoch": 1.891997479521109,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046408486367740647,
|
|
"loss": 5.3747,
|
|
"mean_token_accuracy": 0.17979306429624559,
|
|
"num_tokens": 41547952.0,
|
|
"step": 22520
|
|
},
|
|
{
|
|
"entropy": 5.630529260635376,
|
|
"epoch": 1.8924175593362738,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004640686309119992,
|
|
"loss": 5.3026,
|
|
"mean_token_accuracy": 0.18238568902015687,
|
|
"num_tokens": 41557093.0,
|
|
"step": 22525
|
|
},
|
|
{
|
|
"entropy": 5.610331726074219,
|
|
"epoch": 1.8928376391514388,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046405239479733844,
|
|
"loss": 5.3316,
|
|
"mean_token_accuracy": 0.1757591873407364,
|
|
"num_tokens": 41565836.0,
|
|
"step": 22530
|
|
},
|
|
{
|
|
"entropy": 5.570929670333863,
|
|
"epoch": 1.8932577189666038,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004640361553337119,
|
|
"loss": 5.3758,
|
|
"mean_token_accuracy": 0.18229353278875352,
|
|
"num_tokens": 41575365.0,
|
|
"step": 22535
|
|
},
|
|
{
|
|
"entropy": 5.628244113922119,
|
|
"epoch": 1.8936777987817686,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00046401991252140715,
|
|
"loss": 5.3339,
|
|
"mean_token_accuracy": 0.17711923271417618,
|
|
"num_tokens": 41583690.0,
|
|
"step": 22540
|
|
},
|
|
{
|
|
"entropy": 5.720776605606079,
|
|
"epoch": 1.8940978785969333,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.000464003666360712,
|
|
"loss": 5.3243,
|
|
"mean_token_accuracy": 0.17557096034288405,
|
|
"num_tokens": 41593536.0,
|
|
"step": 22545
|
|
},
|
|
{
|
|
"entropy": 5.611479806900024,
|
|
"epoch": 1.8945179584120981,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004639874168519143,
|
|
"loss": 5.3045,
|
|
"mean_token_accuracy": 0.17414466589689254,
|
|
"num_tokens": 41602543.0,
|
|
"step": 22550
|
|
},
|
|
{
|
|
"entropy": 5.584366273880005,
|
|
"epoch": 1.8949380382272631,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004639711639953017,
|
|
"loss": 5.3845,
|
|
"mean_token_accuracy": 0.17162299007177353,
|
|
"num_tokens": 41611634.0,
|
|
"step": 22555
|
|
},
|
|
{
|
|
"entropy": 5.548893404006958,
|
|
"epoch": 1.8953581180424282,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004639549077911623,
|
|
"loss": 5.354,
|
|
"mean_token_accuracy": 0.16889655143022536,
|
|
"num_tokens": 41621400.0,
|
|
"step": 22560
|
|
},
|
|
{
|
|
"entropy": 5.694586753845215,
|
|
"epoch": 1.895778197857593,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046393864823978406,
|
|
"loss": 5.3317,
|
|
"mean_token_accuracy": 0.17640386521816254,
|
|
"num_tokens": 41631070.0,
|
|
"step": 22565
|
|
},
|
|
{
|
|
"entropy": 5.727872610092163,
|
|
"epoch": 1.8961982776727577,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004639223853414549,
|
|
"loss": 5.4155,
|
|
"mean_token_accuracy": 0.17031230032444,
|
|
"num_tokens": 41641442.0,
|
|
"step": 22570
|
|
},
|
|
{
|
|
"entropy": 5.665301179885864,
|
|
"epoch": 1.8966183574879227,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000463906119096463,
|
|
"loss": 5.4178,
|
|
"mean_token_accuracy": 0.1735200360417366,
|
|
"num_tokens": 41651616.0,
|
|
"step": 22575
|
|
},
|
|
{
|
|
"entropy": 5.676836919784546,
|
|
"epoch": 1.8970384373030877,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004638898495050963,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.17591539174318313,
|
|
"num_tokens": 41660704.0,
|
|
"step": 22580
|
|
},
|
|
{
|
|
"entropy": 5.612502813339233,
|
|
"epoch": 1.8974585171182525,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004638735765676434,
|
|
"loss": 5.393,
|
|
"mean_token_accuracy": 0.16929904073476792,
|
|
"num_tokens": 41669824.0,
|
|
"step": 22585
|
|
},
|
|
{
|
|
"entropy": 5.6595179557800295,
|
|
"epoch": 1.8978785969334173,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004638573002843922,
|
|
"loss": 5.3122,
|
|
"mean_token_accuracy": 0.1842661365866661,
|
|
"num_tokens": 41680082.0,
|
|
"step": 22590
|
|
},
|
|
{
|
|
"entropy": 5.603770017623901,
|
|
"epoch": 1.898298676748582,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004638410206556312,
|
|
"loss": 5.2665,
|
|
"mean_token_accuracy": 0.17830771952867508,
|
|
"num_tokens": 41689282.0,
|
|
"step": 22595
|
|
},
|
|
{
|
|
"entropy": 5.625360727310181,
|
|
"epoch": 1.8987187565637471,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004638247376816489,
|
|
"loss": 5.404,
|
|
"mean_token_accuracy": 0.1719541594386101,
|
|
"num_tokens": 41699059.0,
|
|
"step": 22600
|
|
},
|
|
{
|
|
"entropy": 5.763462495803833,
|
|
"epoch": 1.8991388363789121,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004638084513627335,
|
|
"loss": 5.4994,
|
|
"mean_token_accuracy": 0.1679268956184387,
|
|
"num_tokens": 41708674.0,
|
|
"step": 22605
|
|
},
|
|
{
|
|
"entropy": 5.718596315383911,
|
|
"epoch": 1.899558916194077,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00046379216169917356,
|
|
"loss": 5.4022,
|
|
"mean_token_accuracy": 0.16962596029043198,
|
|
"num_tokens": 41718418.0,
|
|
"step": 22610
|
|
},
|
|
{
|
|
"entropy": 5.609939289093018,
|
|
"epoch": 1.8999789960092417,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004637758686912577,
|
|
"loss": 5.4069,
|
|
"mean_token_accuracy": 0.16938821971416473,
|
|
"num_tokens": 41728229.0,
|
|
"step": 22615
|
|
},
|
|
{
|
|
"entropy": 5.617797803878784,
|
|
"epoch": 1.9003990758244065,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046375957233927456,
|
|
"loss": 5.365,
|
|
"mean_token_accuracy": 0.17396558225154876,
|
|
"num_tokens": 41737074.0,
|
|
"step": 22620
|
|
},
|
|
{
|
|
"entropy": 5.647723913192749,
|
|
"epoch": 1.9008191556395715,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046374327264351277,
|
|
"loss": 5.2549,
|
|
"mean_token_accuracy": 0.17883535474538803,
|
|
"num_tokens": 41745823.0,
|
|
"step": 22625
|
|
},
|
|
{
|
|
"entropy": 5.568923711776733,
|
|
"epoch": 1.9012392354547365,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046372696960426116,
|
|
"loss": 5.3503,
|
|
"mean_token_accuracy": 0.18322131186723709,
|
|
"num_tokens": 41754591.0,
|
|
"step": 22630
|
|
},
|
|
{
|
|
"entropy": 5.663699960708618,
|
|
"epoch": 1.9016593152699013,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00046371066322180846,
|
|
"loss": 5.3477,
|
|
"mean_token_accuracy": 0.1712099567055702,
|
|
"num_tokens": 41763585.0,
|
|
"step": 22635
|
|
},
|
|
{
|
|
"entropy": 5.690198802947998,
|
|
"epoch": 1.902079395085066,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046369435349644344,
|
|
"loss": 5.3829,
|
|
"mean_token_accuracy": 0.17371902912855147,
|
|
"num_tokens": 41772712.0,
|
|
"step": 22640
|
|
},
|
|
{
|
|
"entropy": 5.65671181678772,
|
|
"epoch": 1.902499474900231,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00046367804042845515,
|
|
"loss": 5.2653,
|
|
"mean_token_accuracy": 0.18572683036327362,
|
|
"num_tokens": 41781516.0,
|
|
"step": 22645
|
|
},
|
|
{
|
|
"entropy": 5.615236091613769,
|
|
"epoch": 1.902919554715396,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046366172401813253,
|
|
"loss": 5.3415,
|
|
"mean_token_accuracy": 0.17305969446897507,
|
|
"num_tokens": 41790731.0,
|
|
"step": 22650
|
|
},
|
|
{
|
|
"entropy": 5.680331754684448,
|
|
"epoch": 1.9033396345305609,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004636454042657647,
|
|
"loss": 5.368,
|
|
"mean_token_accuracy": 0.17193576842546462,
|
|
"num_tokens": 41799654.0,
|
|
"step": 22655
|
|
},
|
|
{
|
|
"entropy": 5.559491348266602,
|
|
"epoch": 1.9037597143457257,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046362908117164055,
|
|
"loss": 5.2569,
|
|
"mean_token_accuracy": 0.1793026253581047,
|
|
"num_tokens": 41809408.0,
|
|
"step": 22660
|
|
},
|
|
{
|
|
"entropy": 5.644749689102173,
|
|
"epoch": 1.9041797941608904,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004636127547360494,
|
|
"loss": 5.4225,
|
|
"mean_token_accuracy": 0.16808681786060334,
|
|
"num_tokens": 41818868.0,
|
|
"step": 22665
|
|
},
|
|
{
|
|
"entropy": 5.665660381317139,
|
|
"epoch": 1.9045998739760555,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004635964249592804,
|
|
"loss": 5.3304,
|
|
"mean_token_accuracy": 0.1782209351658821,
|
|
"num_tokens": 41827156.0,
|
|
"step": 22670
|
|
},
|
|
{
|
|
"entropy": 5.692537307739258,
|
|
"epoch": 1.9050199537912205,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004635800918416229,
|
|
"loss": 5.4402,
|
|
"mean_token_accuracy": 0.16368364691734313,
|
|
"num_tokens": 41837025.0,
|
|
"step": 22675
|
|
},
|
|
{
|
|
"entropy": 5.745527839660644,
|
|
"epoch": 1.9054400336063853,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046356375538336616,
|
|
"loss": 5.3507,
|
|
"mean_token_accuracy": 0.17565076798200607,
|
|
"num_tokens": 41846196.0,
|
|
"step": 22680
|
|
},
|
|
{
|
|
"entropy": 5.5358936309814455,
|
|
"epoch": 1.90586011342155,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00046354741558479956,
|
|
"loss": 5.3266,
|
|
"mean_token_accuracy": 0.16661422401666642,
|
|
"num_tokens": 41855030.0,
|
|
"step": 22685
|
|
},
|
|
{
|
|
"entropy": 5.6051513671875,
|
|
"epoch": 1.9062801932367148,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004635310724462126,
|
|
"loss": 5.2093,
|
|
"mean_token_accuracy": 0.17713478952646255,
|
|
"num_tokens": 41863740.0,
|
|
"step": 22690
|
|
},
|
|
{
|
|
"entropy": 5.604347562789917,
|
|
"epoch": 1.9067002730518798,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004635147259678948,
|
|
"loss": 5.3446,
|
|
"mean_token_accuracy": 0.17247212529182435,
|
|
"num_tokens": 41873376.0,
|
|
"step": 22695
|
|
},
|
|
{
|
|
"entropy": 5.698956775665283,
|
|
"epoch": 1.9071203528670448,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046349837615013563,
|
|
"loss": 5.4611,
|
|
"mean_token_accuracy": 0.16106533110141755,
|
|
"num_tokens": 41882491.0,
|
|
"step": 22700
|
|
},
|
|
{
|
|
"entropy": 5.6646346092224125,
|
|
"epoch": 1.9075404326822096,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004634820229932248,
|
|
"loss": 5.3108,
|
|
"mean_token_accuracy": 0.17672717124223708,
|
|
"num_tokens": 41891357.0,
|
|
"step": 22705
|
|
},
|
|
{
|
|
"entropy": 5.618336296081543,
|
|
"epoch": 1.9079605124973744,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00046346566649745205,
|
|
"loss": 5.3758,
|
|
"mean_token_accuracy": 0.17323821932077407,
|
|
"num_tokens": 41899874.0,
|
|
"step": 22710
|
|
},
|
|
{
|
|
"entropy": 5.639418315887451,
|
|
"epoch": 1.9083805923125394,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000463449306663107,
|
|
"loss": 5.3765,
|
|
"mean_token_accuracy": 0.17575515508651735,
|
|
"num_tokens": 41909673.0,
|
|
"step": 22715
|
|
},
|
|
{
|
|
"entropy": 5.713147306442261,
|
|
"epoch": 1.9088006721277042,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004634329434904796,
|
|
"loss": 5.4385,
|
|
"mean_token_accuracy": 0.16925620883703232,
|
|
"num_tokens": 41919126.0,
|
|
"step": 22720
|
|
},
|
|
{
|
|
"entropy": 5.597732830047607,
|
|
"epoch": 1.9092207519428692,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004634165769798596,
|
|
"loss": 5.2688,
|
|
"mean_token_accuracy": 0.17853163182735443,
|
|
"num_tokens": 41927751.0,
|
|
"step": 22725
|
|
},
|
|
{
|
|
"entropy": 5.636762285232544,
|
|
"epoch": 1.909640831758034,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004634002071315369,
|
|
"loss": 5.3374,
|
|
"mean_token_accuracy": 0.1791643977165222,
|
|
"num_tokens": 41937290.0,
|
|
"step": 22730
|
|
},
|
|
{
|
|
"entropy": 5.636068058013916,
|
|
"epoch": 1.9100609115731988,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046338383394580157,
|
|
"loss": 5.2968,
|
|
"mean_token_accuracy": 0.18056693077087402,
|
|
"num_tokens": 41947186.0,
|
|
"step": 22735
|
|
},
|
|
{
|
|
"entropy": 5.5747472763061525,
|
|
"epoch": 1.9104809913883638,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046336745742294366,
|
|
"loss": 5.306,
|
|
"mean_token_accuracy": 0.16971218585968018,
|
|
"num_tokens": 41956197.0,
|
|
"step": 22740
|
|
},
|
|
{
|
|
"entropy": 5.6662568092346195,
|
|
"epoch": 1.9109010712035288,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046335107756325316,
|
|
"loss": 5.2903,
|
|
"mean_token_accuracy": 0.17477723807096482,
|
|
"num_tokens": 41965881.0,
|
|
"step": 22745
|
|
},
|
|
{
|
|
"entropy": 5.65323395729065,
|
|
"epoch": 1.9113211510186936,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004633346943670204,
|
|
"loss": 5.3643,
|
|
"mean_token_accuracy": 0.16437687277793883,
|
|
"num_tokens": 41975031.0,
|
|
"step": 22750
|
|
},
|
|
{
|
|
"entropy": 5.643260192871094,
|
|
"epoch": 1.9117412308338584,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004633183078345355,
|
|
"loss": 5.3197,
|
|
"mean_token_accuracy": 0.17544271051883698,
|
|
"num_tokens": 41984187.0,
|
|
"step": 22755
|
|
},
|
|
{
|
|
"entropy": 5.675690174102783,
|
|
"epoch": 1.9121613106490232,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00046330191796608867,
|
|
"loss": 5.4277,
|
|
"mean_token_accuracy": 0.17010141164064407,
|
|
"num_tokens": 41993185.0,
|
|
"step": 22760
|
|
},
|
|
{
|
|
"entropy": 5.658887100219727,
|
|
"epoch": 1.9125813904641882,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004632855247619704,
|
|
"loss": 5.3799,
|
|
"mean_token_accuracy": 0.17760641872882843,
|
|
"num_tokens": 42002521.0,
|
|
"step": 22765
|
|
},
|
|
{
|
|
"entropy": 5.694884634017944,
|
|
"epoch": 1.9130014702793532,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.000463269128222471,
|
|
"loss": 5.4865,
|
|
"mean_token_accuracy": 0.1686519965529442,
|
|
"num_tokens": 42011444.0,
|
|
"step": 22770
|
|
},
|
|
{
|
|
"entropy": 5.666096448898315,
|
|
"epoch": 1.913421550094518,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004632527283478809,
|
|
"loss": 5.3742,
|
|
"mean_token_accuracy": 0.16956177055835725,
|
|
"num_tokens": 42020916.0,
|
|
"step": 22775
|
|
},
|
|
{
|
|
"entropy": 5.701362133026123,
|
|
"epoch": 1.9138416299096828,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046323632513849063,
|
|
"loss": 5.3941,
|
|
"mean_token_accuracy": 0.17448805570602416,
|
|
"num_tokens": 42029467.0,
|
|
"step": 22780
|
|
},
|
|
{
|
|
"entropy": 5.498393583297729,
|
|
"epoch": 1.9142617097248478,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004632199185945908,
|
|
"loss": 5.1487,
|
|
"mean_token_accuracy": 0.18760445863008499,
|
|
"num_tokens": 42037435.0,
|
|
"step": 22785
|
|
},
|
|
{
|
|
"entropy": 5.676510238647461,
|
|
"epoch": 1.9146817895400126,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004632035087164721,
|
|
"loss": 5.4044,
|
|
"mean_token_accuracy": 0.1724133387207985,
|
|
"num_tokens": 42046943.0,
|
|
"step": 22790
|
|
},
|
|
{
|
|
"entropy": 5.623393821716308,
|
|
"epoch": 1.9151018693551776,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004631870955044251,
|
|
"loss": 5.2769,
|
|
"mean_token_accuracy": 0.17786265760660172,
|
|
"num_tokens": 42055804.0,
|
|
"step": 22795
|
|
},
|
|
{
|
|
"entropy": 5.609493112564087,
|
|
"epoch": 1.9155219491703424,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00046317067895874063,
|
|
"loss": 5.2838,
|
|
"mean_token_accuracy": 0.18263700753450393,
|
|
"num_tokens": 42064655.0,
|
|
"step": 22800
|
|
},
|
|
{
|
|
"entropy": 5.665407276153564,
|
|
"epoch": 1.9159420289855071,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00046315425907970947,
|
|
"loss": 5.322,
|
|
"mean_token_accuracy": 0.17597611397504806,
|
|
"num_tokens": 42073663.0,
|
|
"step": 22805
|
|
},
|
|
{
|
|
"entropy": 5.667187929153442,
|
|
"epoch": 1.9163621088006721,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004631378358676225,
|
|
"loss": 5.4126,
|
|
"mean_token_accuracy": 0.1755566418170929,
|
|
"num_tokens": 42083931.0,
|
|
"step": 22810
|
|
},
|
|
{
|
|
"entropy": 5.715243768692017,
|
|
"epoch": 1.9167821886158372,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004631214093227706,
|
|
"loss": 5.381,
|
|
"mean_token_accuracy": 0.16978215724229812,
|
|
"num_tokens": 42093782.0,
|
|
"step": 22815
|
|
},
|
|
{
|
|
"entropy": 5.629796504974365,
|
|
"epoch": 1.917202268431002,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004631049794454448,
|
|
"loss": 5.3331,
|
|
"mean_token_accuracy": 0.17728287428617479,
|
|
"num_tokens": 42103392.0,
|
|
"step": 22820
|
|
},
|
|
{
|
|
"entropy": 5.656073045730591,
|
|
"epoch": 1.9176223482461667,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004630885462359362,
|
|
"loss": 5.2929,
|
|
"mean_token_accuracy": 0.1793659135699272,
|
|
"num_tokens": 42112051.0,
|
|
"step": 22825
|
|
},
|
|
{
|
|
"entropy": 5.5138551712036135,
|
|
"epoch": 1.9180424280613315,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004630721096945358,
|
|
"loss": 5.321,
|
|
"mean_token_accuracy": 0.1804273918271065,
|
|
"num_tokens": 42120156.0,
|
|
"step": 22830
|
|
},
|
|
{
|
|
"entropy": 5.682647180557251,
|
|
"epoch": 1.9184625078764965,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004630556698215349,
|
|
"loss": 5.4097,
|
|
"mean_token_accuracy": 0.17985475659370423,
|
|
"num_tokens": 42129564.0,
|
|
"step": 22835
|
|
},
|
|
{
|
|
"entropy": 5.691924667358398,
|
|
"epoch": 1.9188825876916615,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046303922661722466,
|
|
"loss": 5.4662,
|
|
"mean_token_accuracy": 0.16802815347909927,
|
|
"num_tokens": 42138144.0,
|
|
"step": 22840
|
|
},
|
|
{
|
|
"entropy": 5.572285509109497,
|
|
"epoch": 1.9193026675068263,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.00046302278008189627,
|
|
"loss": 5.2914,
|
|
"mean_token_accuracy": 0.1704635813832283,
|
|
"num_tokens": 42147701.0,
|
|
"step": 22845
|
|
},
|
|
{
|
|
"entropy": 5.540525960922241,
|
|
"epoch": 1.919722747321991,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004630063302158412,
|
|
"loss": 5.2657,
|
|
"mean_token_accuracy": 0.1806807652115822,
|
|
"num_tokens": 42156772.0,
|
|
"step": 22850
|
|
},
|
|
{
|
|
"entropy": 5.553380632400513,
|
|
"epoch": 1.920142827137156,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046298987701935066,
|
|
"loss": 5.2087,
|
|
"mean_token_accuracy": 0.18418700397014617,
|
|
"num_tokens": 42165227.0,
|
|
"step": 22855
|
|
},
|
|
{
|
|
"entropy": 5.556873607635498,
|
|
"epoch": 1.920562906952321,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004629734204927164,
|
|
"loss": 5.2462,
|
|
"mean_token_accuracy": 0.1809609055519104,
|
|
"num_tokens": 42174800.0,
|
|
"step": 22860
|
|
},
|
|
{
|
|
"entropy": 5.615631151199341,
|
|
"epoch": 1.920982986767486,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004629569606362298,
|
|
"loss": 5.3416,
|
|
"mean_token_accuracy": 0.17381157577037812,
|
|
"num_tokens": 42184301.0,
|
|
"step": 22865
|
|
},
|
|
{
|
|
"entropy": 5.657670736312866,
|
|
"epoch": 1.9214030665826507,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004629404974501823,
|
|
"loss": 5.3347,
|
|
"mean_token_accuracy": 0.17408420890569687,
|
|
"num_tokens": 42193266.0,
|
|
"step": 22870
|
|
},
|
|
{
|
|
"entropy": 5.550420045852661,
|
|
"epoch": 1.9218231463978155,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004629240309348658,
|
|
"loss": 5.2736,
|
|
"mean_token_accuracy": 0.1723278731107712,
|
|
"num_tokens": 42202051.0,
|
|
"step": 22875
|
|
},
|
|
{
|
|
"entropy": 5.5658341407775875,
|
|
"epoch": 1.9222432262129805,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004629075610905717,
|
|
"loss": 5.1858,
|
|
"mean_token_accuracy": 0.18649692982435226,
|
|
"num_tokens": 42210716.0,
|
|
"step": 22880
|
|
},
|
|
{
|
|
"entropy": 5.521829605102539,
|
|
"epoch": 1.9226633060281455,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000462891087917592,
|
|
"loss": 5.255,
|
|
"mean_token_accuracy": 0.1794131278991699,
|
|
"num_tokens": 42219930.0,
|
|
"step": 22885
|
|
},
|
|
{
|
|
"entropy": 5.61706805229187,
|
|
"epoch": 1.9230833858433103,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046287461141621844,
|
|
"loss": 5.301,
|
|
"mean_token_accuracy": 0.185006545484066,
|
|
"num_tokens": 42228864.0,
|
|
"step": 22890
|
|
},
|
|
{
|
|
"entropy": 5.622384357452392,
|
|
"epoch": 1.923503465658475,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004628581315867429,
|
|
"loss": 5.3738,
|
|
"mean_token_accuracy": 0.17600143253803252,
|
|
"num_tokens": 42238030.0,
|
|
"step": 22895
|
|
},
|
|
{
|
|
"entropy": 5.667676210403442,
|
|
"epoch": 1.9239235454736399,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046284164842945723,
|
|
"loss": 5.3524,
|
|
"mean_token_accuracy": 0.17182712703943254,
|
|
"num_tokens": 42247818.0,
|
|
"step": 22900
|
|
},
|
|
{
|
|
"entropy": 5.661724472045899,
|
|
"epoch": 1.9243436252888049,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004628251619446536,
|
|
"loss": 5.3038,
|
|
"mean_token_accuracy": 0.17410755008459092,
|
|
"num_tokens": 42256772.0,
|
|
"step": 22905
|
|
},
|
|
{
|
|
"entropy": 5.578900766372681,
|
|
"epoch": 1.9247637051039699,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046280867213262385,
|
|
"loss": 5.3696,
|
|
"mean_token_accuracy": 0.16716319620609282,
|
|
"num_tokens": 42265620.0,
|
|
"step": 22910
|
|
},
|
|
{
|
|
"entropy": 5.678067827224732,
|
|
"epoch": 1.9251837849191347,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004627921789936602,
|
|
"loss": 5.426,
|
|
"mean_token_accuracy": 0.16603572368621827,
|
|
"num_tokens": 42274998.0,
|
|
"step": 22915
|
|
},
|
|
{
|
|
"entropy": 5.700376176834107,
|
|
"epoch": 1.9256038647342995,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046277568252805476,
|
|
"loss": 5.3521,
|
|
"mean_token_accuracy": 0.17515442967414857,
|
|
"num_tokens": 42284849.0,
|
|
"step": 22920
|
|
},
|
|
{
|
|
"entropy": 5.584618091583252,
|
|
"epoch": 1.9260239445494642,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004627591827360998,
|
|
"loss": 5.3409,
|
|
"mean_token_accuracy": 0.17606656402349471,
|
|
"num_tokens": 42294133.0,
|
|
"step": 22925
|
|
},
|
|
{
|
|
"entropy": 5.622400760650635,
|
|
"epoch": 1.9264440243646292,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004627426796180876,
|
|
"loss": 5.3253,
|
|
"mean_token_accuracy": 0.18122074604034424,
|
|
"num_tokens": 42302765.0,
|
|
"step": 22930
|
|
},
|
|
{
|
|
"entropy": 5.661354064941406,
|
|
"epoch": 1.9268641041797943,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046272617317431056,
|
|
"loss": 5.278,
|
|
"mean_token_accuracy": 0.17460388243198394,
|
|
"num_tokens": 42311829.0,
|
|
"step": 22935
|
|
},
|
|
{
|
|
"entropy": 5.653006887435913,
|
|
"epoch": 1.927284183994959,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046270966340506087,
|
|
"loss": 5.4127,
|
|
"mean_token_accuracy": 0.1800052508711815,
|
|
"num_tokens": 42321294.0,
|
|
"step": 22940
|
|
},
|
|
{
|
|
"entropy": 5.646188735961914,
|
|
"epoch": 1.9277042638101238,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046269315031063137,
|
|
"loss": 5.2272,
|
|
"mean_token_accuracy": 0.179823400080204,
|
|
"num_tokens": 42329379.0,
|
|
"step": 22945
|
|
},
|
|
{
|
|
"entropy": 5.635144662857056,
|
|
"epoch": 1.9281243436252888,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046267663389131425,
|
|
"loss": 5.4211,
|
|
"mean_token_accuracy": 0.16577421128749847,
|
|
"num_tokens": 42339867.0,
|
|
"step": 22950
|
|
},
|
|
{
|
|
"entropy": 5.672863578796386,
|
|
"epoch": 1.9285444234404538,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.00046266011414740213,
|
|
"loss": 5.4266,
|
|
"mean_token_accuracy": 0.17056871354579925,
|
|
"num_tokens": 42350174.0,
|
|
"step": 22955
|
|
},
|
|
{
|
|
"entropy": 5.623044729232788,
|
|
"epoch": 1.9289645032556186,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004626435910791878,
|
|
"loss": 5.284,
|
|
"mean_token_accuracy": 0.1775414004921913,
|
|
"num_tokens": 42359214.0,
|
|
"step": 22960
|
|
},
|
|
{
|
|
"entropy": 5.5688153266906735,
|
|
"epoch": 1.9293845830707834,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00046262706468696386,
|
|
"loss": 5.3633,
|
|
"mean_token_accuracy": 0.17115409225225447,
|
|
"num_tokens": 42367965.0,
|
|
"step": 22965
|
|
},
|
|
{
|
|
"entropy": 5.6344867706298825,
|
|
"epoch": 1.9298046628859482,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004626105349710231,
|
|
"loss": 5.3841,
|
|
"mean_token_accuracy": 0.16720222681760788,
|
|
"num_tokens": 42377233.0,
|
|
"step": 22970
|
|
},
|
|
{
|
|
"entropy": 5.789755868911743,
|
|
"epoch": 1.9302247427011132,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004625940019316584,
|
|
"loss": 5.3816,
|
|
"mean_token_accuracy": 0.17151414901018142,
|
|
"num_tokens": 42386060.0,
|
|
"step": 22975
|
|
},
|
|
{
|
|
"entropy": 5.65874752998352,
|
|
"epoch": 1.9306448225162782,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046257746556916236,
|
|
"loss": 5.3775,
|
|
"mean_token_accuracy": 0.18202279657125472,
|
|
"num_tokens": 42395659.0,
|
|
"step": 22980
|
|
},
|
|
{
|
|
"entropy": 5.696528530120849,
|
|
"epoch": 1.931064902331443,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00046256092588382825,
|
|
"loss": 5.3834,
|
|
"mean_token_accuracy": 0.1711360841989517,
|
|
"num_tokens": 42403531.0,
|
|
"step": 22985
|
|
},
|
|
{
|
|
"entropy": 5.649896192550659,
|
|
"epoch": 1.9314849821466078,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046254438287594884,
|
|
"loss": 5.3348,
|
|
"mean_token_accuracy": 0.17835707813501359,
|
|
"num_tokens": 42412364.0,
|
|
"step": 22990
|
|
},
|
|
{
|
|
"entropy": 5.599561738967895,
|
|
"epoch": 1.9319050619617726,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046252783654581733,
|
|
"loss": 5.3225,
|
|
"mean_token_accuracy": 0.17222917228937148,
|
|
"num_tokens": 42422276.0,
|
|
"step": 22995
|
|
},
|
|
{
|
|
"entropy": 5.6603082656860355,
|
|
"epoch": 1.9323251417769376,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004625112868937267,
|
|
"loss": 5.3528,
|
|
"mean_token_accuracy": 0.1746899351477623,
|
|
"num_tokens": 42430853.0,
|
|
"step": 23000
|
|
},
|
|
{
|
|
"entropy": 5.572019052505493,
|
|
"epoch": 1.9327452215921026,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004624947339199702,
|
|
"loss": 5.2428,
|
|
"mean_token_accuracy": 0.17491891533136367,
|
|
"num_tokens": 42439034.0,
|
|
"step": 23005
|
|
},
|
|
{
|
|
"entropy": 5.665308284759521,
|
|
"epoch": 1.9331653014072674,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000462478177624841,
|
|
"loss": 5.4216,
|
|
"mean_token_accuracy": 0.1706569865345955,
|
|
"num_tokens": 42448494.0,
|
|
"step": 23010
|
|
},
|
|
{
|
|
"entropy": 5.692689990997314,
|
|
"epoch": 1.9335853812224322,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046246161800863244,
|
|
"loss": 5.3149,
|
|
"mean_token_accuracy": 0.17972690612077713,
|
|
"num_tokens": 42457188.0,
|
|
"step": 23015
|
|
},
|
|
{
|
|
"entropy": 5.6557807445526125,
|
|
"epoch": 1.9340054610375972,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004624450550716379,
|
|
"loss": 5.407,
|
|
"mean_token_accuracy": 0.16998066902160644,
|
|
"num_tokens": 42466321.0,
|
|
"step": 23020
|
|
},
|
|
{
|
|
"entropy": 5.624531412124634,
|
|
"epoch": 1.934425540852762,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004624284888141507,
|
|
"loss": 5.3419,
|
|
"mean_token_accuracy": 0.17627126276493071,
|
|
"num_tokens": 42475879.0,
|
|
"step": 23025
|
|
},
|
|
{
|
|
"entropy": 5.653626155853272,
|
|
"epoch": 1.934845620667927,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004624119192364643,
|
|
"loss": 5.4957,
|
|
"mean_token_accuracy": 0.1685581237077713,
|
|
"num_tokens": 42484988.0,
|
|
"step": 23030
|
|
},
|
|
{
|
|
"entropy": 5.609811210632325,
|
|
"epoch": 1.9352657004830918,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046239534633887223,
|
|
"loss": 5.2922,
|
|
"mean_token_accuracy": 0.1745161935687065,
|
|
"num_tokens": 42493764.0,
|
|
"step": 23035
|
|
},
|
|
{
|
|
"entropy": 5.771266174316406,
|
|
"epoch": 1.9356857802982566,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004623787701216682,
|
|
"loss": 5.5004,
|
|
"mean_token_accuracy": 0.1753440722823143,
|
|
"num_tokens": 42503312.0,
|
|
"step": 23040
|
|
},
|
|
{
|
|
"entropy": 5.6357780456542965,
|
|
"epoch": 1.9361058601134216,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00046236219058514566,
|
|
"loss": 5.352,
|
|
"mean_token_accuracy": 0.1730501800775528,
|
|
"num_tokens": 42512303.0,
|
|
"step": 23045
|
|
},
|
|
{
|
|
"entropy": 5.542242479324341,
|
|
"epoch": 1.9365259399285866,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004623456077295984,
|
|
"loss": 5.2403,
|
|
"mean_token_accuracy": 0.18613847196102143,
|
|
"num_tokens": 42520928.0,
|
|
"step": 23050
|
|
},
|
|
{
|
|
"entropy": 5.61994481086731,
|
|
"epoch": 1.9369460197437514,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004623290215553201,
|
|
"loss": 5.2828,
|
|
"mean_token_accuracy": 0.18155443370342256,
|
|
"num_tokens": 42529945.0,
|
|
"step": 23055
|
|
},
|
|
{
|
|
"entropy": 5.622451591491699,
|
|
"epoch": 1.9373660995589161,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004623124320626048,
|
|
"loss": 5.3357,
|
|
"mean_token_accuracy": 0.1775738701224327,
|
|
"num_tokens": 42539078.0,
|
|
"step": 23060
|
|
},
|
|
{
|
|
"entropy": 5.602380084991455,
|
|
"epoch": 1.937786179374081,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004622958392517461,
|
|
"loss": 5.291,
|
|
"mean_token_accuracy": 0.17909268736839296,
|
|
"num_tokens": 42547842.0,
|
|
"step": 23065
|
|
},
|
|
{
|
|
"entropy": 5.6292308330535885,
|
|
"epoch": 1.938206259189246,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004622792431230381,
|
|
"loss": 5.2295,
|
|
"mean_token_accuracy": 0.18587420433759688,
|
|
"num_tokens": 42556574.0,
|
|
"step": 23070
|
|
},
|
|
{
|
|
"entropy": 5.657032442092896,
|
|
"epoch": 1.938626339004411,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046226264367677476,
|
|
"loss": 5.3364,
|
|
"mean_token_accuracy": 0.1658307746052742,
|
|
"num_tokens": 42565906.0,
|
|
"step": 23075
|
|
},
|
|
{
|
|
"entropy": 5.610013246536255,
|
|
"epoch": 1.9390464188195757,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004622460409132501,
|
|
"loss": 5.3061,
|
|
"mean_token_accuracy": 0.17991530746221543,
|
|
"num_tokens": 42574929.0,
|
|
"step": 23080
|
|
},
|
|
{
|
|
"entropy": 5.6608904838562015,
|
|
"epoch": 1.9394664986347405,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004622294348327582,
|
|
"loss": 5.3509,
|
|
"mean_token_accuracy": 0.17006094008684158,
|
|
"num_tokens": 42585185.0,
|
|
"step": 23085
|
|
},
|
|
{
|
|
"entropy": 5.600542974472046,
|
|
"epoch": 1.9398865784499055,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046221282543559334,
|
|
"loss": 5.3681,
|
|
"mean_token_accuracy": 0.17075276374816895,
|
|
"num_tokens": 42594272.0,
|
|
"step": 23090
|
|
},
|
|
{
|
|
"entropy": 5.5656678676605225,
|
|
"epoch": 1.9403066582650703,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046219621272204967,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.17831842303276063,
|
|
"num_tokens": 42603410.0,
|
|
"step": 23095
|
|
},
|
|
{
|
|
"entropy": 5.666840028762818,
|
|
"epoch": 1.9407267380802353,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046217959669242145,
|
|
"loss": 5.479,
|
|
"mean_token_accuracy": 0.16195986643433571,
|
|
"num_tokens": 42613879.0,
|
|
"step": 23100
|
|
},
|
|
{
|
|
"entropy": 5.662532567977905,
|
|
"epoch": 1.9411468178954001,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000462162977347003,
|
|
"loss": 5.2801,
|
|
"mean_token_accuracy": 0.17427633106708526,
|
|
"num_tokens": 42623323.0,
|
|
"step": 23105
|
|
},
|
|
{
|
|
"entropy": 5.668767642974854,
|
|
"epoch": 1.941566897710565,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046214635468608885,
|
|
"loss": 5.3365,
|
|
"mean_token_accuracy": 0.17507773339748384,
|
|
"num_tokens": 42632365.0,
|
|
"step": 23110
|
|
},
|
|
{
|
|
"entropy": 5.6521703720092775,
|
|
"epoch": 1.94198697752573,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046212972870997336,
|
|
"loss": 5.3073,
|
|
"mean_token_accuracy": 0.17932529896497726,
|
|
"num_tokens": 42641872.0,
|
|
"step": 23115
|
|
},
|
|
{
|
|
"entropy": 5.679893827438354,
|
|
"epoch": 1.942407057340895,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004621130994189511,
|
|
"loss": 5.3758,
|
|
"mean_token_accuracy": 0.17578433007001876,
|
|
"num_tokens": 42652031.0,
|
|
"step": 23120
|
|
},
|
|
{
|
|
"entropy": 5.534250640869141,
|
|
"epoch": 1.9428271371560597,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004620964668133166,
|
|
"loss": 5.3045,
|
|
"mean_token_accuracy": 0.17120088189840316,
|
|
"num_tokens": 42661040.0,
|
|
"step": 23125
|
|
},
|
|
{
|
|
"entropy": 5.6397205829620365,
|
|
"epoch": 1.9432472169712245,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004620798308933646,
|
|
"loss": 5.3392,
|
|
"mean_token_accuracy": 0.17351713329553603,
|
|
"num_tokens": 42670559.0,
|
|
"step": 23130
|
|
},
|
|
{
|
|
"entropy": 5.674306726455688,
|
|
"epoch": 1.9436672967863893,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004620631916593897,
|
|
"loss": 5.3482,
|
|
"mean_token_accuracy": 0.17234041541814804,
|
|
"num_tokens": 42679883.0,
|
|
"step": 23135
|
|
},
|
|
{
|
|
"entropy": 5.741105794906616,
|
|
"epoch": 1.9440873766015543,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004620465491116867,
|
|
"loss": 5.4648,
|
|
"mean_token_accuracy": 0.15939529240131378,
|
|
"num_tokens": 42689746.0,
|
|
"step": 23140
|
|
},
|
|
{
|
|
"entropy": 5.731553983688355,
|
|
"epoch": 1.9445074564167193,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00046202990325055034,
|
|
"loss": 5.3838,
|
|
"mean_token_accuracy": 0.17033789455890655,
|
|
"num_tokens": 42699685.0,
|
|
"step": 23145
|
|
},
|
|
{
|
|
"entropy": 5.571749210357666,
|
|
"epoch": 1.944927536231884,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004620132540762756,
|
|
"loss": 5.2458,
|
|
"mean_token_accuracy": 0.1736294910311699,
|
|
"num_tokens": 42708873.0,
|
|
"step": 23150
|
|
},
|
|
{
|
|
"entropy": 5.5605018615722654,
|
|
"epoch": 1.9453476160470489,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046199660158915734,
|
|
"loss": 5.3166,
|
|
"mean_token_accuracy": 0.1672689750790596,
|
|
"num_tokens": 42717807.0,
|
|
"step": 23155
|
|
},
|
|
{
|
|
"entropy": 5.597097682952881,
|
|
"epoch": 1.9457676958622139,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046197994578949056,
|
|
"loss": 5.3768,
|
|
"mean_token_accuracy": 0.17283654361963272,
|
|
"num_tokens": 42726674.0,
|
|
"step": 23160
|
|
},
|
|
{
|
|
"entropy": 5.670198249816894,
|
|
"epoch": 1.9461877756773787,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004619632866775704,
|
|
"loss": 5.4328,
|
|
"mean_token_accuracy": 0.1715935230255127,
|
|
"num_tokens": 42735621.0,
|
|
"step": 23165
|
|
},
|
|
{
|
|
"entropy": 5.605064630508423,
|
|
"epoch": 1.9466078554925437,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004619466242536918,
|
|
"loss": 5.3183,
|
|
"mean_token_accuracy": 0.17671644389629365,
|
|
"num_tokens": 42744945.0,
|
|
"step": 23170
|
|
},
|
|
{
|
|
"entropy": 5.675964307785034,
|
|
"epoch": 1.9470279353077085,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004619299585181501,
|
|
"loss": 5.4318,
|
|
"mean_token_accuracy": 0.17112542688846588,
|
|
"num_tokens": 42754906.0,
|
|
"step": 23175
|
|
},
|
|
{
|
|
"entropy": 5.677554368972778,
|
|
"epoch": 1.9474480151228732,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046191328947124027,
|
|
"loss": 5.3332,
|
|
"mean_token_accuracy": 0.17521820366382598,
|
|
"num_tokens": 42764673.0,
|
|
"step": 23180
|
|
},
|
|
{
|
|
"entropy": 5.564341068267822,
|
|
"epoch": 1.9478680949380383,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046189661711325784,
|
|
"loss": 5.3217,
|
|
"mean_token_accuracy": 0.18367141485214233,
|
|
"num_tokens": 42774528.0,
|
|
"step": 23185
|
|
},
|
|
{
|
|
"entropy": 5.669634675979614,
|
|
"epoch": 1.9482881747532033,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00046187994144449815,
|
|
"loss": 5.2309,
|
|
"mean_token_accuracy": 0.1801608145236969,
|
|
"num_tokens": 42783813.0,
|
|
"step": 23190
|
|
},
|
|
{
|
|
"entropy": 5.586480951309204,
|
|
"epoch": 1.948708254568368,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004618632624652565,
|
|
"loss": 5.3154,
|
|
"mean_token_accuracy": 0.17071151435375215,
|
|
"num_tokens": 42793483.0,
|
|
"step": 23195
|
|
},
|
|
{
|
|
"entropy": 5.59461088180542,
|
|
"epoch": 1.9491283343835328,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004618465801758283,
|
|
"loss": 5.3859,
|
|
"mean_token_accuracy": 0.1717785432934761,
|
|
"num_tokens": 42803177.0,
|
|
"step": 23200
|
|
},
|
|
{
|
|
"entropy": 5.673942232131958,
|
|
"epoch": 1.9495484141986976,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046182989457650925,
|
|
"loss": 5.3849,
|
|
"mean_token_accuracy": 0.17533280104398727,
|
|
"num_tokens": 42812395.0,
|
|
"step": 23205
|
|
},
|
|
{
|
|
"entropy": 5.617794990539551,
|
|
"epoch": 1.9499684940138626,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046181320566759476,
|
|
"loss": 5.3511,
|
|
"mean_token_accuracy": 0.17385358661413192,
|
|
"num_tokens": 42821495.0,
|
|
"step": 23210
|
|
},
|
|
{
|
|
"entropy": 5.608628606796264,
|
|
"epoch": 1.9503885738290276,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00046179651344938055,
|
|
"loss": 5.3336,
|
|
"mean_token_accuracy": 0.17260808795690535,
|
|
"num_tokens": 42832219.0,
|
|
"step": 23215
|
|
},
|
|
{
|
|
"entropy": 5.632011890411377,
|
|
"epoch": 1.9508086536441924,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00046177981792216234,
|
|
"loss": 5.2745,
|
|
"mean_token_accuracy": 0.1757341668009758,
|
|
"num_tokens": 42841368.0,
|
|
"step": 23220
|
|
},
|
|
{
|
|
"entropy": 5.603061056137085,
|
|
"epoch": 1.9512287334593572,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046176311908623574,
|
|
"loss": 5.3093,
|
|
"mean_token_accuracy": 0.1824861243367195,
|
|
"num_tokens": 42850512.0,
|
|
"step": 23225
|
|
},
|
|
{
|
|
"entropy": 5.6234039783477785,
|
|
"epoch": 1.951648813274522,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004617464169418967,
|
|
"loss": 5.3568,
|
|
"mean_token_accuracy": 0.1736053630709648,
|
|
"num_tokens": 42860749.0,
|
|
"step": 23230
|
|
},
|
|
{
|
|
"entropy": 5.616316413879394,
|
|
"epoch": 1.952068893089687,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00046172971148944106,
|
|
"loss": 5.3083,
|
|
"mean_token_accuracy": 0.17737708240747452,
|
|
"num_tokens": 42869880.0,
|
|
"step": 23235
|
|
},
|
|
{
|
|
"entropy": 5.60381588935852,
|
|
"epoch": 1.952488972904852,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046171300272916465,
|
|
"loss": 5.2901,
|
|
"mean_token_accuracy": 0.18085473626852036,
|
|
"num_tokens": 42879001.0,
|
|
"step": 23240
|
|
},
|
|
{
|
|
"entropy": 5.551793575286865,
|
|
"epoch": 1.9529090527200168,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046169629066136357,
|
|
"loss": 5.2287,
|
|
"mean_token_accuracy": 0.1819872483611107,
|
|
"num_tokens": 42888036.0,
|
|
"step": 23245
|
|
},
|
|
{
|
|
"entropy": 5.664321565628052,
|
|
"epoch": 1.9533291325351816,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00046167957528633387,
|
|
"loss": 5.3401,
|
|
"mean_token_accuracy": 0.1780684620141983,
|
|
"num_tokens": 42897460.0,
|
|
"step": 23250
|
|
},
|
|
{
|
|
"entropy": 5.6305899143219,
|
|
"epoch": 1.9537492123503466,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046166285660437164,
|
|
"loss": 5.3538,
|
|
"mean_token_accuracy": 0.1773480087518692,
|
|
"num_tokens": 42907010.0,
|
|
"step": 23255
|
|
},
|
|
{
|
|
"entropy": 5.664665699005127,
|
|
"epoch": 1.9541692921655116,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000461646134615773,
|
|
"loss": 5.2976,
|
|
"mean_token_accuracy": 0.17132930904626847,
|
|
"num_tokens": 42915684.0,
|
|
"step": 23260
|
|
},
|
|
{
|
|
"entropy": 5.595171546936035,
|
|
"epoch": 1.9545893719806764,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046162940932083414,
|
|
"loss": 5.3159,
|
|
"mean_token_accuracy": 0.17843813300132752,
|
|
"num_tokens": 42924903.0,
|
|
"step": 23265
|
|
},
|
|
{
|
|
"entropy": 5.591875410079956,
|
|
"epoch": 1.9550094517958412,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00046161268071985144,
|
|
"loss": 5.3182,
|
|
"mean_token_accuracy": 0.17087887227535248,
|
|
"num_tokens": 42935234.0,
|
|
"step": 23270
|
|
},
|
|
{
|
|
"entropy": 5.50767765045166,
|
|
"epoch": 1.955429531611006,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004615959488131212,
|
|
"loss": 5.2438,
|
|
"mean_token_accuracy": 0.18054774403572083,
|
|
"num_tokens": 42944093.0,
|
|
"step": 23275
|
|
},
|
|
{
|
|
"entropy": 5.6021500587463375,
|
|
"epoch": 1.955849611426171,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004615792136009398,
|
|
"loss": 5.2662,
|
|
"mean_token_accuracy": 0.17670947611331939,
|
|
"num_tokens": 42953504.0,
|
|
"step": 23280
|
|
},
|
|
{
|
|
"entropy": 5.602096080780029,
|
|
"epoch": 1.956269691241336,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046156247508360375,
|
|
"loss": 5.3159,
|
|
"mean_token_accuracy": 0.1776598408818245,
|
|
"num_tokens": 42962205.0,
|
|
"step": 23285
|
|
},
|
|
{
|
|
"entropy": 5.548053550720215,
|
|
"epoch": 1.9566897710565008,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004615457332614095,
|
|
"loss": 5.2466,
|
|
"mean_token_accuracy": 0.17867524921894073,
|
|
"num_tokens": 42971240.0,
|
|
"step": 23290
|
|
},
|
|
{
|
|
"entropy": 5.659411191940308,
|
|
"epoch": 1.9571098508716656,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046152898813465353,
|
|
"loss": 5.4036,
|
|
"mean_token_accuracy": 0.16589925736188887,
|
|
"num_tokens": 42981573.0,
|
|
"step": 23295
|
|
},
|
|
{
|
|
"entropy": 5.64855694770813,
|
|
"epoch": 1.9575299306868303,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004615122397036327,
|
|
"loss": 5.327,
|
|
"mean_token_accuracy": 0.17262526452541352,
|
|
"num_tokens": 42991383.0,
|
|
"step": 23300
|
|
},
|
|
{
|
|
"entropy": 5.609686803817749,
|
|
"epoch": 1.9579500105019954,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046149548796864355,
|
|
"loss": 5.2754,
|
|
"mean_token_accuracy": 0.1759060487151146,
|
|
"num_tokens": 43000029.0,
|
|
"step": 23305
|
|
},
|
|
{
|
|
"entropy": 5.634216022491455,
|
|
"epoch": 1.9583700903171604,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00046147873292998285,
|
|
"loss": 5.3476,
|
|
"mean_token_accuracy": 0.17457685023546218,
|
|
"num_tokens": 43008880.0,
|
|
"step": 23310
|
|
},
|
|
{
|
|
"entropy": 5.561314058303833,
|
|
"epoch": 1.9587901701323251,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004614619745879475,
|
|
"loss": 5.3153,
|
|
"mean_token_accuracy": 0.1781879886984825,
|
|
"num_tokens": 43017417.0,
|
|
"step": 23315
|
|
},
|
|
{
|
|
"entropy": 5.620518207550049,
|
|
"epoch": 1.95921024994749,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004614452129428342,
|
|
"loss": 5.2382,
|
|
"mean_token_accuracy": 0.18082706928253173,
|
|
"num_tokens": 43025738.0,
|
|
"step": 23320
|
|
},
|
|
{
|
|
"entropy": 5.715609693527222,
|
|
"epoch": 1.959630329762655,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004614284479949399,
|
|
"loss": 5.3641,
|
|
"mean_token_accuracy": 0.17538043707609177,
|
|
"num_tokens": 43035485.0,
|
|
"step": 23325
|
|
},
|
|
{
|
|
"entropy": 5.712373542785644,
|
|
"epoch": 1.96005040957782,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004614116797445617,
|
|
"loss": 5.3889,
|
|
"mean_token_accuracy": 0.1784473180770874,
|
|
"num_tokens": 43044627.0,
|
|
"step": 23330
|
|
},
|
|
{
|
|
"entropy": 5.570833015441894,
|
|
"epoch": 1.9604704893929847,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00046139490819199666,
|
|
"loss": 5.2968,
|
|
"mean_token_accuracy": 0.1788347989320755,
|
|
"num_tokens": 43053790.0,
|
|
"step": 23335
|
|
},
|
|
{
|
|
"entropy": 5.616850519180298,
|
|
"epoch": 1.9608905692081495,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004613781333375417,
|
|
"loss": 5.2878,
|
|
"mean_token_accuracy": 0.18670900613069535,
|
|
"num_tokens": 43063511.0,
|
|
"step": 23340
|
|
},
|
|
{
|
|
"entropy": 5.548789978027344,
|
|
"epoch": 1.9613106490233143,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004613613551814941,
|
|
"loss": 5.2263,
|
|
"mean_token_accuracy": 0.18141030222177507,
|
|
"num_tokens": 43072349.0,
|
|
"step": 23345
|
|
},
|
|
{
|
|
"entropy": 5.640681552886963,
|
|
"epoch": 1.9617307288384793,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004613445737241511,
|
|
"loss": 5.3351,
|
|
"mean_token_accuracy": 0.17484953999519348,
|
|
"num_tokens": 43081552.0,
|
|
"step": 23350
|
|
},
|
|
{
|
|
"entropy": 5.684726333618164,
|
|
"epoch": 1.9621508086536443,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046132778896581,
|
|
"loss": 5.3775,
|
|
"mean_token_accuracy": 0.17865779995918274,
|
|
"num_tokens": 43092321.0,
|
|
"step": 23355
|
|
},
|
|
{
|
|
"entropy": 5.672852087020874,
|
|
"epoch": 1.9625708884688091,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004613110009067679,
|
|
"loss": 5.3385,
|
|
"mean_token_accuracy": 0.17483696341514587,
|
|
"num_tokens": 43102326.0,
|
|
"step": 23360
|
|
},
|
|
{
|
|
"entropy": 5.655863475799561,
|
|
"epoch": 1.962990968283974,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00046129420954732237,
|
|
"loss": 5.3726,
|
|
"mean_token_accuracy": 0.17350683659315108,
|
|
"num_tokens": 43110895.0,
|
|
"step": 23365
|
|
},
|
|
{
|
|
"entropy": 5.571282768249512,
|
|
"epoch": 1.9634110480991387,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004612774148877709,
|
|
"loss": 5.2236,
|
|
"mean_token_accuracy": 0.1840864822268486,
|
|
"num_tokens": 43119948.0,
|
|
"step": 23370
|
|
},
|
|
{
|
|
"entropy": 5.671322822570801,
|
|
"epoch": 1.9638311279143037,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000461260616928411,
|
|
"loss": 5.4221,
|
|
"mean_token_accuracy": 0.17291183322668074,
|
|
"num_tokens": 43129876.0,
|
|
"step": 23375
|
|
},
|
|
{
|
|
"entropy": 5.6836082458496096,
|
|
"epoch": 1.9642512077294687,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00046124381566954006,
|
|
"loss": 5.3752,
|
|
"mean_token_accuracy": 0.1769299626350403,
|
|
"num_tokens": 43138831.0,
|
|
"step": 23380
|
|
},
|
|
{
|
|
"entropy": 5.644669532775879,
|
|
"epoch": 1.9646712875446335,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046122701111145587,
|
|
"loss": 5.3462,
|
|
"mean_token_accuracy": 0.16992448419332504,
|
|
"num_tokens": 43147338.0,
|
|
"step": 23385
|
|
},
|
|
{
|
|
"entropy": 5.6005443096160885,
|
|
"epoch": 1.9650913673597983,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004612102032544561,
|
|
"loss": 5.2866,
|
|
"mean_token_accuracy": 0.1766381561756134,
|
|
"num_tokens": 43158587.0,
|
|
"step": 23390
|
|
},
|
|
{
|
|
"entropy": 5.610977602005005,
|
|
"epoch": 1.9655114471749633,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046119339209883846,
|
|
"loss": 5.2766,
|
|
"mean_token_accuracy": 0.18496377915143966,
|
|
"num_tokens": 43167610.0,
|
|
"step": 23395
|
|
},
|
|
{
|
|
"entropy": 5.537552261352539,
|
|
"epoch": 1.965931526990128,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004611765776449007,
|
|
"loss": 5.2482,
|
|
"mean_token_accuracy": 0.17576922178268434,
|
|
"num_tokens": 43176374.0,
|
|
"step": 23400
|
|
},
|
|
{
|
|
"entropy": 5.630776309967041,
|
|
"epoch": 1.966351606805293,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00046115975989294083,
|
|
"loss": 5.4188,
|
|
"mean_token_accuracy": 0.16968157142400742,
|
|
"num_tokens": 43187038.0,
|
|
"step": 23405
|
|
},
|
|
{
|
|
"entropy": 5.745281171798706,
|
|
"epoch": 1.9667716866204579,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004611429388432566,
|
|
"loss": 5.4078,
|
|
"mean_token_accuracy": 0.17005863785743713,
|
|
"num_tokens": 43197868.0,
|
|
"step": 23410
|
|
},
|
|
{
|
|
"entropy": 5.659871816635132,
|
|
"epoch": 1.9671917664356227,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00046112611449614603,
|
|
"loss": 5.3696,
|
|
"mean_token_accuracy": 0.16665552854537963,
|
|
"num_tokens": 43207675.0,
|
|
"step": 23415
|
|
},
|
|
{
|
|
"entropy": 5.648601293563843,
|
|
"epoch": 1.9676118462507877,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004611092868519072,
|
|
"loss": 5.3676,
|
|
"mean_token_accuracy": 0.17277957051992415,
|
|
"num_tokens": 43217154.0,
|
|
"step": 23420
|
|
},
|
|
{
|
|
"entropy": 5.631614065170288,
|
|
"epoch": 1.9680319260659527,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004610924559108383,
|
|
"loss": 5.3662,
|
|
"mean_token_accuracy": 0.17904918119311333,
|
|
"num_tokens": 43226912.0,
|
|
"step": 23425
|
|
},
|
|
{
|
|
"entropy": 5.663963079452515,
|
|
"epoch": 1.9684520058811175,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004610756216732372,
|
|
"loss": 5.3729,
|
|
"mean_token_accuracy": 0.17254897505044936,
|
|
"num_tokens": 43236711.0,
|
|
"step": 23430
|
|
},
|
|
{
|
|
"entropy": 5.701264095306397,
|
|
"epoch": 1.9688720856962822,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00046105878413940237,
|
|
"loss": 5.349,
|
|
"mean_token_accuracy": 0.18224181234836578,
|
|
"num_tokens": 43247005.0,
|
|
"step": 23435
|
|
},
|
|
{
|
|
"entropy": 5.405902004241943,
|
|
"epoch": 1.969292165511447,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000461041943309632,
|
|
"loss": 5.1978,
|
|
"mean_token_accuracy": 0.18523926436901092,
|
|
"num_tokens": 43255868.0,
|
|
"step": 23440
|
|
},
|
|
{
|
|
"entropy": 5.579332637786865,
|
|
"epoch": 1.969712245326612,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004610250991842244,
|
|
"loss": 5.3133,
|
|
"mean_token_accuracy": 0.17708868831396102,
|
|
"num_tokens": 43265727.0,
|
|
"step": 23445
|
|
},
|
|
{
|
|
"entropy": 5.68139796257019,
|
|
"epoch": 1.970132325141777,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00046100825176347796,
|
|
"loss": 5.3433,
|
|
"mean_token_accuracy": 0.17815263122320174,
|
|
"num_tokens": 43274530.0,
|
|
"step": 23450
|
|
},
|
|
{
|
|
"entropy": 5.513430643081665,
|
|
"epoch": 1.9705524049569418,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000460991401047691,
|
|
"loss": 5.2518,
|
|
"mean_token_accuracy": 0.17275855988264083,
|
|
"num_tokens": 43285130.0,
|
|
"step": 23455
|
|
},
|
|
{
|
|
"entropy": 5.609006071090699,
|
|
"epoch": 1.9709724847721066,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004609745470371622,
|
|
"loss": 5.3268,
|
|
"mean_token_accuracy": 0.17718621343374252,
|
|
"num_tokens": 43293574.0,
|
|
"step": 23460
|
|
},
|
|
{
|
|
"entropy": 5.5410703182220455,
|
|
"epoch": 1.9713925645872716,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004609576897321902,
|
|
"loss": 5.1567,
|
|
"mean_token_accuracy": 0.18253391236066818,
|
|
"num_tokens": 43301989.0,
|
|
"step": 23465
|
|
},
|
|
{
|
|
"entropy": 5.649783420562744,
|
|
"epoch": 1.9718126444024364,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046094082913307336,
|
|
"loss": 5.358,
|
|
"mean_token_accuracy": 0.17399438023567199,
|
|
"num_tokens": 43310934.0,
|
|
"step": 23470
|
|
},
|
|
{
|
|
"entropy": 5.576969957351684,
|
|
"epoch": 1.9722327242176014,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004609239652401104,
|
|
"loss": 5.2712,
|
|
"mean_token_accuracy": 0.17430226355791092,
|
|
"num_tokens": 43320703.0,
|
|
"step": 23475
|
|
},
|
|
{
|
|
"entropy": 5.605929708480835,
|
|
"epoch": 1.9726528040327662,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046090709805360027,
|
|
"loss": 5.2428,
|
|
"mean_token_accuracy": 0.1821880042552948,
|
|
"num_tokens": 43329444.0,
|
|
"step": 23480
|
|
},
|
|
{
|
|
"entropy": 5.6519848823547365,
|
|
"epoch": 1.973072883847931,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004608902275738416,
|
|
"loss": 5.3677,
|
|
"mean_token_accuracy": 0.18188654333353044,
|
|
"num_tokens": 43337853.0,
|
|
"step": 23485
|
|
},
|
|
{
|
|
"entropy": 5.696072387695312,
|
|
"epoch": 1.973492963663096,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004608733538011333,
|
|
"loss": 5.4032,
|
|
"mean_token_accuracy": 0.16969217211008072,
|
|
"num_tokens": 43347901.0,
|
|
"step": 23490
|
|
},
|
|
{
|
|
"entropy": 5.601595401763916,
|
|
"epoch": 1.973913043478261,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004608564767357741,
|
|
"loss": 5.2628,
|
|
"mean_token_accuracy": 0.17602366507053374,
|
|
"num_tokens": 43357358.0,
|
|
"step": 23495
|
|
},
|
|
{
|
|
"entropy": 5.590727233886719,
|
|
"epoch": 1.9743331232934258,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004608395963780632,
|
|
"loss": 5.3724,
|
|
"mean_token_accuracy": 0.17241780906915666,
|
|
"num_tokens": 43366749.0,
|
|
"step": 23500
|
|
},
|
|
{
|
|
"entropy": 5.586251354217529,
|
|
"epoch": 1.9747532031085906,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004608227127282996,
|
|
"loss": 5.3251,
|
|
"mean_token_accuracy": 0.17821072190999984,
|
|
"num_tokens": 43375243.0,
|
|
"step": 23505
|
|
},
|
|
{
|
|
"entropy": 5.666031312942505,
|
|
"epoch": 1.9751732829237554,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004608058257867823,
|
|
"loss": 5.2887,
|
|
"mean_token_accuracy": 0.18276388347148895,
|
|
"num_tokens": 43383470.0,
|
|
"step": 23510
|
|
},
|
|
{
|
|
"entropy": 5.6554535865783695,
|
|
"epoch": 1.9755933627389204,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004607889355538105,
|
|
"loss": 5.4184,
|
|
"mean_token_accuracy": 0.17027001827955246,
|
|
"num_tokens": 43393527.0,
|
|
"step": 23515
|
|
},
|
|
{
|
|
"entropy": 5.604500722885132,
|
|
"epoch": 1.9760134425540854,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046077204202968325,
|
|
"loss": 5.2812,
|
|
"mean_token_accuracy": 0.17676235735416412,
|
|
"num_tokens": 43402390.0,
|
|
"step": 23520
|
|
},
|
|
{
|
|
"entropy": 5.573892974853516,
|
|
"epoch": 1.9764335223692502,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00046075514521470005,
|
|
"loss": 5.2718,
|
|
"mean_token_accuracy": 0.17329889982938768,
|
|
"num_tokens": 43411479.0,
|
|
"step": 23525
|
|
},
|
|
{
|
|
"entropy": 5.554893112182617,
|
|
"epoch": 1.976853602184415,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00046073824510916005,
|
|
"loss": 5.2121,
|
|
"mean_token_accuracy": 0.17935867458581925,
|
|
"num_tokens": 43420402.0,
|
|
"step": 23530
|
|
},
|
|
{
|
|
"entropy": 5.622291040420532,
|
|
"epoch": 1.9772736819995798,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00046072134171336267,
|
|
"loss": 5.3531,
|
|
"mean_token_accuracy": 0.16644867211580278,
|
|
"num_tokens": 43429011.0,
|
|
"step": 23535
|
|
},
|
|
{
|
|
"entropy": 5.605536222457886,
|
|
"epoch": 1.9776937618147448,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004607044350276074,
|
|
"loss": 5.2344,
|
|
"mean_token_accuracy": 0.17794644683599473,
|
|
"num_tokens": 43438548.0,
|
|
"step": 23540
|
|
},
|
|
{
|
|
"entropy": 5.648398113250733,
|
|
"epoch": 1.9781138416299098,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046068752505219366,
|
|
"loss": 5.3322,
|
|
"mean_token_accuracy": 0.17605538964271544,
|
|
"num_tokens": 43448332.0,
|
|
"step": 23545
|
|
},
|
|
{
|
|
"entropy": 5.653730773925782,
|
|
"epoch": 1.9785339214450746,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000460670611787421,
|
|
"loss": 5.4006,
|
|
"mean_token_accuracy": 0.17038846909999847,
|
|
"num_tokens": 43457726.0,
|
|
"step": 23550
|
|
},
|
|
{
|
|
"entropy": 5.605834054946899,
|
|
"epoch": 1.9789540012602393,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004606536952335891,
|
|
"loss": 5.3285,
|
|
"mean_token_accuracy": 0.17592835873365403,
|
|
"num_tokens": 43466617.0,
|
|
"step": 23555
|
|
},
|
|
{
|
|
"entropy": 5.556175947189331,
|
|
"epoch": 1.9793740810754044,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046063677539099756,
|
|
"loss": 5.3061,
|
|
"mean_token_accuracy": 0.1715977743268013,
|
|
"num_tokens": 43476044.0,
|
|
"step": 23560
|
|
},
|
|
{
|
|
"entropy": 5.579178810119629,
|
|
"epoch": 1.9797941608905694,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046061985225994616,
|
|
"loss": 5.2886,
|
|
"mean_token_accuracy": 0.17626330852508545,
|
|
"num_tokens": 43485488.0,
|
|
"step": 23565
|
|
},
|
|
{
|
|
"entropy": 5.6485466957092285,
|
|
"epoch": 1.9802142407057342,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00046060292584073465,
|
|
"loss": 5.3135,
|
|
"mean_token_accuracy": 0.17889403253793718,
|
|
"num_tokens": 43494423.0,
|
|
"step": 23570
|
|
},
|
|
{
|
|
"entropy": 5.590170574188233,
|
|
"epoch": 1.980634320520899,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00046058599613366287,
|
|
"loss": 5.1856,
|
|
"mean_token_accuracy": 0.1918771132826805,
|
|
"num_tokens": 43502874.0,
|
|
"step": 23575
|
|
},
|
|
{
|
|
"entropy": 5.672195100784302,
|
|
"epoch": 1.9810544003360637,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004605690631390308,
|
|
"loss": 5.4446,
|
|
"mean_token_accuracy": 0.16917974948883058,
|
|
"num_tokens": 43512222.0,
|
|
"step": 23580
|
|
},
|
|
{
|
|
"entropy": 5.618256378173828,
|
|
"epoch": 1.9814744801512287,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004605521268571382,
|
|
"loss": 5.3509,
|
|
"mean_token_accuracy": 0.17687894701957702,
|
|
"num_tokens": 43521577.0,
|
|
"step": 23585
|
|
},
|
|
{
|
|
"entropy": 5.707334375381469,
|
|
"epoch": 1.9818945599663937,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.00046053518728828534,
|
|
"loss": 5.3422,
|
|
"mean_token_accuracy": 0.17523998022079468,
|
|
"num_tokens": 43529763.0,
|
|
"step": 23590
|
|
},
|
|
{
|
|
"entropy": 5.680575227737426,
|
|
"epoch": 1.9823146397815585,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004605182444327721,
|
|
"loss": 5.3651,
|
|
"mean_token_accuracy": 0.16793065816164016,
|
|
"num_tokens": 43538663.0,
|
|
"step": 23595
|
|
},
|
|
{
|
|
"entropy": 5.517296218872071,
|
|
"epoch": 1.9827347195967233,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004605012982908987,
|
|
"loss": 5.1665,
|
|
"mean_token_accuracy": 0.18656257838010787,
|
|
"num_tokens": 43547302.0,
|
|
"step": 23600
|
|
},
|
|
{
|
|
"entropy": 5.56163215637207,
|
|
"epoch": 1.983154799411888,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00046048434886296536,
|
|
"loss": 5.3806,
|
|
"mean_token_accuracy": 0.16650519967079164,
|
|
"num_tokens": 43557222.0,
|
|
"step": 23605
|
|
},
|
|
{
|
|
"entropy": 5.647756576538086,
|
|
"epoch": 1.9835748792270531,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004604673961492722,
|
|
"loss": 5.2736,
|
|
"mean_token_accuracy": 0.18116023987531663,
|
|
"num_tokens": 43566210.0,
|
|
"step": 23610
|
|
},
|
|
{
|
|
"entropy": 5.562030267715454,
|
|
"epoch": 1.9839949590422181,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00046045044015011975,
|
|
"loss": 5.2476,
|
|
"mean_token_accuracy": 0.1800748810172081,
|
|
"num_tokens": 43576275.0,
|
|
"step": 23615
|
|
},
|
|
{
|
|
"entropy": 5.579679298400879,
|
|
"epoch": 1.984415038857383,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004604334808658081,
|
|
"loss": 5.3723,
|
|
"mean_token_accuracy": 0.17557549476623535,
|
|
"num_tokens": 43585480.0,
|
|
"step": 23620
|
|
},
|
|
{
|
|
"entropy": 5.653260231018066,
|
|
"epoch": 1.9848351186725477,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00046041651829663787,
|
|
"loss": 5.3961,
|
|
"mean_token_accuracy": 0.17293741554021835,
|
|
"num_tokens": 43593911.0,
|
|
"step": 23625
|
|
},
|
|
{
|
|
"entropy": 5.6270537853240965,
|
|
"epoch": 1.9852551984877127,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046039955244290957,
|
|
"loss": 5.3168,
|
|
"mean_token_accuracy": 0.17904412150382995,
|
|
"num_tokens": 43604029.0,
|
|
"step": 23630
|
|
},
|
|
{
|
|
"entropy": 5.683732986450195,
|
|
"epoch": 1.9856752783028777,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00046038258330492363,
|
|
"loss": 5.3514,
|
|
"mean_token_accuracy": 0.17994878441095352,
|
|
"num_tokens": 43613248.0,
|
|
"step": 23635
|
|
},
|
|
{
|
|
"entropy": 5.647359848022461,
|
|
"epoch": 1.9860953581180425,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004603656108829806,
|
|
"loss": 5.3049,
|
|
"mean_token_accuracy": 0.17984101325273513,
|
|
"num_tokens": 43623232.0,
|
|
"step": 23640
|
|
},
|
|
{
|
|
"entropy": 5.658777713775635,
|
|
"epoch": 1.9865154379332073,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00046034863517738136,
|
|
"loss": 5.3651,
|
|
"mean_token_accuracy": 0.16325145363807678,
|
|
"num_tokens": 43632999.0,
|
|
"step": 23645
|
|
},
|
|
{
|
|
"entropy": 5.650898551940918,
|
|
"epoch": 1.986935517748372,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00046033165618842637,
|
|
"loss": 5.3269,
|
|
"mean_token_accuracy": 0.17500171065330505,
|
|
"num_tokens": 43641492.0,
|
|
"step": 23650
|
|
},
|
|
{
|
|
"entropy": 5.711059141159057,
|
|
"epoch": 1.987355597563537,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046031467391641657,
|
|
"loss": 5.314,
|
|
"mean_token_accuracy": 0.1773490861058235,
|
|
"num_tokens": 43650999.0,
|
|
"step": 23655
|
|
},
|
|
{
|
|
"entropy": 5.643770027160644,
|
|
"epoch": 1.987775677378702,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004602976883616527,
|
|
"loss": 5.3811,
|
|
"mean_token_accuracy": 0.16796135902404785,
|
|
"num_tokens": 43660777.0,
|
|
"step": 23660
|
|
},
|
|
{
|
|
"entropy": 5.592342329025269,
|
|
"epoch": 1.9881957571938669,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.00046028069952443575,
|
|
"loss": 5.3036,
|
|
"mean_token_accuracy": 0.17716382443904877,
|
|
"num_tokens": 43670404.0,
|
|
"step": 23665
|
|
},
|
|
{
|
|
"entropy": 5.582193326950073,
|
|
"epoch": 1.9886158370090317,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046026370740506663,
|
|
"loss": 5.2388,
|
|
"mean_token_accuracy": 0.1850288465619087,
|
|
"num_tokens": 43679183.0,
|
|
"step": 23670
|
|
},
|
|
{
|
|
"entropy": 5.575860261917114,
|
|
"epoch": 1.9890359168241964,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004602467120038463,
|
|
"loss": 5.26,
|
|
"mean_token_accuracy": 0.17996072322130202,
|
|
"num_tokens": 43688080.0,
|
|
"step": 23675
|
|
},
|
|
{
|
|
"entropy": 5.633952903747558,
|
|
"epoch": 1.9894559966393615,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00046022971332107586,
|
|
"loss": 5.2255,
|
|
"mean_token_accuracy": 0.18307080417871474,
|
|
"num_tokens": 43697271.0,
|
|
"step": 23680
|
|
},
|
|
{
|
|
"entropy": 5.565424203872681,
|
|
"epoch": 1.9898760764545265,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00046021271135705637,
|
|
"loss": 5.2542,
|
|
"mean_token_accuracy": 0.183968748152256,
|
|
"num_tokens": 43705541.0,
|
|
"step": 23685
|
|
},
|
|
{
|
|
"entropy": 5.609846735000611,
|
|
"epoch": 1.9902961562696913,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004601957061120891,
|
|
"loss": 5.3808,
|
|
"mean_token_accuracy": 0.17398780435323716,
|
|
"num_tokens": 43713701.0,
|
|
"step": 23690
|
|
},
|
|
{
|
|
"entropy": 5.566676950454712,
|
|
"epoch": 1.990716236084856,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004601786975864753,
|
|
"loss": 5.3329,
|
|
"mean_token_accuracy": 0.18383182138204573,
|
|
"num_tokens": 43723050.0,
|
|
"step": 23695
|
|
},
|
|
{
|
|
"entropy": 5.576835489273071,
|
|
"epoch": 1.991136315900021,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004601616857805162,
|
|
"loss": 5.3136,
|
|
"mean_token_accuracy": 0.180113722383976,
|
|
"num_tokens": 43733029.0,
|
|
"step": 23700
|
|
},
|
|
{
|
|
"entropy": 5.5873369693756105,
|
|
"epoch": 1.9915563957151858,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004601446706945132,
|
|
"loss": 5.2822,
|
|
"mean_token_accuracy": 0.1761482909321785,
|
|
"num_tokens": 43741818.0,
|
|
"step": 23705
|
|
},
|
|
{
|
|
"entropy": 5.640828418731689,
|
|
"epoch": 1.9919764755303508,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00046012765232876767,
|
|
"loss": 5.3156,
|
|
"mean_token_accuracy": 0.17892836183309554,
|
|
"num_tokens": 43750755.0,
|
|
"step": 23710
|
|
},
|
|
{
|
|
"entropy": 5.56833963394165,
|
|
"epoch": 1.9923965553455156,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004601106306835811,
|
|
"loss": 5.2021,
|
|
"mean_token_accuracy": 0.18445106595754623,
|
|
"num_tokens": 43759135.0,
|
|
"step": 23715
|
|
},
|
|
{
|
|
"entropy": 5.560920858383179,
|
|
"epoch": 1.9928166351606804,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004600936057592551,
|
|
"loss": 5.1672,
|
|
"mean_token_accuracy": 0.1868069976568222,
|
|
"num_tokens": 43767629.0,
|
|
"step": 23720
|
|
},
|
|
{
|
|
"entropy": 5.553515100479126,
|
|
"epoch": 1.9932367149758454,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00046007657755609113,
|
|
"loss": 5.354,
|
|
"mean_token_accuracy": 0.1743677958846092,
|
|
"num_tokens": 43776561.0,
|
|
"step": 23725
|
|
},
|
|
{
|
|
"entropy": 5.648859310150146,
|
|
"epoch": 1.9936567947910104,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004600595460743908,
|
|
"loss": 5.4235,
|
|
"mean_token_accuracy": 0.16585010588169097,
|
|
"num_tokens": 43786569.0,
|
|
"step": 23730
|
|
},
|
|
{
|
|
"entropy": 5.628311204910278,
|
|
"epoch": 1.9940768746061752,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000460042511314456,
|
|
"loss": 5.3687,
|
|
"mean_token_accuracy": 0.16907652020454406,
|
|
"num_tokens": 43795621.0,
|
|
"step": 23735
|
|
},
|
|
{
|
|
"entropy": 5.744281530380249,
|
|
"epoch": 1.99449695442134,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00046002547327658847,
|
|
"loss": 5.3597,
|
|
"mean_token_accuracy": 0.1761852040886879,
|
|
"num_tokens": 43804728.0,
|
|
"step": 23740
|
|
},
|
|
{
|
|
"entropy": 5.586940860748291,
|
|
"epoch": 1.9949170342365048,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004600084319610898,
|
|
"loss": 5.2577,
|
|
"mean_token_accuracy": 0.18133593946695328,
|
|
"num_tokens": 43813495.0,
|
|
"step": 23745
|
|
},
|
|
{
|
|
"entropy": 5.504205417633057,
|
|
"epoch": 1.9953371140516698,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004599913873682621,
|
|
"loss": 5.2068,
|
|
"mean_token_accuracy": 0.1786206528544426,
|
|
"num_tokens": 43823791.0,
|
|
"step": 23750
|
|
},
|
|
{
|
|
"entropy": 5.565213632583618,
|
|
"epoch": 1.9957571938668348,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00045997433949840724,
|
|
"loss": 5.2772,
|
|
"mean_token_accuracy": 0.18052580058574677,
|
|
"num_tokens": 43833904.0,
|
|
"step": 23755
|
|
},
|
|
{
|
|
"entropy": 5.663149499893189,
|
|
"epoch": 1.9961772736819996,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00045995728835182716,
|
|
"loss": 5.364,
|
|
"mean_token_accuracy": 0.1738879531621933,
|
|
"num_tokens": 43843430.0,
|
|
"step": 23760
|
|
},
|
|
{
|
|
"entropy": 5.670080518722534,
|
|
"epoch": 1.9965973534971644,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.00045994023392882395,
|
|
"loss": 5.3107,
|
|
"mean_token_accuracy": 0.1848461866378784,
|
|
"num_tokens": 43851405.0,
|
|
"step": 23765
|
|
},
|
|
{
|
|
"entropy": 5.584572267532349,
|
|
"epoch": 1.9970174333123294,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00045992317622969977,
|
|
"loss": 5.3923,
|
|
"mean_token_accuracy": 0.17312257885932922,
|
|
"num_tokens": 43860034.0,
|
|
"step": 23770
|
|
},
|
|
{
|
|
"entropy": 5.558753299713135,
|
|
"epoch": 1.9974375131274942,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00045990611525475675,
|
|
"loss": 5.3231,
|
|
"mean_token_accuracy": 0.17416706085205078,
|
|
"num_tokens": 43869371.0,
|
|
"step": 23775
|
|
},
|
|
{
|
|
"entropy": 5.634297561645508,
|
|
"epoch": 1.9978575929426592,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004598890510042971,
|
|
"loss": 5.3685,
|
|
"mean_token_accuracy": 0.1768188074231148,
|
|
"num_tokens": 43878462.0,
|
|
"step": 23780
|
|
},
|
|
{
|
|
"entropy": 5.6588939189910885,
|
|
"epoch": 1.998277672757824,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000459871983478623,
|
|
"loss": 5.2981,
|
|
"mean_token_accuracy": 0.17682368606328963,
|
|
"num_tokens": 43887435.0,
|
|
"step": 23785
|
|
},
|
|
{
|
|
"entropy": 5.586805820465088,
|
|
"epoch": 1.9986977525729888,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045985491267803703,
|
|
"loss": 5.3459,
|
|
"mean_token_accuracy": 0.1741949737071991,
|
|
"num_tokens": 43896720.0,
|
|
"step": 23790
|
|
},
|
|
{
|
|
"entropy": 5.552562236785889,
|
|
"epoch": 1.9991178323881538,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00045983783860284146,
|
|
"loss": 5.3472,
|
|
"mean_token_accuracy": 0.1720125764608383,
|
|
"num_tokens": 43906403.0,
|
|
"step": 23795
|
|
},
|
|
{
|
|
"entropy": 5.672985076904297,
|
|
"epoch": 1.9995379122033188,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00045982076125333874,
|
|
"loss": 5.3871,
|
|
"mean_token_accuracy": 0.16746917366981506,
|
|
"num_tokens": 43915059.0,
|
|
"step": 23800
|
|
},
|
|
{
|
|
"entropy": 5.748750972747803,
|
|
"epoch": 1.9999579920184836,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00045980368062983147,
|
|
"loss": 5.4214,
|
|
"mean_token_accuracy": 0.17349109947681426,
|
|
"num_tokens": 43925598.0,
|
|
"step": 23805
|
|
},
|
|
{
|
|
"entropy": 5.655678378211127,
|
|
"epoch": 2.000336063852132,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004597865967326221,
|
|
"loss": 5.2086,
|
|
"mean_token_accuracy": 0.18223923444747925,
|
|
"num_tokens": 43934471.0,
|
|
"step": 23810
|
|
},
|
|
{
|
|
"entropy": 5.576621007919312,
|
|
"epoch": 2.0007561436672967,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.00045976950956201325,
|
|
"loss": 5.3083,
|
|
"mean_token_accuracy": 0.17662405222654343,
|
|
"num_tokens": 43944451.0,
|
|
"step": 23815
|
|
},
|
|
{
|
|
"entropy": 5.652852296829224,
|
|
"epoch": 2.0011762234824615,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004597524191183078,
|
|
"loss": 5.2708,
|
|
"mean_token_accuracy": 0.18469424694776534,
|
|
"num_tokens": 43953892.0,
|
|
"step": 23820
|
|
},
|
|
{
|
|
"entropy": 5.648382472991943,
|
|
"epoch": 2.0015963032976267,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004597353254018082,
|
|
"loss": 5.3562,
|
|
"mean_token_accuracy": 0.17324745506048203,
|
|
"num_tokens": 43963155.0,
|
|
"step": 23825
|
|
},
|
|
{
|
|
"entropy": 5.600753879547119,
|
|
"epoch": 2.0020163831127915,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004597182284128177,
|
|
"loss": 5.2033,
|
|
"mean_token_accuracy": 0.18439362943172455,
|
|
"num_tokens": 43972468.0,
|
|
"step": 23830
|
|
},
|
|
{
|
|
"entropy": 5.721098184585571,
|
|
"epoch": 2.0024364629279563,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004597011281516387,
|
|
"loss": 5.4567,
|
|
"mean_token_accuracy": 0.16708213537931443,
|
|
"num_tokens": 43982709.0,
|
|
"step": 23835
|
|
},
|
|
{
|
|
"entropy": 5.576871728897094,
|
|
"epoch": 2.002856542743121,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00045968402461857435,
|
|
"loss": 5.2333,
|
|
"mean_token_accuracy": 0.18440057784318925,
|
|
"num_tokens": 43992607.0,
|
|
"step": 23840
|
|
},
|
|
{
|
|
"entropy": 5.603885555267334,
|
|
"epoch": 2.003276622558286,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00045966691781392763,
|
|
"loss": 5.1848,
|
|
"mean_token_accuracy": 0.18089883625507355,
|
|
"num_tokens": 44001265.0,
|
|
"step": 23845
|
|
},
|
|
{
|
|
"entropy": 5.653714561462403,
|
|
"epoch": 2.003696702373451,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00045964980773800156,
|
|
"loss": 5.4064,
|
|
"mean_token_accuracy": 0.1741128757596016,
|
|
"num_tokens": 44010440.0,
|
|
"step": 23850
|
|
},
|
|
{
|
|
"entropy": 5.640526151657104,
|
|
"epoch": 2.004116782188616,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004596326943910993,
|
|
"loss": 5.2281,
|
|
"mean_token_accuracy": 0.17560428082942964,
|
|
"num_tokens": 44020237.0,
|
|
"step": 23855
|
|
},
|
|
{
|
|
"entropy": 5.618943929672241,
|
|
"epoch": 2.0045368620037807,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00045961557777352376,
|
|
"loss": 5.3358,
|
|
"mean_token_accuracy": 0.17574749439954757,
|
|
"num_tokens": 44028976.0,
|
|
"step": 23860
|
|
},
|
|
{
|
|
"entropy": 5.63735933303833,
|
|
"epoch": 2.0049569418189455,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.00045959845788557844,
|
|
"loss": 5.2992,
|
|
"mean_token_accuracy": 0.17881839573383332,
|
|
"num_tokens": 44038186.0,
|
|
"step": 23865
|
|
},
|
|
{
|
|
"entropy": 5.616828918457031,
|
|
"epoch": 2.0053770216341107,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004595813347275665,
|
|
"loss": 5.2725,
|
|
"mean_token_accuracy": 0.17441747933626175,
|
|
"num_tokens": 44047780.0,
|
|
"step": 23870
|
|
},
|
|
{
|
|
"entropy": 5.60105562210083,
|
|
"epoch": 2.0057971014492755,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004595642082997912,
|
|
"loss": 5.211,
|
|
"mean_token_accuracy": 0.18210149556398392,
|
|
"num_tokens": 44056678.0,
|
|
"step": 23875
|
|
},
|
|
{
|
|
"entropy": 5.614001226425171,
|
|
"epoch": 2.0062171812644403,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000459547078602556,
|
|
"loss": 5.2657,
|
|
"mean_token_accuracy": 0.17585084587335587,
|
|
"num_tokens": 44066428.0,
|
|
"step": 23880
|
|
},
|
|
{
|
|
"entropy": 5.570787191390991,
|
|
"epoch": 2.006637261079605,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00045952994563616434,
|
|
"loss": 5.2614,
|
|
"mean_token_accuracy": 0.1772843211889267,
|
|
"num_tokens": 44075285.0,
|
|
"step": 23885
|
|
},
|
|
{
|
|
"entropy": 5.624676752090454,
|
|
"epoch": 2.00705734089477,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004595128094009197,
|
|
"loss": 5.2494,
|
|
"mean_token_accuracy": 0.1796739473938942,
|
|
"num_tokens": 44084333.0,
|
|
"step": 23890
|
|
},
|
|
{
|
|
"entropy": 5.634045553207398,
|
|
"epoch": 2.007477420709935,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004594956698971256,
|
|
"loss": 5.2697,
|
|
"mean_token_accuracy": 0.17147087454795837,
|
|
"num_tokens": 44093504.0,
|
|
"step": 23895
|
|
},
|
|
{
|
|
"entropy": 5.668183422088623,
|
|
"epoch": 2.0078975005251,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004594785271250858,
|
|
"loss": 5.2788,
|
|
"mean_token_accuracy": 0.17484120875597,
|
|
"num_tokens": 44102887.0,
|
|
"step": 23900
|
|
},
|
|
{
|
|
"entropy": 5.561066436767578,
|
|
"epoch": 2.0083175803402646,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004594613810851039,
|
|
"loss": 5.2687,
|
|
"mean_token_accuracy": 0.1750637874007225,
|
|
"num_tokens": 44113074.0,
|
|
"step": 23905
|
|
},
|
|
{
|
|
"entropy": 5.519744539260865,
|
|
"epoch": 2.0087376601554294,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00045944423177748353,
|
|
"loss": 5.2696,
|
|
"mean_token_accuracy": 0.18134041875600815,
|
|
"num_tokens": 44122557.0,
|
|
"step": 23910
|
|
},
|
|
{
|
|
"entropy": 5.658271312713623,
|
|
"epoch": 2.009157739970594,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00045942707920252864,
|
|
"loss": 5.2783,
|
|
"mean_token_accuracy": 0.17392106503248214,
|
|
"num_tokens": 44130198.0,
|
|
"step": 23915
|
|
},
|
|
{
|
|
"entropy": 5.630684757232666,
|
|
"epoch": 2.0095778197857594,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000459409923360543,
|
|
"loss": 5.2459,
|
|
"mean_token_accuracy": 0.18495004624128342,
|
|
"num_tokens": 44139267.0,
|
|
"step": 23920
|
|
},
|
|
{
|
|
"entropy": 5.597193384170533,
|
|
"epoch": 2.0099978996009242,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004593927642518305,
|
|
"loss": 5.3217,
|
|
"mean_token_accuracy": 0.17152093052864076,
|
|
"num_tokens": 44149620.0,
|
|
"step": 23925
|
|
},
|
|
{
|
|
"entropy": 5.581966161727905,
|
|
"epoch": 2.010417979416089,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004593756018766951,
|
|
"loss": 5.1661,
|
|
"mean_token_accuracy": 0.18001709878444672,
|
|
"num_tokens": 44158678.0,
|
|
"step": 23930
|
|
},
|
|
{
|
|
"entropy": 5.503856134414673,
|
|
"epoch": 2.010838059231254,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00045935843623544093,
|
|
"loss": 5.1473,
|
|
"mean_token_accuracy": 0.18169627338647842,
|
|
"num_tokens": 44167376.0,
|
|
"step": 23935
|
|
},
|
|
{
|
|
"entropy": 5.6009259700775145,
|
|
"epoch": 2.011258139046419,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004593412673283719,
|
|
"loss": 5.275,
|
|
"mean_token_accuracy": 0.17766901403665541,
|
|
"num_tokens": 44176001.0,
|
|
"step": 23940
|
|
},
|
|
{
|
|
"entropy": 5.688672161102295,
|
|
"epoch": 2.011678218861584,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00045932409515579226,
|
|
"loss": 5.3321,
|
|
"mean_token_accuracy": 0.17283178567886354,
|
|
"num_tokens": 44185132.0,
|
|
"step": 23945
|
|
},
|
|
{
|
|
"entropy": 5.570486927032471,
|
|
"epoch": 2.0120982986767486,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00045930691971800627,
|
|
"loss": 5.2738,
|
|
"mean_token_accuracy": 0.1786741316318512,
|
|
"num_tokens": 44193256.0,
|
|
"step": 23950
|
|
},
|
|
{
|
|
"entropy": 5.657260227203369,
|
|
"epoch": 2.0125183784919134,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00045928974101531805,
|
|
"loss": 5.37,
|
|
"mean_token_accuracy": 0.17304892987012863,
|
|
"num_tokens": 44202884.0,
|
|
"step": 23955
|
|
},
|
|
{
|
|
"entropy": 5.7106156826019285,
|
|
"epoch": 2.012938458307078,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004592725590480319,
|
|
"loss": 5.3492,
|
|
"mean_token_accuracy": 0.16924804002046584,
|
|
"num_tokens": 44212826.0,
|
|
"step": 23960
|
|
},
|
|
{
|
|
"entropy": 5.6548271656036375,
|
|
"epoch": 2.0133585381222434,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004592553738164524,
|
|
"loss": 5.3199,
|
|
"mean_token_accuracy": 0.16807449012994766,
|
|
"num_tokens": 44222369.0,
|
|
"step": 23965
|
|
},
|
|
{
|
|
"entropy": 5.565683746337891,
|
|
"epoch": 2.013778617937408,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004592381853208837,
|
|
"loss": 5.2165,
|
|
"mean_token_accuracy": 0.17430078536272048,
|
|
"num_tokens": 44230964.0,
|
|
"step": 23970
|
|
},
|
|
{
|
|
"entropy": 5.604999732971192,
|
|
"epoch": 2.014198697752573,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004592209935616304,
|
|
"loss": 5.289,
|
|
"mean_token_accuracy": 0.17769130319356918,
|
|
"num_tokens": 44240199.0,
|
|
"step": 23975
|
|
},
|
|
{
|
|
"entropy": 5.645999479293823,
|
|
"epoch": 2.0146187775677378,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004592037985389971,
|
|
"loss": 5.2669,
|
|
"mean_token_accuracy": 0.18346799314022064,
|
|
"num_tokens": 44249857.0,
|
|
"step": 23980
|
|
},
|
|
{
|
|
"entropy": 5.532536315917969,
|
|
"epoch": 2.0150388573829026,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004591866002532885,
|
|
"loss": 5.2317,
|
|
"mean_token_accuracy": 0.17959018796682358,
|
|
"num_tokens": 44258364.0,
|
|
"step": 23985
|
|
},
|
|
{
|
|
"entropy": 5.497239446640014,
|
|
"epoch": 2.015458937198068,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00045916939870480896,
|
|
"loss": 5.1629,
|
|
"mean_token_accuracy": 0.18248820006847383,
|
|
"num_tokens": 44267473.0,
|
|
"step": 23990
|
|
},
|
|
{
|
|
"entropy": 5.64896559715271,
|
|
"epoch": 2.0158790170132326,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00045915219389386336,
|
|
"loss": 5.2054,
|
|
"mean_token_accuracy": 0.1814291298389435,
|
|
"num_tokens": 44276665.0,
|
|
"step": 23995
|
|
},
|
|
{
|
|
"entropy": 5.596774005889893,
|
|
"epoch": 2.0162990968283974,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004591349858207565,
|
|
"loss": 5.2614,
|
|
"mean_token_accuracy": 0.1755758687853813,
|
|
"num_tokens": 44285928.0,
|
|
"step": 24000
|
|
},
|
|
{
|
|
"epoch": 2.0162990968283974,
|
|
"eval_entropy": 5.367871912509312,
|
|
"eval_loss": 5.36544132232666,
|
|
"eval_mean_token_accuracy": 0.18292493719046923,
|
|
"eval_num_tokens": 44285928.0,
|
|
"eval_runtime": 27.3301,
|
|
"eval_samples_per_second": 1367.212,
|
|
"eval_steps_per_second": 170.911,
|
|
"step": 24000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 119020,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.4819220631552e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|