3877 lines
106 KiB
JSON
3877 lines
106 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.0,
|
|
"eval_steps": 500,
|
|
"global_step": 3820,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 1.4903042420744896,
|
|
"epoch": 0.005235944760782774,
|
|
"grad_norm": 7.59375,
|
|
"learning_rate": 1.1780104712041885e-06,
|
|
"loss": 1.6362,
|
|
"mean_token_accuracy": 0.6602190021425486,
|
|
"num_tokens": 148667.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 1.5531679213047027,
|
|
"epoch": 0.010471889521565548,
|
|
"grad_norm": 6.9375,
|
|
"learning_rate": 2.486910994764398e-06,
|
|
"loss": 1.7071,
|
|
"mean_token_accuracy": 0.6444286152720451,
|
|
"num_tokens": 289380.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 1.5901738341897727,
|
|
"epoch": 0.015707834282348322,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 3.7958115183246074e-06,
|
|
"loss": 1.6751,
|
|
"mean_token_accuracy": 0.6434450890868902,
|
|
"num_tokens": 433015.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 1.5159442014992237,
|
|
"epoch": 0.020943779043131095,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 5.104712041884817e-06,
|
|
"loss": 1.5104,
|
|
"mean_token_accuracy": 0.6608107829466462,
|
|
"num_tokens": 586196.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 1.5390321850776671,
|
|
"epoch": 0.026179723803913868,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 6.4136125654450265e-06,
|
|
"loss": 1.4851,
|
|
"mean_token_accuracy": 0.664670011587441,
|
|
"num_tokens": 732500.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 1.486197516322136,
|
|
"epoch": 0.031415668564696644,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 7.722513089005236e-06,
|
|
"loss": 1.4375,
|
|
"mean_token_accuracy": 0.6728810863569379,
|
|
"num_tokens": 871796.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 1.4761194687336683,
|
|
"epoch": 0.036651613325479414,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 9.031413612565446e-06,
|
|
"loss": 1.4175,
|
|
"mean_token_accuracy": 0.6700865641236305,
|
|
"num_tokens": 1015760.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 1.397418873384595,
|
|
"epoch": 0.04188755808626219,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 1.0340314136125655e-05,
|
|
"loss": 1.3627,
|
|
"mean_token_accuracy": 0.6793193189427257,
|
|
"num_tokens": 1162521.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 1.4198294993489982,
|
|
"epoch": 0.04712350284704497,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 1.1649214659685865e-05,
|
|
"loss": 1.3982,
|
|
"mean_token_accuracy": 0.678496933169663,
|
|
"num_tokens": 1304606.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 1.3636814955621959,
|
|
"epoch": 0.052359447607827736,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 1.2958115183246074e-05,
|
|
"loss": 1.3157,
|
|
"mean_token_accuracy": 0.6834567856043577,
|
|
"num_tokens": 1459672.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 1.371455504372716,
|
|
"epoch": 0.05759539236861051,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 1.4267015706806284e-05,
|
|
"loss": 1.337,
|
|
"mean_token_accuracy": 0.680043394304812,
|
|
"num_tokens": 1614513.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 1.4191205739974975,
|
|
"epoch": 0.06283133712939329,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 1.5575916230366495e-05,
|
|
"loss": 1.3601,
|
|
"mean_token_accuracy": 0.6790978884324431,
|
|
"num_tokens": 1764120.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 1.4203862166032195,
|
|
"epoch": 0.06806728189017607,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.68848167539267e-05,
|
|
"loss": 1.3384,
|
|
"mean_token_accuracy": 0.6819259503856301,
|
|
"num_tokens": 1905542.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 1.4149893302470447,
|
|
"epoch": 0.07330322665095883,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 1.8193717277486914e-05,
|
|
"loss": 1.3124,
|
|
"mean_token_accuracy": 0.6826399000361562,
|
|
"num_tokens": 2051546.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 1.319907895848155,
|
|
"epoch": 0.0785391714117416,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 1.950261780104712e-05,
|
|
"loss": 1.2291,
|
|
"mean_token_accuracy": 0.6927984276786446,
|
|
"num_tokens": 2197518.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 1.3174231912940741,
|
|
"epoch": 0.08377511617252438,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 2.0811518324607333e-05,
|
|
"loss": 1.2863,
|
|
"mean_token_accuracy": 0.6850969936698675,
|
|
"num_tokens": 2348210.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 1.2859620593488217,
|
|
"epoch": 0.08901106093330716,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 2.212041884816754e-05,
|
|
"loss": 1.2469,
|
|
"mean_token_accuracy": 0.694661033526063,
|
|
"num_tokens": 2488974.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 1.3225422732532024,
|
|
"epoch": 0.09424700569408993,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 2.3429319371727752e-05,
|
|
"loss": 1.3216,
|
|
"mean_token_accuracy": 0.6867891995236277,
|
|
"num_tokens": 2638402.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 1.2810360241681338,
|
|
"epoch": 0.0994829504548727,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 2.473821989528796e-05,
|
|
"loss": 1.2682,
|
|
"mean_token_accuracy": 0.6961364889517426,
|
|
"num_tokens": 2780719.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 1.30760794095695,
|
|
"epoch": 0.10471889521565547,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 2.604712041884817e-05,
|
|
"loss": 1.3035,
|
|
"mean_token_accuracy": 0.6898889668285847,
|
|
"num_tokens": 2926604.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 1.264478771481663,
|
|
"epoch": 0.10995483997643825,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 2.7356020942408378e-05,
|
|
"loss": 1.2493,
|
|
"mean_token_accuracy": 0.6941331747919322,
|
|
"num_tokens": 3070548.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 1.2868385933339597,
|
|
"epoch": 0.11519078473722102,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 2.8664921465968587e-05,
|
|
"loss": 1.2975,
|
|
"mean_token_accuracy": 0.6899457449093461,
|
|
"num_tokens": 3222578.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 1.2868557438254355,
|
|
"epoch": 0.1204267294980038,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 2.99738219895288e-05,
|
|
"loss": 1.2843,
|
|
"mean_token_accuracy": 0.6913154577836395,
|
|
"num_tokens": 3367648.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 1.2696480546146631,
|
|
"epoch": 0.12566267425878658,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 3.1282722513089006e-05,
|
|
"loss": 1.271,
|
|
"mean_token_accuracy": 0.6934641852974892,
|
|
"num_tokens": 3513611.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 1.2611329367384314,
|
|
"epoch": 0.13089861901956934,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 3.2591623036649216e-05,
|
|
"loss": 1.27,
|
|
"mean_token_accuracy": 0.6962274318560958,
|
|
"num_tokens": 3654075.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 1.3185498464852572,
|
|
"epoch": 0.13613456378035213,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 3.3900523560209426e-05,
|
|
"loss": 1.3081,
|
|
"mean_token_accuracy": 0.6832702022045851,
|
|
"num_tokens": 3799547.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 1.2299459297209978,
|
|
"epoch": 0.1413705085411349,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 3.5209424083769635e-05,
|
|
"loss": 1.2355,
|
|
"mean_token_accuracy": 0.6997983153909445,
|
|
"num_tokens": 3940094.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 1.2170744601637125,
|
|
"epoch": 0.14660645330191766,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 3.6518324607329845e-05,
|
|
"loss": 1.2097,
|
|
"mean_token_accuracy": 0.7026064315810799,
|
|
"num_tokens": 4084450.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 1.2503085616976022,
|
|
"epoch": 0.15184239806270045,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 3.7827225130890054e-05,
|
|
"loss": 1.2366,
|
|
"mean_token_accuracy": 0.6935763908550143,
|
|
"num_tokens": 4235094.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 1.2002368062734603,
|
|
"epoch": 0.1570783428234832,
|
|
"grad_norm": 2.953125,
|
|
"learning_rate": 3.9136125654450264e-05,
|
|
"loss": 1.1993,
|
|
"mean_token_accuracy": 0.7031140483915805,
|
|
"num_tokens": 4378493.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 1.2433323854580522,
|
|
"epoch": 0.162314287584266,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 4.044502617801047e-05,
|
|
"loss": 1.2437,
|
|
"mean_token_accuracy": 0.6959625506773591,
|
|
"num_tokens": 4513351.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 1.2667765196412801,
|
|
"epoch": 0.16755023234504876,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 4.175392670157068e-05,
|
|
"loss": 1.2609,
|
|
"mean_token_accuracy": 0.6903780495747924,
|
|
"num_tokens": 4654221.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 1.257903415709734,
|
|
"epoch": 0.17278617710583152,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 4.306282722513089e-05,
|
|
"loss": 1.2591,
|
|
"mean_token_accuracy": 0.6927119480445981,
|
|
"num_tokens": 4793001.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 1.2474724128842354,
|
|
"epoch": 0.1780221218666143,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 4.43717277486911e-05,
|
|
"loss": 1.258,
|
|
"mean_token_accuracy": 0.6958453560248017,
|
|
"num_tokens": 4938800.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 1.271824512630701,
|
|
"epoch": 0.18325806662739708,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 4.568062827225131e-05,
|
|
"loss": 1.2777,
|
|
"mean_token_accuracy": 0.6864122781902552,
|
|
"num_tokens": 5088676.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 1.206870013102889,
|
|
"epoch": 0.18849401138817987,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 4.698952879581152e-05,
|
|
"loss": 1.2145,
|
|
"mean_token_accuracy": 0.7025035681203008,
|
|
"num_tokens": 5233017.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 1.2822908700443805,
|
|
"epoch": 0.19372995614896263,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 4.829842931937173e-05,
|
|
"loss": 1.281,
|
|
"mean_token_accuracy": 0.685633241944015,
|
|
"num_tokens": 5383911.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 1.2813241746276618,
|
|
"epoch": 0.1989659009097454,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 4.960732984293194e-05,
|
|
"loss": 1.2896,
|
|
"mean_token_accuracy": 0.6846682282164693,
|
|
"num_tokens": 5532300.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 1.2817171201109887,
|
|
"epoch": 0.20420184567052818,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 4.999948856244768e-05,
|
|
"loss": 1.2811,
|
|
"mean_token_accuracy": 0.6898531707003712,
|
|
"num_tokens": 5673323.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 1.2512086292728781,
|
|
"epoch": 0.20943779043131094,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 4.9996983612565773e-05,
|
|
"loss": 1.2605,
|
|
"mean_token_accuracy": 0.690356932580471,
|
|
"num_tokens": 5818798.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 1.2130850929766894,
|
|
"epoch": 0.21467373519209373,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 4.999239142174581e-05,
|
|
"loss": 1.2191,
|
|
"mean_token_accuracy": 0.6989637283608318,
|
|
"num_tokens": 5967139.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 1.2118938906118273,
|
|
"epoch": 0.2199096799528765,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 4.99857123734344e-05,
|
|
"loss": 1.2075,
|
|
"mean_token_accuracy": 0.6999017883092165,
|
|
"num_tokens": 6125485.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 1.2652900835499168,
|
|
"epoch": 0.22514562471365926,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 4.9976947025330155e-05,
|
|
"loss": 1.2729,
|
|
"mean_token_accuracy": 0.6881730291992426,
|
|
"num_tokens": 6271940.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 1.2763973344117403,
|
|
"epoch": 0.23038156947444205,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 4.9966096109337125e-05,
|
|
"loss": 1.3224,
|
|
"mean_token_accuracy": 0.6868822824209928,
|
|
"num_tokens": 6415021.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 1.2256551414728165,
|
|
"epoch": 0.2356175142352248,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 4.995316053150366e-05,
|
|
"loss": 1.2271,
|
|
"mean_token_accuracy": 0.6921151876449585,
|
|
"num_tokens": 6558653.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 1.233339687436819,
|
|
"epoch": 0.2408534589960076,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 4.993814137194681e-05,
|
|
"loss": 1.2679,
|
|
"mean_token_accuracy": 0.6938443537801504,
|
|
"num_tokens": 6698611.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 1.2777669046074152,
|
|
"epoch": 0.24608940375679036,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 4.9921039884762057e-05,
|
|
"loss": 1.2873,
|
|
"mean_token_accuracy": 0.6864250931888819,
|
|
"num_tokens": 6847385.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 1.197480170428753,
|
|
"epoch": 0.25132534851757315,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 4.9901857497918655e-05,
|
|
"loss": 1.2159,
|
|
"mean_token_accuracy": 0.7006905306130647,
|
|
"num_tokens": 6990503.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 1.2612487450242043,
|
|
"epoch": 0.2565612932783559,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 4.98805958131404e-05,
|
|
"loss": 1.2667,
|
|
"mean_token_accuracy": 0.6904748784378171,
|
|
"num_tokens": 7138114.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 1.2380808498710394,
|
|
"epoch": 0.2617972380391387,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 4.985725660577184e-05,
|
|
"loss": 1.2627,
|
|
"mean_token_accuracy": 0.693475303426385,
|
|
"num_tokens": 7295788.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 1.2590212849900126,
|
|
"epoch": 0.26703318279992144,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 4.983184182463009e-05,
|
|
"loss": 1.252,
|
|
"mean_token_accuracy": 0.6881766313686967,
|
|
"num_tokens": 7431663.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 1.2515497665852309,
|
|
"epoch": 0.27226912756070426,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 4.980435359184204e-05,
|
|
"loss": 1.2831,
|
|
"mean_token_accuracy": 0.693330561555922,
|
|
"num_tokens": 7569801.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 1.184147422760725,
|
|
"epoch": 0.277505072321487,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 4.977479420266723e-05,
|
|
"loss": 1.1646,
|
|
"mean_token_accuracy": 0.703077656775713,
|
|
"num_tokens": 7713092.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 1.2746197815984488,
|
|
"epoch": 0.2827410170822698,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 4.974316612530615e-05,
|
|
"loss": 1.2765,
|
|
"mean_token_accuracy": 0.6863818326964974,
|
|
"num_tokens": 7854137.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 1.2566815540194511,
|
|
"epoch": 0.28797696184305255,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 4.970947200069416e-05,
|
|
"loss": 1.2648,
|
|
"mean_token_accuracy": 0.6892008159309626,
|
|
"num_tokens": 7992676.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 1.2914229419082404,
|
|
"epoch": 0.2932129066038353,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 4.967371464228096e-05,
|
|
"loss": 1.3179,
|
|
"mean_token_accuracy": 0.6854391321539879,
|
|
"num_tokens": 8149846.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 1.2183680593967439,
|
|
"epoch": 0.29844885136461813,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 4.963589703579569e-05,
|
|
"loss": 1.24,
|
|
"mean_token_accuracy": 0.6942471470683813,
|
|
"num_tokens": 8288016.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 1.3149288706481457,
|
|
"epoch": 0.3036847961254009,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 4.959602233899762e-05,
|
|
"loss": 1.3198,
|
|
"mean_token_accuracy": 0.679352731257677,
|
|
"num_tokens": 8435936.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 1.234706364199519,
|
|
"epoch": 0.30892074088618365,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 4.955409388141243e-05,
|
|
"loss": 1.2742,
|
|
"mean_token_accuracy": 0.6961829710751772,
|
|
"num_tokens": 8576679.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 1.2482819214463234,
|
|
"epoch": 0.3141566856469664,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 4.9510115164054297e-05,
|
|
"loss": 1.2703,
|
|
"mean_token_accuracy": 0.6918804241344333,
|
|
"num_tokens": 8724159.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 1.2805782459676265,
|
|
"epoch": 0.3193926304077492,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 4.946408985913344e-05,
|
|
"loss": 1.3043,
|
|
"mean_token_accuracy": 0.6854454703629017,
|
|
"num_tokens": 8871635.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 1.2219614367932081,
|
|
"epoch": 0.324628575168532,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 4.941602180974958e-05,
|
|
"loss": 1.2703,
|
|
"mean_token_accuracy": 0.6931755296885967,
|
|
"num_tokens": 9025175.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 1.2477784302085637,
|
|
"epoch": 0.32986451992931476,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 4.9365915029571007e-05,
|
|
"loss": 1.2585,
|
|
"mean_token_accuracy": 0.6917018702253699,
|
|
"num_tokens": 9162826.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 1.230723000690341,
|
|
"epoch": 0.3351004646900975,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 4.9313773702499455e-05,
|
|
"loss": 1.2391,
|
|
"mean_token_accuracy": 0.6946986148133873,
|
|
"num_tokens": 9312989.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 1.2285594891756773,
|
|
"epoch": 0.3403364094508803,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 4.925960218232073e-05,
|
|
"loss": 1.2367,
|
|
"mean_token_accuracy": 0.6967854388058186,
|
|
"num_tokens": 9455995.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 1.3053442865610123,
|
|
"epoch": 0.34557235421166305,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 4.920340499234116e-05,
|
|
"loss": 1.311,
|
|
"mean_token_accuracy": 0.6871177144348621,
|
|
"num_tokens": 9595268.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 1.2352213632315396,
|
|
"epoch": 0.35080829897244586,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 4.914518682500995e-05,
|
|
"loss": 1.2577,
|
|
"mean_token_accuracy": 0.69151939060539,
|
|
"num_tokens": 9737073.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 1.2220519341528415,
|
|
"epoch": 0.3560442437332286,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 4.908495254152731e-05,
|
|
"loss": 1.2419,
|
|
"mean_token_accuracy": 0.7000335277989507,
|
|
"num_tokens": 9880500.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 1.2571235705167054,
|
|
"epoch": 0.3612801884940114,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 4.902270717143859e-05,
|
|
"loss": 1.2772,
|
|
"mean_token_accuracy": 0.6950660437345505,
|
|
"num_tokens": 10029677.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 1.1837443890050054,
|
|
"epoch": 0.36651613325479415,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 4.895845591221426e-05,
|
|
"loss": 1.1883,
|
|
"mean_token_accuracy": 0.7034358236938715,
|
|
"num_tokens": 10169257.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 1.1898296054452657,
|
|
"epoch": 0.3717520780155769,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 4.8892204128816e-05,
|
|
"loss": 1.2018,
|
|
"mean_token_accuracy": 0.6991554461419582,
|
|
"num_tokens": 10312298.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 1.2473948691040277,
|
|
"epoch": 0.37698802277635973,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 4.882395735324864e-05,
|
|
"loss": 1.2845,
|
|
"mean_token_accuracy": 0.6941293969750404,
|
|
"num_tokens": 10455848.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 1.217660278454423,
|
|
"epoch": 0.3822239675371425,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 4.87537212840983e-05,
|
|
"loss": 1.2161,
|
|
"mean_token_accuracy": 0.6975388413295149,
|
|
"num_tokens": 10582901.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 1.2198585540056228,
|
|
"epoch": 0.38745991229792526,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 4.8681501786056544e-05,
|
|
"loss": 1.2394,
|
|
"mean_token_accuracy": 0.6949820145964622,
|
|
"num_tokens": 10724617.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 1.2234349481761455,
|
|
"epoch": 0.392695857058708,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 4.860730488943068e-05,
|
|
"loss": 1.2335,
|
|
"mean_token_accuracy": 0.6962696801871061,
|
|
"num_tokens": 10869452.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 1.2394332230091094,
|
|
"epoch": 0.3979318018194908,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 4.8531136789640216e-05,
|
|
"loss": 1.2645,
|
|
"mean_token_accuracy": 0.695160668157041,
|
|
"num_tokens": 11017465.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 1.219304683059454,
|
|
"epoch": 0.4031677465802736,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 4.845300384669958e-05,
|
|
"loss": 1.2453,
|
|
"mean_token_accuracy": 0.6994039881974459,
|
|
"num_tokens": 11158876.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 1.256004797667265,
|
|
"epoch": 0.40840369134105636,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 4.837291258468701e-05,
|
|
"loss": 1.2915,
|
|
"mean_token_accuracy": 0.6878539452329278,
|
|
"num_tokens": 11297155.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 1.201148072630167,
|
|
"epoch": 0.4136396361018391,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 4.8290869691199834e-05,
|
|
"loss": 1.22,
|
|
"mean_token_accuracy": 0.7018963057547808,
|
|
"num_tokens": 11439337.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 1.233640456199646,
|
|
"epoch": 0.4188755808626219,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 4.820688201679605e-05,
|
|
"loss": 1.2485,
|
|
"mean_token_accuracy": 0.6938533913344145,
|
|
"num_tokens": 11585266.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 1.2794130560010673,
|
|
"epoch": 0.42411152562340465,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 4.812095657442231e-05,
|
|
"loss": 1.2894,
|
|
"mean_token_accuracy": 0.6880532244220376,
|
|
"num_tokens": 11725643.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 1.2399780409410597,
|
|
"epoch": 0.42934747038418747,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 4.803310053882831e-05,
|
|
"loss": 1.2887,
|
|
"mean_token_accuracy": 0.6891330601647496,
|
|
"num_tokens": 11878982.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 1.2244506664574146,
|
|
"epoch": 0.43458341514497023,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 4.794332124596775e-05,
|
|
"loss": 1.2513,
|
|
"mean_token_accuracy": 0.6966191967949271,
|
|
"num_tokens": 12025844.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 1.2087435230612755,
|
|
"epoch": 0.439819359905753,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 4.7851626192385745e-05,
|
|
"loss": 1.2347,
|
|
"mean_token_accuracy": 0.7009090483188629,
|
|
"num_tokens": 12177345.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 1.2146162753924727,
|
|
"epoch": 0.44505530466653576,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 4.775802303459288e-05,
|
|
"loss": 1.2465,
|
|
"mean_token_accuracy": 0.6959697719663381,
|
|
"num_tokens": 12320998.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 1.198814813606441,
|
|
"epoch": 0.4502912494273185,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 4.76625195884259e-05,
|
|
"loss": 1.2194,
|
|
"mean_token_accuracy": 0.6998335400596261,
|
|
"num_tokens": 12470540.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 1.1972925199195743,
|
|
"epoch": 0.45552719418810134,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 4.7565123828395064e-05,
|
|
"loss": 1.2199,
|
|
"mean_token_accuracy": 0.7027627993375063,
|
|
"num_tokens": 12605966.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 1.2037498267367481,
|
|
"epoch": 0.4607631389488841,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 4.7465843887018316e-05,
|
|
"loss": 1.2338,
|
|
"mean_token_accuracy": 0.7029284704476595,
|
|
"num_tokens": 12753176.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 1.2665604405105113,
|
|
"epoch": 0.46599908370966686,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 4.736468805414218e-05,
|
|
"loss": 1.2826,
|
|
"mean_token_accuracy": 0.6867102902382612,
|
|
"num_tokens": 12891112.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 1.2413052493706345,
|
|
"epoch": 0.4712350284704496,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 4.72616647762496e-05,
|
|
"loss": 1.2708,
|
|
"mean_token_accuracy": 0.6939191322773695,
|
|
"num_tokens": 13037695.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 1.2334762597456574,
|
|
"epoch": 0.4764709732312324,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 4.7156782655754625e-05,
|
|
"loss": 1.2801,
|
|
"mean_token_accuracy": 0.6962197717279196,
|
|
"num_tokens": 13188396.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 1.2926313485950232,
|
|
"epoch": 0.4817069179920152,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 4.7050050450284147e-05,
|
|
"loss": 1.2973,
|
|
"mean_token_accuracy": 0.6834666855633259,
|
|
"num_tokens": 13334753.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 1.2605458820238709,
|
|
"epoch": 0.48694286275279797,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 4.6941477071946594e-05,
|
|
"loss": 1.273,
|
|
"mean_token_accuracy": 0.6905643936246634,
|
|
"num_tokens": 13481005.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 1.2309048125520348,
|
|
"epoch": 0.49217880751358073,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 4.683107158658781e-05,
|
|
"loss": 1.2438,
|
|
"mean_token_accuracy": 0.6907353041693568,
|
|
"num_tokens": 13618213.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 1.2219564571976662,
|
|
"epoch": 0.4974147522743635,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 4.6718843213034067e-05,
|
|
"loss": 1.228,
|
|
"mean_token_accuracy": 0.6988438554108143,
|
|
"num_tokens": 13751965.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 1.1836062878370286,
|
|
"epoch": 0.5026506970351463,
|
|
"grad_norm": 6.375,
|
|
"learning_rate": 4.660480132232225e-05,
|
|
"loss": 1.2209,
|
|
"mean_token_accuracy": 0.6979983827099204,
|
|
"num_tokens": 13901640.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 1.1964640978723764,
|
|
"epoch": 0.507886641795929,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 4.648895543691741e-05,
|
|
"loss": 1.2005,
|
|
"mean_token_accuracy": 0.6987225420773029,
|
|
"num_tokens": 14047322.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 1.2625075351446866,
|
|
"epoch": 0.5131225865567118,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 4.637131522991764e-05,
|
|
"loss": 1.2557,
|
|
"mean_token_accuracy": 0.6897652598097921,
|
|
"num_tokens": 14188766.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 1.2458597056567668,
|
|
"epoch": 0.5183585313174947,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 4.625189052424638e-05,
|
|
"loss": 1.2676,
|
|
"mean_token_accuracy": 0.6945432106032967,
|
|
"num_tokens": 14329591.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 1.245772442035377,
|
|
"epoch": 0.5235944760782774,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 4.613069129183218e-05,
|
|
"loss": 1.2665,
|
|
"mean_token_accuracy": 0.6959560567513108,
|
|
"num_tokens": 14477258.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 1.2078047215938568,
|
|
"epoch": 0.5288304208390602,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 4.600772765277607e-05,
|
|
"loss": 1.2176,
|
|
"mean_token_accuracy": 0.697881168872118,
|
|
"num_tokens": 14633237.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 1.2214165650308133,
|
|
"epoch": 0.5340663655998429,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 4.588300987450652e-05,
|
|
"loss": 1.2345,
|
|
"mean_token_accuracy": 0.6999158889055253,
|
|
"num_tokens": 14776068.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 1.2305742222815752,
|
|
"epoch": 0.5393023103606257,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 4.575654837092214e-05,
|
|
"loss": 1.2398,
|
|
"mean_token_accuracy": 0.6949134254828095,
|
|
"num_tokens": 14913719.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 1.2660383846610785,
|
|
"epoch": 0.5445382551214085,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 4.5628353701522055e-05,
|
|
"loss": 1.2685,
|
|
"mean_token_accuracy": 0.6927320031449199,
|
|
"num_tokens": 15066188.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 1.2355303570628167,
|
|
"epoch": 0.5497741998821912,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 4.5498436570524296e-05,
|
|
"loss": 1.2563,
|
|
"mean_token_accuracy": 0.6943851266056299,
|
|
"num_tokens": 15211662.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 1.2411348339170218,
|
|
"epoch": 0.555010144642974,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 4.536680782597191e-05,
|
|
"loss": 1.2807,
|
|
"mean_token_accuracy": 0.6922256585210562,
|
|
"num_tokens": 15361477.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 1.2601716944947838,
|
|
"epoch": 0.5602460894037568,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 4.5233478458827176e-05,
|
|
"loss": 1.2964,
|
|
"mean_token_accuracy": 0.6930005580186844,
|
|
"num_tokens": 15512017.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 1.3123046960681677,
|
|
"epoch": 0.5654820341645396,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 4.509845960205389e-05,
|
|
"loss": 1.3435,
|
|
"mean_token_accuracy": 0.6821131203323603,
|
|
"num_tokens": 15666802.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 1.1636217474937438,
|
|
"epoch": 0.5707179789253224,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 4.496176252968774e-05,
|
|
"loss": 1.1911,
|
|
"mean_token_accuracy": 0.7117271330207586,
|
|
"num_tokens": 15811306.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 1.1874678194522859,
|
|
"epoch": 0.5759539236861051,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 4.4823398655894924e-05,
|
|
"loss": 1.1878,
|
|
"mean_token_accuracy": 0.7028247270733118,
|
|
"num_tokens": 15957078.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 1.247788500599563,
|
|
"epoch": 0.5811898684468879,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 4.468337953401908e-05,
|
|
"loss": 1.2483,
|
|
"mean_token_accuracy": 0.6906874619424344,
|
|
"num_tokens": 16099300.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 1.2340633975341917,
|
|
"epoch": 0.5864258132076706,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 4.45417168556166e-05,
|
|
"loss": 1.2385,
|
|
"mean_token_accuracy": 0.6969642581418156,
|
|
"num_tokens": 16241998.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 1.1755719013512134,
|
|
"epoch": 0.5916617579684534,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 4.4398422449480356e-05,
|
|
"loss": 1.203,
|
|
"mean_token_accuracy": 0.7083542978391051,
|
|
"num_tokens": 16387522.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 1.2200544375926257,
|
|
"epoch": 0.5968977027292363,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 4.425350828065204e-05,
|
|
"loss": 1.2805,
|
|
"mean_token_accuracy": 0.6941867485642433,
|
|
"num_tokens": 16529425.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 1.2577802315354347,
|
|
"epoch": 0.602133647490019,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 4.410698644942303e-05,
|
|
"loss": 1.2505,
|
|
"mean_token_accuracy": 0.692890228703618,
|
|
"num_tokens": 16670311.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 1.1618805171921849,
|
|
"epoch": 0.6073695922508018,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 4.395886919032406e-05,
|
|
"loss": 1.1903,
|
|
"mean_token_accuracy": 0.7042877223342657,
|
|
"num_tokens": 16813681.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 1.2381837129592896,
|
|
"epoch": 0.6126055370115845,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 4.380916887110366e-05,
|
|
"loss": 1.2605,
|
|
"mean_token_accuracy": 0.6947620201855897,
|
|
"num_tokens": 16958167.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 1.2540216479450463,
|
|
"epoch": 0.6178414817723673,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 4.365789799169539e-05,
|
|
"loss": 1.2586,
|
|
"mean_token_accuracy": 0.6949862573295832,
|
|
"num_tokens": 17101042.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 1.2213780038058757,
|
|
"epoch": 0.6230774265331501,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 4.350506918317416e-05,
|
|
"loss": 1.2509,
|
|
"mean_token_accuracy": 0.6928936781361699,
|
|
"num_tokens": 17241395.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 1.1887108445167542,
|
|
"epoch": 0.6283133712939328,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 4.335069520670149e-05,
|
|
"loss": 1.2134,
|
|
"mean_token_accuracy": 0.7078650841489434,
|
|
"num_tokens": 17382621.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 1.1683177448809148,
|
|
"epoch": 0.6335493160547156,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 4.3194788952459996e-05,
|
|
"loss": 1.1862,
|
|
"mean_token_accuracy": 0.7075361222028732,
|
|
"num_tokens": 17524511.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 1.2176016632467508,
|
|
"epoch": 0.6387852608154984,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 4.303736343857704e-05,
|
|
"loss": 1.2635,
|
|
"mean_token_accuracy": 0.6953268457204104,
|
|
"num_tokens": 17674442.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 1.2287296935915948,
|
|
"epoch": 0.6440212055762812,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 4.2878431810037724e-05,
|
|
"loss": 1.239,
|
|
"mean_token_accuracy": 0.7013807725161314,
|
|
"num_tokens": 17827391.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 1.236463399976492,
|
|
"epoch": 0.649257150337064,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 4.27180073375873e-05,
|
|
"loss": 1.2571,
|
|
"mean_token_accuracy": 0.6894650906324387,
|
|
"num_tokens": 17972502.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 1.1414937254041433,
|
|
"epoch": 0.6544930950978467,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 4.255610341662304e-05,
|
|
"loss": 1.1546,
|
|
"mean_token_accuracy": 0.7119367253035307,
|
|
"num_tokens": 18115408.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 1.193948952294886,
|
|
"epoch": 0.6597290398586295,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 4.239273356607576e-05,
|
|
"loss": 1.2245,
|
|
"mean_token_accuracy": 0.7001152852550149,
|
|
"num_tokens": 18262489.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 1.246325920522213,
|
|
"epoch": 0.6649649846194122,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 4.222791142728097e-05,
|
|
"loss": 1.2505,
|
|
"mean_token_accuracy": 0.6927321873605251,
|
|
"num_tokens": 18399068.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 1.1776666756719352,
|
|
"epoch": 0.670200929380195,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 4.2061650762839825e-05,
|
|
"loss": 1.2147,
|
|
"mean_token_accuracy": 0.7039317097514868,
|
|
"num_tokens": 18543913.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 1.2006115175783634,
|
|
"epoch": 0.6754368741409779,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 4.189396545546995e-05,
|
|
"loss": 1.2154,
|
|
"mean_token_accuracy": 0.7036881025880575,
|
|
"num_tokens": 18678858.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 1.1709488430991768,
|
|
"epoch": 0.6806728189017606,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 4.1724869506846267e-05,
|
|
"loss": 1.1949,
|
|
"mean_token_accuracy": 0.7089729970321059,
|
|
"num_tokens": 18834916.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 1.2211165010929108,
|
|
"epoch": 0.6859087636625434,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 4.1554377036431816e-05,
|
|
"loss": 1.2437,
|
|
"mean_token_accuracy": 0.6957355309277773,
|
|
"num_tokens": 18987694.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 1.2230720650404692,
|
|
"epoch": 0.6911447084233261,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 4.138250228029882e-05,
|
|
"loss": 1.2417,
|
|
"mean_token_accuracy": 0.6949638992547988,
|
|
"num_tokens": 19137646.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 1.1824469189159572,
|
|
"epoch": 0.6963806531841089,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 4.120925958993994e-05,
|
|
"loss": 1.2038,
|
|
"mean_token_accuracy": 0.7080136310309172,
|
|
"num_tokens": 19285574.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 1.1995828442275525,
|
|
"epoch": 0.7016165979448917,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 4.103466343106999e-05,
|
|
"loss": 1.2104,
|
|
"mean_token_accuracy": 0.7012225743383169,
|
|
"num_tokens": 19432213.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 1.2089691065251826,
|
|
"epoch": 0.7068525427056744,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 4.0858728382417966e-05,
|
|
"loss": 1.2252,
|
|
"mean_token_accuracy": 0.6981374306604267,
|
|
"num_tokens": 19586405.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 1.2061294008046388,
|
|
"epoch": 0.7120884874664573,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 4.06814691345098e-05,
|
|
"loss": 1.2052,
|
|
"mean_token_accuracy": 0.6995515301823616,
|
|
"num_tokens": 19736389.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 1.2196388389915227,
|
|
"epoch": 0.71732443222724,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 4.0502900488441706e-05,
|
|
"loss": 1.2295,
|
|
"mean_token_accuracy": 0.6974983751773834,
|
|
"num_tokens": 19884434.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 1.1925444403663277,
|
|
"epoch": 0.7225603769880228,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 4.032303735464422e-05,
|
|
"loss": 1.1885,
|
|
"mean_token_accuracy": 0.7031191129237413,
|
|
"num_tokens": 20019339.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 1.184141149930656,
|
|
"epoch": 0.7277963217488056,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 4.0141894751637264e-05,
|
|
"loss": 1.2061,
|
|
"mean_token_accuracy": 0.7056893218308687,
|
|
"num_tokens": 20172036.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 1.18991824015975,
|
|
"epoch": 0.7330322665095883,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 3.995948780477605e-05,
|
|
"loss": 1.1945,
|
|
"mean_token_accuracy": 0.7036307500675321,
|
|
"num_tokens": 20316279.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 1.221291032806039,
|
|
"epoch": 0.7382682112703711,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 3.977583174498816e-05,
|
|
"loss": 1.2229,
|
|
"mean_token_accuracy": 0.6996599985286593,
|
|
"num_tokens": 20457700.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 1.2056787729263305,
|
|
"epoch": 0.7435041560311538,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 3.959094190750172e-05,
|
|
"loss": 1.2209,
|
|
"mean_token_accuracy": 0.699485157802701,
|
|
"num_tokens": 20601526.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 1.214169954136014,
|
|
"epoch": 0.7487401007919366,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 3.940483373056498e-05,
|
|
"loss": 1.2251,
|
|
"mean_token_accuracy": 0.7045085027813911,
|
|
"num_tokens": 20740478.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 1.2199758583679796,
|
|
"epoch": 0.7539760455527195,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 3.921752275415712e-05,
|
|
"loss": 1.232,
|
|
"mean_token_accuracy": 0.6983612652868032,
|
|
"num_tokens": 20889352.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 1.1763014759868384,
|
|
"epoch": 0.7592119903135022,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 3.902902461869079e-05,
|
|
"loss": 1.1852,
|
|
"mean_token_accuracy": 0.706533107161522,
|
|
"num_tokens": 21047325.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 1.2274865956045686,
|
|
"epoch": 0.764447935074285,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 3.883935506370605e-05,
|
|
"loss": 1.2559,
|
|
"mean_token_accuracy": 0.6984432989731431,
|
|
"num_tokens": 21193313.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 1.2031854771077632,
|
|
"epoch": 0.7696838798350677,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 3.864852992655617e-05,
|
|
"loss": 1.2106,
|
|
"mean_token_accuracy": 0.7018306776881218,
|
|
"num_tokens": 21340786.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 1.2027717839926482,
|
|
"epoch": 0.7749198245958505,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 3.845656514108515e-05,
|
|
"loss": 1.22,
|
|
"mean_token_accuracy": 0.7030729129910469,
|
|
"num_tokens": 21484126.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 1.1977868607267737,
|
|
"epoch": 0.7801557693566333,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 3.8263476736297374e-05,
|
|
"loss": 1.1941,
|
|
"mean_token_accuracy": 0.7007357392460107,
|
|
"num_tokens": 21629712.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 1.280288253352046,
|
|
"epoch": 0.785391714117416,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 3.806928083501906e-05,
|
|
"loss": 1.3073,
|
|
"mean_token_accuracy": 0.6855264658108353,
|
|
"num_tokens": 21769811.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 1.2234169896692038,
|
|
"epoch": 0.7906276588781989,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 3.787399365255207e-05,
|
|
"loss": 1.2603,
|
|
"mean_token_accuracy": 0.6935047794133424,
|
|
"num_tokens": 21919583.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 1.2192364005371927,
|
|
"epoch": 0.7958636036389816,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 3.7677631495319956e-05,
|
|
"loss": 1.2092,
|
|
"mean_token_accuracy": 0.702324446476996,
|
|
"num_tokens": 22067375.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 1.1928664781153202,
|
|
"epoch": 0.8010995483997644,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 3.748021075950633e-05,
|
|
"loss": 1.2297,
|
|
"mean_token_accuracy": 0.6969239924103021,
|
|
"num_tokens": 22210682.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 1.2294584538787603,
|
|
"epoch": 0.8063354931605472,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 3.728174792968582e-05,
|
|
"loss": 1.253,
|
|
"mean_token_accuracy": 0.6952543262392282,
|
|
"num_tokens": 22362042.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 1.2315257797017694,
|
|
"epoch": 0.8115714379213299,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 3.7082259577447605e-05,
|
|
"loss": 1.2633,
|
|
"mean_token_accuracy": 0.6920363411307335,
|
|
"num_tokens": 22518138.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 1.2164895705878735,
|
|
"epoch": 0.8168073826821127,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 3.688176236001168e-05,
|
|
"loss": 1.2215,
|
|
"mean_token_accuracy": 0.6953978851437569,
|
|
"num_tokens": 22670347.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 1.1897184619680048,
|
|
"epoch": 0.8220433274428954,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 3.668027301883802e-05,
|
|
"loss": 1.2025,
|
|
"mean_token_accuracy": 0.7067706823348999,
|
|
"num_tokens": 22816471.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 1.2356853460893036,
|
|
"epoch": 0.8272792722036783,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 3.6477808378228604e-05,
|
|
"loss": 1.2735,
|
|
"mean_token_accuracy": 0.6951639795675874,
|
|
"num_tokens": 22969371.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 1.172374564781785,
|
|
"epoch": 0.8325152169644611,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 3.6274385343922677e-05,
|
|
"loss": 1.1798,
|
|
"mean_token_accuracy": 0.7071554753929377,
|
|
"num_tokens": 23112802.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 1.2456749143078922,
|
|
"epoch": 0.8377511617252438,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 3.607002090168506e-05,
|
|
"loss": 1.2789,
|
|
"mean_token_accuracy": 0.6919172059744596,
|
|
"num_tokens": 23251431.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 1.2133430268615484,
|
|
"epoch": 0.8429871064860266,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 3.5864732115887866e-05,
|
|
"loss": 1.2482,
|
|
"mean_token_accuracy": 0.7028827562928199,
|
|
"num_tokens": 23398750.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 1.2265673983842134,
|
|
"epoch": 0.8482230512468093,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 3.565853612808562e-05,
|
|
"loss": 1.2666,
|
|
"mean_token_accuracy": 0.6953880734741688,
|
|
"num_tokens": 23541708.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 1.1701237484812737,
|
|
"epoch": 0.8534589960075921,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 3.545145015558399e-05,
|
|
"loss": 1.1376,
|
|
"mean_token_accuracy": 0.7052776984870434,
|
|
"num_tokens": 23689211.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 1.227908807620406,
|
|
"epoch": 0.8586949407683749,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 3.524349149000206e-05,
|
|
"loss": 1.2574,
|
|
"mean_token_accuracy": 0.6968840681016445,
|
|
"num_tokens": 23830779.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 1.1823246696963907,
|
|
"epoch": 0.8639308855291576,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 3.503467749582857e-05,
|
|
"loss": 1.176,
|
|
"mean_token_accuracy": 0.7053217653185129,
|
|
"num_tokens": 23966735.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 1.1737437251955272,
|
|
"epoch": 0.8691668302899405,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 3.482502560897195e-05,
|
|
"loss": 1.202,
|
|
"mean_token_accuracy": 0.7063202302902937,
|
|
"num_tokens": 24114205.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 1.2372199261561037,
|
|
"epoch": 0.8744027750507232,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 3.4614553335304406e-05,
|
|
"loss": 1.2486,
|
|
"mean_token_accuracy": 0.694911016151309,
|
|
"num_tokens": 24257335.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 1.2101770553737878,
|
|
"epoch": 0.879638719811506,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 3.440327824920022e-05,
|
|
"loss": 1.1971,
|
|
"mean_token_accuracy": 0.6996189601719379,
|
|
"num_tokens": 24405338.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 1.1508656131103634,
|
|
"epoch": 0.8848746645722888,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 3.419121799206829e-05,
|
|
"loss": 1.17,
|
|
"mean_token_accuracy": 0.7079865211620927,
|
|
"num_tokens": 24550534.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 1.2128825964406134,
|
|
"epoch": 0.8901106093330715,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 3.3978390270879055e-05,
|
|
"loss": 1.248,
|
|
"mean_token_accuracy": 0.6985778672620654,
|
|
"num_tokens": 24699673.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 1.1805501360446216,
|
|
"epoch": 0.8953465540938543,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 3.3764812856686e-05,
|
|
"loss": 1.1703,
|
|
"mean_token_accuracy": 0.7011347938328981,
|
|
"num_tokens": 24848326.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 1.211562325246632,
|
|
"epoch": 0.900582498854637,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 3.355050358314172e-05,
|
|
"loss": 1.2425,
|
|
"mean_token_accuracy": 0.702686908468604,
|
|
"num_tokens": 24992122.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 1.1974574619904161,
|
|
"epoch": 0.9058184436154199,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 3.3335480345008905e-05,
|
|
"loss": 1.2259,
|
|
"mean_token_accuracy": 0.6980314027518034,
|
|
"num_tokens": 25139079.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 1.2152755599468947,
|
|
"epoch": 0.9110543883762027,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 3.311976109666605e-05,
|
|
"loss": 1.251,
|
|
"mean_token_accuracy": 0.6992737432941795,
|
|
"num_tokens": 25286349.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 1.2275765413418411,
|
|
"epoch": 0.9162903331369854,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 3.290336385060832e-05,
|
|
"loss": 1.2348,
|
|
"mean_token_accuracy": 0.6949001539498567,
|
|
"num_tokens": 25431706.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 1.1663161270320415,
|
|
"epoch": 0.9215262778977682,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 3.268630667594348e-05,
|
|
"loss": 1.1854,
|
|
"mean_token_accuracy": 0.7103127352893353,
|
|
"num_tokens": 25575146.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 1.2058376437053084,
|
|
"epoch": 0.9267622226585509,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 3.2468607696883146e-05,
|
|
"loss": 1.2425,
|
|
"mean_token_accuracy": 0.7038685705512762,
|
|
"num_tokens": 25726777.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 1.189345240779221,
|
|
"epoch": 0.9319981674193337,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 3.225028509122944e-05,
|
|
"loss": 1.2287,
|
|
"mean_token_accuracy": 0.7018254602327942,
|
|
"num_tokens": 25875049.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 1.1771747019141912,
|
|
"epoch": 0.9372341121801165,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 3.2031357088857085e-05,
|
|
"loss": 1.1915,
|
|
"mean_token_accuracy": 0.7071957625448704,
|
|
"num_tokens": 26026216.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 1.220152474567294,
|
|
"epoch": 0.9424700569408992,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 3.181184197019127e-05,
|
|
"loss": 1.2299,
|
|
"mean_token_accuracy": 0.7012155883014202,
|
|
"num_tokens": 26170779.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 1.1854938926175236,
|
|
"epoch": 0.9477060017016821,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 3.159175806468126e-05,
|
|
"loss": 1.1874,
|
|
"mean_token_accuracy": 0.705037958547473,
|
|
"num_tokens": 26319322.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 1.2158665416762233,
|
|
"epoch": 0.9529419464624648,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 3.1371123749269805e-05,
|
|
"loss": 1.2222,
|
|
"mean_token_accuracy": 0.6985156688839197,
|
|
"num_tokens": 26465154.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 1.2151709901168943,
|
|
"epoch": 0.9581778912232476,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 3.114995744685877e-05,
|
|
"loss": 1.2667,
|
|
"mean_token_accuracy": 0.6974882191047073,
|
|
"num_tokens": 26611774.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 1.1511426636949182,
|
|
"epoch": 0.9634138359840304,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 3.092827762477074e-05,
|
|
"loss": 1.197,
|
|
"mean_token_accuracy": 0.7165371583774686,
|
|
"num_tokens": 26760701.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 1.2074316812679171,
|
|
"epoch": 0.9686497807448131,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 3.070610279320707e-05,
|
|
"loss": 1.2485,
|
|
"mean_token_accuracy": 0.7016795247793197,
|
|
"num_tokens": 26902851.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 1.1557184986770153,
|
|
"epoch": 0.9738857255055959,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 3.0483451503702264e-05,
|
|
"loss": 1.1973,
|
|
"mean_token_accuracy": 0.7114167800173163,
|
|
"num_tokens": 27053159.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 1.145219304971397,
|
|
"epoch": 0.9791216702663786,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 3.0260342347574915e-05,
|
|
"loss": 1.1371,
|
|
"mean_token_accuracy": 0.7116047518327833,
|
|
"num_tokens": 27195594.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 1.1833709230646492,
|
|
"epoch": 0.9843576150271615,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 3.003679395437536e-05,
|
|
"loss": 1.1913,
|
|
"mean_token_accuracy": 0.7063193745911122,
|
|
"num_tokens": 27335855.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 1.2038549520075321,
|
|
"epoch": 0.9895935597879443,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 2.981282499033009e-05,
|
|
"loss": 1.2036,
|
|
"mean_token_accuracy": 0.6992362190037966,
|
|
"num_tokens": 27485225.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 1.1822845570743084,
|
|
"epoch": 0.994829504548727,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 2.9588454156783163e-05,
|
|
"loss": 1.2295,
|
|
"mean_token_accuracy": 0.7060987044125795,
|
|
"num_tokens": 27634788.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 1.1678016036748886,
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 2.9363700188634598e-05,
|
|
"loss": 1.1579,
|
|
"mean_token_accuracy": 0.7110490016167677,
|
|
"num_tokens": 27776648.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_entropy": 1.1545482213497162,
|
|
"eval_loss": 1.1723113059997559,
|
|
"eval_mean_token_accuracy": 0.7067238150835037,
|
|
"eval_num_tokens": 27776648.0,
|
|
"eval_runtime": 60.9052,
|
|
"eval_samples_per_second": 32.838,
|
|
"eval_steps_per_second": 16.419,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 1.092913302220404,
|
|
"epoch": 1.0052359447607828,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 2.9138581852776055e-05,
|
|
"loss": 1.0676,
|
|
"mean_token_accuracy": 0.7341579392552375,
|
|
"num_tokens": 27920450.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 1.076912895217538,
|
|
"epoch": 1.0104718895215656,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 2.8913117946523803e-05,
|
|
"loss": 1.0904,
|
|
"mean_token_accuracy": 0.7310005821287632,
|
|
"num_tokens": 28067925.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 1.0259252307936548,
|
|
"epoch": 1.0157078342823482,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 2.8687327296049128e-05,
|
|
"loss": 0.9952,
|
|
"mean_token_accuracy": 0.7427292361855506,
|
|
"num_tokens": 28212860.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 1.0669849675148726,
|
|
"epoch": 1.020943779043131,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 2.8461228754806375e-05,
|
|
"loss": 1.0461,
|
|
"mean_token_accuracy": 0.7347980726510286,
|
|
"num_tokens": 28356365.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 1.0898705100640655,
|
|
"epoch": 1.0261797238039139,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 2.823484120195865e-05,
|
|
"loss": 1.0786,
|
|
"mean_token_accuracy": 0.7276619732379913,
|
|
"num_tokens": 28495047.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 1.0758044727146625,
|
|
"epoch": 1.0314156685646967,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 2.8008183540801484e-05,
|
|
"loss": 1.0782,
|
|
"mean_token_accuracy": 0.7319583360105753,
|
|
"num_tokens": 28633968.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 1.031227163411677,
|
|
"epoch": 1.0366516133254795,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 2.7781274697184352e-05,
|
|
"loss": 1.0339,
|
|
"mean_token_accuracy": 0.7387025002390146,
|
|
"num_tokens": 28778453.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 1.1043319918215275,
|
|
"epoch": 1.041887558086262,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 2.7554133617930394e-05,
|
|
"loss": 1.1214,
|
|
"mean_token_accuracy": 0.7271257348358631,
|
|
"num_tokens": 28919232.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 1.0519304445013404,
|
|
"epoch": 1.047123502847045,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 2.732677926925436e-05,
|
|
"loss": 1.059,
|
|
"mean_token_accuracy": 0.7360297767445445,
|
|
"num_tokens": 29050240.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 1.074704316444695,
|
|
"epoch": 1.0523594476078277,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 2.709923063517895e-05,
|
|
"loss": 1.0769,
|
|
"mean_token_accuracy": 0.7276826776564121,
|
|
"num_tokens": 29198739.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 1.1091003093868494,
|
|
"epoch": 1.0575953923686106,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 2.6871506715949606e-05,
|
|
"loss": 1.1141,
|
|
"mean_token_accuracy": 0.7292679946869611,
|
|
"num_tokens": 29343130.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 1.1154426285997032,
|
|
"epoch": 1.0628313371293934,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 2.664362652644806e-05,
|
|
"loss": 1.1339,
|
|
"mean_token_accuracy": 0.7209920965135097,
|
|
"num_tokens": 29489679.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 1.0308820417150855,
|
|
"epoch": 1.068067281890176,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 2.641560909460456e-05,
|
|
"loss": 1.0344,
|
|
"mean_token_accuracy": 0.737670699879527,
|
|
"num_tokens": 29630133.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 1.1122422970831394,
|
|
"epoch": 1.0733032266509588,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 2.6187473459809043e-05,
|
|
"loss": 1.1265,
|
|
"mean_token_accuracy": 0.7212287154048681,
|
|
"num_tokens": 29781663.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 1.0620404394343494,
|
|
"epoch": 1.0785391714117416,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 2.595923867132136e-05,
|
|
"loss": 1.0485,
|
|
"mean_token_accuracy": 0.7329498503357172,
|
|
"num_tokens": 29930945.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 1.0296688327565788,
|
|
"epoch": 1.0837751161725244,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 2.573092378668067e-05,
|
|
"loss": 1.0277,
|
|
"mean_token_accuracy": 0.7400956619530916,
|
|
"num_tokens": 30074139.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 1.071678783558309,
|
|
"epoch": 1.0890110609333072,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 2.5502547870114135e-05,
|
|
"loss": 1.0915,
|
|
"mean_token_accuracy": 0.7298048492521048,
|
|
"num_tokens": 30216099.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 1.032330046594143,
|
|
"epoch": 1.0942470056940898,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 2.5274129990945067e-05,
|
|
"loss": 1.0344,
|
|
"mean_token_accuracy": 0.7430270429700613,
|
|
"num_tokens": 30368171.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 1.074818222783506,
|
|
"epoch": 1.0994829504548727,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 2.504568922200064e-05,
|
|
"loss": 1.0715,
|
|
"mean_token_accuracy": 0.726781240105629,
|
|
"num_tokens": 30510708.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 1.0945234788581728,
|
|
"epoch": 1.1047188952156555,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 2.481724463801933e-05,
|
|
"loss": 1.1047,
|
|
"mean_token_accuracy": 0.7265350595116615,
|
|
"num_tokens": 30662144.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 1.0475447980687023,
|
|
"epoch": 1.1099548399764383,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 2.4588815314058155e-05,
|
|
"loss": 1.0611,
|
|
"mean_token_accuracy": 0.7366870004683733,
|
|
"num_tokens": 30810485.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 1.0811086906120182,
|
|
"epoch": 1.115190784737221,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 2.436042032389992e-05,
|
|
"loss": 1.0993,
|
|
"mean_token_accuracy": 0.7298943884670734,
|
|
"num_tokens": 30963681.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 1.0684187861159444,
|
|
"epoch": 1.1204267294980037,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 2.4132078738460588e-05,
|
|
"loss": 1.0798,
|
|
"mean_token_accuracy": 0.7327737433835864,
|
|
"num_tokens": 31116018.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 1.0509660685434938,
|
|
"epoch": 1.1256626742587865,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 2.3903809624196825e-05,
|
|
"loss": 1.0373,
|
|
"mean_token_accuracy": 0.73740617595613,
|
|
"num_tokens": 31248950.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 1.0591240156441928,
|
|
"epoch": 1.1308986190195693,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 2.3675632041513978e-05,
|
|
"loss": 1.0743,
|
|
"mean_token_accuracy": 0.7330824228003621,
|
|
"num_tokens": 31386629.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 1.0668636929243802,
|
|
"epoch": 1.1361345637803522,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 2.3447565043174533e-05,
|
|
"loss": 1.0589,
|
|
"mean_token_accuracy": 0.7297359511256218,
|
|
"num_tokens": 31535751.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 1.0401808319613337,
|
|
"epoch": 1.141370508541135,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 2.321962767270724e-05,
|
|
"loss": 1.0402,
|
|
"mean_token_accuracy": 0.7368248403072357,
|
|
"num_tokens": 31676167.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 1.0865553246811033,
|
|
"epoch": 1.1466064533019176,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 2.299183896281692e-05,
|
|
"loss": 1.1019,
|
|
"mean_token_accuracy": 0.7275375993922353,
|
|
"num_tokens": 31831724.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 1.0097376400604845,
|
|
"epoch": 1.1518423980627004,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 2.27642179337953e-05,
|
|
"loss": 1.0072,
|
|
"mean_token_accuracy": 0.7466676604002714,
|
|
"num_tokens": 31966434.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 1.086430662125349,
|
|
"epoch": 1.1570783428234832,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 2.2536783591932784e-05,
|
|
"loss": 1.1011,
|
|
"mean_token_accuracy": 0.7281720124185085,
|
|
"num_tokens": 32110771.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 1.049152427725494,
|
|
"epoch": 1.162314287584266,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 2.2309554927931493e-05,
|
|
"loss": 1.0408,
|
|
"mean_token_accuracy": 0.7368430346250534,
|
|
"num_tokens": 32251257.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 1.0445547673851252,
|
|
"epoch": 1.1675502323450488,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 2.208255091531947e-05,
|
|
"loss": 1.0657,
|
|
"mean_token_accuracy": 0.7318288933485746,
|
|
"num_tokens": 32392959.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 1.075485266186297,
|
|
"epoch": 1.1727861771058314,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 2.1855790508866435e-05,
|
|
"loss": 1.09,
|
|
"mean_token_accuracy": 0.7345754325389862,
|
|
"num_tokens": 32528087.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 1.041004289314151,
|
|
"epoch": 1.1780221218666143,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 2.162929264300107e-05,
|
|
"loss": 1.031,
|
|
"mean_token_accuracy": 0.7372262746095657,
|
|
"num_tokens": 32670677.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 1.053070001862943,
|
|
"epoch": 1.183258066627397,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 2.1403076230230006e-05,
|
|
"loss": 1.0635,
|
|
"mean_token_accuracy": 0.7337488930672407,
|
|
"num_tokens": 32817744.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 1.1205989433452488,
|
|
"epoch": 1.18849401138818,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 2.11771601595586e-05,
|
|
"loss": 1.1453,
|
|
"mean_token_accuracy": 0.7223296284675598,
|
|
"num_tokens": 32959765.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 1.0392343305051326,
|
|
"epoch": 1.1937299561489627,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 2.0951563294913738e-05,
|
|
"loss": 1.0358,
|
|
"mean_token_accuracy": 0.7359498247504235,
|
|
"num_tokens": 33097626.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 1.059504010900855,
|
|
"epoch": 1.1989659009097453,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 2.0726304473568693e-05,
|
|
"loss": 1.081,
|
|
"mean_token_accuracy": 0.735545065253973,
|
|
"num_tokens": 33239983.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 1.0819577634334565,
|
|
"epoch": 1.2042018456705281,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 2.0501402504570234e-05,
|
|
"loss": 1.094,
|
|
"mean_token_accuracy": 0.7268587298691273,
|
|
"num_tokens": 33384613.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 1.0227667864412069,
|
|
"epoch": 1.209437790431311,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 2.0276876167168044e-05,
|
|
"loss": 1.027,
|
|
"mean_token_accuracy": 0.7428420815616846,
|
|
"num_tokens": 33530796.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 1.0223766604438425,
|
|
"epoch": 1.2146737351920938,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 2.005274420924668e-05,
|
|
"loss": 1.031,
|
|
"mean_token_accuracy": 0.7396607849746942,
|
|
"num_tokens": 33666957.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 1.0607954716309904,
|
|
"epoch": 1.2199096799528766,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 1.9829025345760124e-05,
|
|
"loss": 1.071,
|
|
"mean_token_accuracy": 0.7329499468207359,
|
|
"num_tokens": 33812151.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 1.038625803217292,
|
|
"epoch": 1.2251456247136592,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 1.960573825716911e-05,
|
|
"loss": 1.0433,
|
|
"mean_token_accuracy": 0.7345146417617798,
|
|
"num_tokens": 33958267.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 1.0953392999246716,
|
|
"epoch": 1.230381569474442,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 1.9382901587881275e-05,
|
|
"loss": 1.0805,
|
|
"mean_token_accuracy": 0.7263612521812319,
|
|
"num_tokens": 34104819.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 1.0933478716760874,
|
|
"epoch": 1.2356175142352248,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 1.9160533944694366e-05,
|
|
"loss": 1.1009,
|
|
"mean_token_accuracy": 0.7227355781942606,
|
|
"num_tokens": 34238326.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 1.0345038840547205,
|
|
"epoch": 1.2408534589960076,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 1.8938653895242604e-05,
|
|
"loss": 1.0354,
|
|
"mean_token_accuracy": 0.739068279415369,
|
|
"num_tokens": 34376182.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 1.0739264035597444,
|
|
"epoch": 1.2460894037567904,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 1.8717279966446267e-05,
|
|
"loss": 1.0603,
|
|
"mean_token_accuracy": 0.7294208355247974,
|
|
"num_tokens": 34519957.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 1.0502849434502424,
|
|
"epoch": 1.251325348517573,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 1.8496430642964696e-05,
|
|
"loss": 1.0985,
|
|
"mean_token_accuracy": 0.7344448113813996,
|
|
"num_tokens": 34674907.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 1.0143306592479349,
|
|
"epoch": 1.2565612932783559,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 1.827612436565286e-05,
|
|
"loss": 1.0286,
|
|
"mean_token_accuracy": 0.7477799784392118,
|
|
"num_tokens": 34821308.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 1.0486345458775759,
|
|
"epoch": 1.2617972380391387,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 1.8056379530021493e-05,
|
|
"loss": 1.0472,
|
|
"mean_token_accuracy": 0.7336918633431196,
|
|
"num_tokens": 34965952.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 1.0630970790982246,
|
|
"epoch": 1.2670331827999215,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 1.7837214484701154e-05,
|
|
"loss": 1.079,
|
|
"mean_token_accuracy": 0.7314780931919813,
|
|
"num_tokens": 35110373.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 1.0528906928375363,
|
|
"epoch": 1.2722691275607043,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 1.7618647529910042e-05,
|
|
"loss": 1.0483,
|
|
"mean_token_accuracy": 0.7345411021262407,
|
|
"num_tokens": 35258495.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 1.018369135260582,
|
|
"epoch": 1.2775050723214871,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 1.7400696915925996e-05,
|
|
"loss": 1.0174,
|
|
"mean_token_accuracy": 0.7416710961610079,
|
|
"num_tokens": 35404823.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 1.0643933141604065,
|
|
"epoch": 1.2827410170822697,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 1.718338084156254e-05,
|
|
"loss": 1.0795,
|
|
"mean_token_accuracy": 0.7312329623848199,
|
|
"num_tokens": 35555059.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 1.0236325599253178,
|
|
"epoch": 1.2879769618430525,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 1.6966717452649373e-05,
|
|
"loss": 1.0252,
|
|
"mean_token_accuracy": 0.7439669661223889,
|
|
"num_tokens": 35704746.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 1.0227021113038064,
|
|
"epoch": 1.2932129066038354,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 1.67507248405171e-05,
|
|
"loss": 1.0337,
|
|
"mean_token_accuracy": 0.7387756012380123,
|
|
"num_tokens": 35852823.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 1.1011345129460097,
|
|
"epoch": 1.2984488513646182,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 1.6535421040486686e-05,
|
|
"loss": 1.0906,
|
|
"mean_token_accuracy": 0.7245030000805854,
|
|
"num_tokens": 36004677.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 1.0781516009941696,
|
|
"epoch": 1.3036847961254008,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 1.6320824030363458e-05,
|
|
"loss": 1.11,
|
|
"mean_token_accuracy": 0.7256867518648505,
|
|
"num_tokens": 36141496.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 1.0459418123587967,
|
|
"epoch": 1.3089207408861836,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 1.6106951728936025e-05,
|
|
"loss": 1.0554,
|
|
"mean_token_accuracy": 0.734595287963748,
|
|
"num_tokens": 36292155.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 1.0505487740039825,
|
|
"epoch": 1.3141566856469664,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 1.5893821994479995e-05,
|
|
"loss": 1.0537,
|
|
"mean_token_accuracy": 0.7354540932923556,
|
|
"num_tokens": 36438145.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 1.0511848462745548,
|
|
"epoch": 1.3193926304077492,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 1.5681452623266867e-05,
|
|
"loss": 1.068,
|
|
"mean_token_accuracy": 0.7358380068093539,
|
|
"num_tokens": 36577572.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 1.0947185611352324,
|
|
"epoch": 1.324628575168532,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 1.5469861348078014e-05,
|
|
"loss": 1.1011,
|
|
"mean_token_accuracy": 0.7275305841118097,
|
|
"num_tokens": 36722937.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 1.0880089100450276,
|
|
"epoch": 1.3298645199293149,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 1.5259065836724033e-05,
|
|
"loss": 1.0962,
|
|
"mean_token_accuracy": 0.7264829911291599,
|
|
"num_tokens": 36872221.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 1.0425203360617161,
|
|
"epoch": 1.3351004646900975,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 1.5049083690569455e-05,
|
|
"loss": 1.047,
|
|
"mean_token_accuracy": 0.7343699801713228,
|
|
"num_tokens": 37016594.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 1.0769597385078669,
|
|
"epoch": 1.3403364094508803,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 1.4839932443063056e-05,
|
|
"loss": 1.0833,
|
|
"mean_token_accuracy": 0.7293191211298108,
|
|
"num_tokens": 37165697.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 1.0958488559350372,
|
|
"epoch": 1.345572354211663,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 1.4631629558273801e-05,
|
|
"loss": 1.1182,
|
|
"mean_token_accuracy": 0.7267538867890835,
|
|
"num_tokens": 37315842.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 1.0367282923310994,
|
|
"epoch": 1.350808298972446,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 1.4424192429432656e-05,
|
|
"loss": 1.0612,
|
|
"mean_token_accuracy": 0.736408182233572,
|
|
"num_tokens": 37459644.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 1.0303277362138032,
|
|
"epoch": 1.3560442437332285,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 1.4217638377480158e-05,
|
|
"loss": 1.0507,
|
|
"mean_token_accuracy": 0.7396169764921069,
|
|
"num_tokens": 37607638.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 1.089581909775734,
|
|
"epoch": 1.3612801884940113,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 1.4011984649620211e-05,
|
|
"loss": 1.0904,
|
|
"mean_token_accuracy": 0.7292284790426493,
|
|
"num_tokens": 37754738.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 1.0630993578583001,
|
|
"epoch": 1.3665161332547942,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 1.3807248417879895e-05,
|
|
"loss": 1.0852,
|
|
"mean_token_accuracy": 0.731200734898448,
|
|
"num_tokens": 37894410.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 1.0982538178563117,
|
|
"epoch": 1.371752078015577,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 1.3603446777675665e-05,
|
|
"loss": 1.1,
|
|
"mean_token_accuracy": 0.7262859750539065,
|
|
"num_tokens": 38038356.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 1.0451888531446456,
|
|
"epoch": 1.3769880227763598,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 1.3400596746385815e-05,
|
|
"loss": 1.0298,
|
|
"mean_token_accuracy": 0.7341227237135172,
|
|
"num_tokens": 38184874.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 1.0088561842218042,
|
|
"epoch": 1.3822239675371426,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 1.3198715261929586e-05,
|
|
"loss": 0.9888,
|
|
"mean_token_accuracy": 0.7419910099357366,
|
|
"num_tokens": 38327082.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 1.062550875172019,
|
|
"epoch": 1.3874599122979252,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 1.2997819181352822e-05,
|
|
"loss": 1.0731,
|
|
"mean_token_accuracy": 0.7318339478224516,
|
|
"num_tokens": 38476686.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 1.066984947770834,
|
|
"epoch": 1.392695857058708,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 1.2797925279420453e-05,
|
|
"loss": 1.0677,
|
|
"mean_token_accuracy": 0.7307767707854509,
|
|
"num_tokens": 38635267.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 1.08070537019521,
|
|
"epoch": 1.3979318018194908,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 1.2599050247215764e-05,
|
|
"loss": 1.0787,
|
|
"mean_token_accuracy": 0.7295586479827761,
|
|
"num_tokens": 38782523.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 1.1134451285004616,
|
|
"epoch": 1.4031677465802737,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 1.2401210690746703e-05,
|
|
"loss": 1.1551,
|
|
"mean_token_accuracy": 0.7224681507796049,
|
|
"num_tokens": 38934863.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 1.006534701772034,
|
|
"epoch": 1.4084036913410563,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 1.2204423129559306e-05,
|
|
"loss": 1.0215,
|
|
"mean_token_accuracy": 0.7462513867765665,
|
|
"num_tokens": 39088409.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 1.0482663962990046,
|
|
"epoch": 1.413639636101839,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 1.20087039953583e-05,
|
|
"loss": 1.0421,
|
|
"mean_token_accuracy": 0.7341501908376813,
|
|
"num_tokens": 39236900.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 1.0507450453937053,
|
|
"epoch": 1.4188755808626219,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 1.1814069630635068e-05,
|
|
"loss": 1.0481,
|
|
"mean_token_accuracy": 0.7352609395980835,
|
|
"num_tokens": 39386493.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 1.0338343350216745,
|
|
"epoch": 1.4241115256234047,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 1.1620536287303052e-05,
|
|
"loss": 1.0533,
|
|
"mean_token_accuracy": 0.7419755831360817,
|
|
"num_tokens": 39541549.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 1.0351802745833993,
|
|
"epoch": 1.4293474703841875,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 1.1428120125340716e-05,
|
|
"loss": 1.055,
|
|
"mean_token_accuracy": 0.7351777728646993,
|
|
"num_tokens": 39694300.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 1.0450334103778005,
|
|
"epoch": 1.4345834151449703,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 1.1236837211442231e-05,
|
|
"loss": 1.0436,
|
|
"mean_token_accuracy": 0.739201857149601,
|
|
"num_tokens": 39833319.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 1.0565732188522816,
|
|
"epoch": 1.439819359905753,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 1.1046703517675846e-05,
|
|
"loss": 1.0437,
|
|
"mean_token_accuracy": 0.7327644564211369,
|
|
"num_tokens": 39977306.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 1.0406348885968328,
|
|
"epoch": 1.4450553046665358,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 1.085773492015028e-05,
|
|
"loss": 1.0215,
|
|
"mean_token_accuracy": 0.739136977866292,
|
|
"num_tokens": 40117010.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 1.0673470385372639,
|
|
"epoch": 1.4502912494273186,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 1.0669947197689034e-05,
|
|
"loss": 1.088,
|
|
"mean_token_accuracy": 0.7327257882803678,
|
|
"num_tokens": 40258345.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 1.108323130570352,
|
|
"epoch": 1.4555271941881014,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 1.0483356030512912e-05,
|
|
"loss": 1.0889,
|
|
"mean_token_accuracy": 0.7247421193867922,
|
|
"num_tokens": 40401612.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 1.0475279117003082,
|
|
"epoch": 1.460763138948884,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 1.0297976998930664e-05,
|
|
"loss": 1.0514,
|
|
"mean_token_accuracy": 0.7359228234738111,
|
|
"num_tokens": 40540992.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 1.0940048353746534,
|
|
"epoch": 1.4659990837096668,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 1.0113825582038078e-05,
|
|
"loss": 1.0891,
|
|
"mean_token_accuracy": 0.7249374518170952,
|
|
"num_tokens": 40681843.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 1.0349202129989863,
|
|
"epoch": 1.4712350284704496,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 9.930917156425476e-06,
|
|
"loss": 1.034,
|
|
"mean_token_accuracy": 0.7375153541564942,
|
|
"num_tokens": 40819621.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 1.0569802735000848,
|
|
"epoch": 1.4764709732312324,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 9.749266994893755e-06,
|
|
"loss": 1.0714,
|
|
"mean_token_accuracy": 0.7332708152011037,
|
|
"num_tokens": 40963825.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 1.0443452363833785,
|
|
"epoch": 1.4817069179920153,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 9.568890265179128e-06,
|
|
"loss": 1.0527,
|
|
"mean_token_accuracy": 0.7357666999101639,
|
|
"num_tokens": 41115227.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 1.0533791413530706,
|
|
"epoch": 1.486942862752798,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 9.389802028686617e-06,
|
|
"loss": 1.0627,
|
|
"mean_token_accuracy": 0.7320496127009392,
|
|
"num_tokens": 41263145.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 1.0201024271547794,
|
|
"epoch": 1.4921788075135807,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 9.212017239232425e-06,
|
|
"loss": 1.0202,
|
|
"mean_token_accuracy": 0.7403682049363851,
|
|
"num_tokens": 41407284.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 1.072772230580449,
|
|
"epoch": 1.4974147522743635,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 9.03555074179533e-06,
|
|
"loss": 1.0785,
|
|
"mean_token_accuracy": 0.7321215584874153,
|
|
"num_tokens": 41554005.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 1.0840026365593076,
|
|
"epoch": 1.5026506970351463,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 8.860417271277066e-06,
|
|
"loss": 1.0806,
|
|
"mean_token_accuracy": 0.7338382225483656,
|
|
"num_tokens": 41694676.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 1.0635898549109697,
|
|
"epoch": 1.507886641795929,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 8.68663145127203e-06,
|
|
"loss": 1.0874,
|
|
"mean_token_accuracy": 0.7339150093495845,
|
|
"num_tokens": 41852022.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 1.0565571097657085,
|
|
"epoch": 1.5131225865567117,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 8.514207792846169e-06,
|
|
"loss": 1.0684,
|
|
"mean_token_accuracy": 0.735099958628416,
|
|
"num_tokens": 41997358.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 1.0264921691268682,
|
|
"epoch": 1.5183585313174945,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 8.343160693325355e-06,
|
|
"loss": 1.0521,
|
|
"mean_token_accuracy": 0.7389906920492649,
|
|
"num_tokens": 42142495.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 1.0624020613729954,
|
|
"epoch": 1.5235944760782774,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 8.173504435093174e-06,
|
|
"loss": 1.0369,
|
|
"mean_token_accuracy": 0.7302116710692644,
|
|
"num_tokens": 42278534.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 1.066250941902399,
|
|
"epoch": 1.5288304208390602,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 8.005253184398359e-06,
|
|
"loss": 1.0605,
|
|
"mean_token_accuracy": 0.7321529988199472,
|
|
"num_tokens": 42414588.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 1.0273714432492853,
|
|
"epoch": 1.534066365599843,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 7.838420990171928e-06,
|
|
"loss": 1.0421,
|
|
"mean_token_accuracy": 0.7360200606286526,
|
|
"num_tokens": 42558263.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 1.0550982277840375,
|
|
"epoch": 1.5393023103606258,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 7.673021782854084e-06,
|
|
"loss": 1.0462,
|
|
"mean_token_accuracy": 0.734701413474977,
|
|
"num_tokens": 42699208.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 1.0459831846877932,
|
|
"epoch": 1.5445382551214086,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 7.50906937323104e-06,
|
|
"loss": 1.071,
|
|
"mean_token_accuracy": 0.735860938206315,
|
|
"num_tokens": 42843543.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 1.0140997383743524,
|
|
"epoch": 1.5497741998821912,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 7.346577451281822e-06,
|
|
"loss": 1.0028,
|
|
"mean_token_accuracy": 0.7434132274240255,
|
|
"num_tokens": 42991376.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 1.0464172219857573,
|
|
"epoch": 1.555010144642974,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 7.185559585035137e-06,
|
|
"loss": 1.0408,
|
|
"mean_token_accuracy": 0.7328146979212761,
|
|
"num_tokens": 43138825.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 1.0702244764193893,
|
|
"epoch": 1.5602460894037566,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 7.026029219436503e-06,
|
|
"loss": 1.0959,
|
|
"mean_token_accuracy": 0.732689993456006,
|
|
"num_tokens": 43283731.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 1.0607844032347202,
|
|
"epoch": 1.5654820341645395,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 6.8679996752255224e-06,
|
|
"loss": 1.0445,
|
|
"mean_token_accuracy": 0.731873894110322,
|
|
"num_tokens": 43438468.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 1.0623069098219275,
|
|
"epoch": 1.5707179789253223,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 6.711484147823663e-06,
|
|
"loss": 1.0859,
|
|
"mean_token_accuracy": 0.7330159761011601,
|
|
"num_tokens": 43586062.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 1.0011677112430335,
|
|
"epoch": 1.575953923686105,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 6.556495706232412e-06,
|
|
"loss": 1.004,
|
|
"mean_token_accuracy": 0.7418603513389825,
|
|
"num_tokens": 43729361.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 1.0570779686793685,
|
|
"epoch": 1.581189868446888,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 6.403047291942057e-06,
|
|
"loss": 1.0487,
|
|
"mean_token_accuracy": 0.7325568657368422,
|
|
"num_tokens": 43866642.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 1.075397195480764,
|
|
"epoch": 1.5864258132076707,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 6.251151717851023e-06,
|
|
"loss": 1.0957,
|
|
"mean_token_accuracy": 0.731479388475418,
|
|
"num_tokens": 44012594.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 1.0650788258761168,
|
|
"epoch": 1.5916617579684535,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 6.100821667196041e-06,
|
|
"loss": 1.0668,
|
|
"mean_token_accuracy": 0.7329257596284151,
|
|
"num_tokens": 44156219.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 1.075978034362197,
|
|
"epoch": 1.5968977027292364,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 5.952069692493062e-06,
|
|
"loss": 1.0978,
|
|
"mean_token_accuracy": 0.7265279643237591,
|
|
"num_tokens": 44300766.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 1.074189928546548,
|
|
"epoch": 1.602133647490019,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 5.80490821448918e-06,
|
|
"loss": 1.0844,
|
|
"mean_token_accuracy": 0.731312808021903,
|
|
"num_tokens": 44457805.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 1.0550312519073486,
|
|
"epoch": 1.6073695922508018,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 5.65934952112546e-06,
|
|
"loss": 1.0639,
|
|
"mean_token_accuracy": 0.7341483242809772,
|
|
"num_tokens": 44602720.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 1.0775770872831345,
|
|
"epoch": 1.6126055370115844,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 5.5154057665109e-06,
|
|
"loss": 1.0682,
|
|
"mean_token_accuracy": 0.7288901913911104,
|
|
"num_tokens": 44754337.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 1.0554250160232186,
|
|
"epoch": 1.6178414817723672,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 5.373088969907586e-06,
|
|
"loss": 1.0413,
|
|
"mean_token_accuracy": 0.7333483207970858,
|
|
"num_tokens": 44901493.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 1.0390476867556573,
|
|
"epoch": 1.62307742653315,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 5.23241101472709e-06,
|
|
"loss": 1.0654,
|
|
"mean_token_accuracy": 0.7387387953698635,
|
|
"num_tokens": 45051644.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 1.0286037972196937,
|
|
"epoch": 1.6283133712939328,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 5.09338364753818e-06,
|
|
"loss": 1.0294,
|
|
"mean_token_accuracy": 0.7419391922652722,
|
|
"num_tokens": 45206269.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 1.0362283935770393,
|
|
"epoch": 1.6335493160547156,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 4.956018477086005e-06,
|
|
"loss": 1.0556,
|
|
"mean_token_accuracy": 0.7357694737613201,
|
|
"num_tokens": 45352368.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 1.0753269331529736,
|
|
"epoch": 1.6387852608154985,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 4.820326973322764e-06,
|
|
"loss": 1.0746,
|
|
"mean_token_accuracy": 0.7283272542059421,
|
|
"num_tokens": 45499839.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 1.0553035859018565,
|
|
"epoch": 1.6440212055762813,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 4.686320466449981e-06,
|
|
"loss": 1.1012,
|
|
"mean_token_accuracy": 0.7353771705180406,
|
|
"num_tokens": 45638917.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 1.0884598640725016,
|
|
"epoch": 1.649257150337064,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 4.554010145972417e-06,
|
|
"loss": 1.1183,
|
|
"mean_token_accuracy": 0.7301896862685681,
|
|
"num_tokens": 45789557.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 1.01657194532454,
|
|
"epoch": 1.6544930950978467,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 4.423407059763745e-06,
|
|
"loss": 1.0208,
|
|
"mean_token_accuracy": 0.740845986828208,
|
|
"num_tokens": 45932715.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 1.0645186068490147,
|
|
"epoch": 1.6597290398586295,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 4.294522113144078e-06,
|
|
"loss": 1.0814,
|
|
"mean_token_accuracy": 0.7319622810930013,
|
|
"num_tokens": 46085727.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 1.0936360348947347,
|
|
"epoch": 1.6649649846194121,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 4.16736606796938e-06,
|
|
"loss": 1.0958,
|
|
"mean_token_accuracy": 0.7290900621563197,
|
|
"num_tokens": 46237949.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 1.0574020750820636,
|
|
"epoch": 1.670200929380195,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 4.041949541732826e-06,
|
|
"loss": 1.0481,
|
|
"mean_token_accuracy": 0.734756362810731,
|
|
"num_tokens": 46383620.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 1.0825668659992516,
|
|
"epoch": 1.6754368741409777,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 3.9182830066782614e-06,
|
|
"loss": 1.0925,
|
|
"mean_token_accuracy": 0.7287914883345366,
|
|
"num_tokens": 46522204.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 1.0611087726429105,
|
|
"epoch": 1.6806728189017606,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 3.7963767889257704e-06,
|
|
"loss": 1.0773,
|
|
"mean_token_accuracy": 0.7307531669735908,
|
|
"num_tokens": 46664305.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 1.0411738075315953,
|
|
"epoch": 1.6859087636625434,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 3.676241067609465e-06,
|
|
"loss": 1.035,
|
|
"mean_token_accuracy": 0.7382205333560705,
|
|
"num_tokens": 46810702.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 1.023532929085195,
|
|
"epoch": 1.6911447084233262,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 3.5578858740274973e-06,
|
|
"loss": 1.0167,
|
|
"mean_token_accuracy": 0.743023120239377,
|
|
"num_tokens": 46959451.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 1.092939823679626,
|
|
"epoch": 1.696380653184109,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 3.4413210908044696e-06,
|
|
"loss": 1.1151,
|
|
"mean_token_accuracy": 0.7277210278436541,
|
|
"num_tokens": 47108206.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 1.07600337844342,
|
|
"epoch": 1.7016165979448918,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 3.3265564510662343e-06,
|
|
"loss": 1.102,
|
|
"mean_token_accuracy": 0.7284684276208282,
|
|
"num_tokens": 47262079.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 1.0967259481549263,
|
|
"epoch": 1.7068525427056744,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 3.213601537627195e-06,
|
|
"loss": 1.1053,
|
|
"mean_token_accuracy": 0.72360435500741,
|
|
"num_tokens": 47414104.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 1.029846752807498,
|
|
"epoch": 1.7120884874664573,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 3.102465782190106e-06,
|
|
"loss": 1.0467,
|
|
"mean_token_accuracy": 0.7382533248513937,
|
|
"num_tokens": 47555455.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 1.090262323245406,
|
|
"epoch": 1.7173244322272398,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 2.9931584645585654e-06,
|
|
"loss": 1.0854,
|
|
"mean_token_accuracy": 0.7280427444726228,
|
|
"num_tokens": 47689549.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 1.0471955848857761,
|
|
"epoch": 1.7225603769880227,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 2.8856887118621364e-06,
|
|
"loss": 1.0303,
|
|
"mean_token_accuracy": 0.7392646053805947,
|
|
"num_tokens": 47828675.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 1.0756644216366111,
|
|
"epoch": 1.7277963217488055,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 2.7800654977942488e-06,
|
|
"loss": 1.0728,
|
|
"mean_token_accuracy": 0.7287148278206587,
|
|
"num_tokens": 47977441.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 1.0582644551992417,
|
|
"epoch": 1.7330322665095883,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 2.676297641862879e-06,
|
|
"loss": 1.051,
|
|
"mean_token_accuracy": 0.7314374148845673,
|
|
"num_tokens": 48130026.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 1.0880960457026958,
|
|
"epoch": 1.7382682112703711,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 2.5743938086541354e-06,
|
|
"loss": 1.1199,
|
|
"mean_token_accuracy": 0.7294365499168635,
|
|
"num_tokens": 48279615.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 1.0541712949052453,
|
|
"epoch": 1.743504156031154,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 2.4743625071087574e-06,
|
|
"loss": 1.0614,
|
|
"mean_token_accuracy": 0.7369356131181121,
|
|
"num_tokens": 48435016.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 1.0605900973081588,
|
|
"epoch": 1.7487401007919368,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 2.3762120898116498e-06,
|
|
"loss": 1.0477,
|
|
"mean_token_accuracy": 0.7361986979842186,
|
|
"num_tokens": 48575627.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 1.1169460522010923,
|
|
"epoch": 1.7539760455527196,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 2.2799507522944048e-06,
|
|
"loss": 1.143,
|
|
"mean_token_accuracy": 0.7201281778514386,
|
|
"num_tokens": 48712687.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 1.0832444079220296,
|
|
"epoch": 1.7592119903135022,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 2.1855865323510055e-06,
|
|
"loss": 1.0956,
|
|
"mean_token_accuracy": 0.7290066111832857,
|
|
"num_tokens": 48857330.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 1.0283724040724338,
|
|
"epoch": 1.764447935074285,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 2.0931273093666575e-06,
|
|
"loss": 1.0341,
|
|
"mean_token_accuracy": 0.743080747872591,
|
|
"num_tokens": 48998035.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 1.027001916244626,
|
|
"epoch": 1.7696838798350676,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 2.002580803659873e-06,
|
|
"loss": 1.0041,
|
|
"mean_token_accuracy": 0.7414408419281244,
|
|
"num_tokens": 49139070.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 0.9976796295493842,
|
|
"epoch": 1.7749198245958504,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 1.9139545758378256e-06,
|
|
"loss": 0.9828,
|
|
"mean_token_accuracy": 0.7511645819991827,
|
|
"num_tokens": 49276032.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 1.0575186382979154,
|
|
"epoch": 1.7801557693566332,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 1.8272560261650279e-06,
|
|
"loss": 1.0549,
|
|
"mean_token_accuracy": 0.7337985582649708,
|
|
"num_tokens": 49412690.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 1.0142314087599515,
|
|
"epoch": 1.785391714117416,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 1.7424923939454273e-06,
|
|
"loss": 1.0415,
|
|
"mean_token_accuracy": 0.7428950823843479,
|
|
"num_tokens": 49553666.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 1.0519614189863205,
|
|
"epoch": 1.7906276588781989,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 1.6596707569179304e-06,
|
|
"loss": 1.0709,
|
|
"mean_token_accuracy": 0.7382104344666004,
|
|
"num_tokens": 49704519.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 1.0671229269355536,
|
|
"epoch": 1.7958636036389817,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 1.578798030665385e-06,
|
|
"loss": 1.0751,
|
|
"mean_token_accuracy": 0.7311564918607474,
|
|
"num_tokens": 49843624.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 1.0378224339336157,
|
|
"epoch": 1.8010995483997645,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 1.499880968037165e-06,
|
|
"loss": 1.0427,
|
|
"mean_token_accuracy": 0.7399079620838165,
|
|
"num_tokens": 49980104.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 1.1307296685874462,
|
|
"epoch": 1.8063354931605473,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 1.4229261585852805e-06,
|
|
"loss": 1.1653,
|
|
"mean_token_accuracy": 0.7181130038574338,
|
|
"num_tokens": 50135452.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 1.0895096741616725,
|
|
"epoch": 1.81157143792133,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 1.3479400280141884e-06,
|
|
"loss": 1.106,
|
|
"mean_token_accuracy": 0.7267964135855436,
|
|
"num_tokens": 50285568.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 1.1072740200906992,
|
|
"epoch": 1.8168073826821127,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 1.2749288376442043e-06,
|
|
"loss": 1.1295,
|
|
"mean_token_accuracy": 0.7238930713385343,
|
|
"num_tokens": 50431109.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 1.0873282797634602,
|
|
"epoch": 1.8220433274428953,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 1.203898683888713e-06,
|
|
"loss": 1.1206,
|
|
"mean_token_accuracy": 0.7258367579430341,
|
|
"num_tokens": 50572278.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 1.074448931775987,
|
|
"epoch": 1.8272792722036781,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 1.134855497745113e-06,
|
|
"loss": 1.0828,
|
|
"mean_token_accuracy": 0.728118471056223,
|
|
"num_tokens": 50715803.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 1.0785529548302293,
|
|
"epoch": 1.832515216964461,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 1.0678050442995801e-06,
|
|
"loss": 1.0723,
|
|
"mean_token_accuracy": 0.7296169890090823,
|
|
"num_tokens": 50860742.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 1.070153540931642,
|
|
"epoch": 1.8377511617252438,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 1.0027529222456756e-06,
|
|
"loss": 1.0884,
|
|
"mean_token_accuracy": 0.7305432733148336,
|
|
"num_tokens": 51006683.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 1.0611914629116654,
|
|
"epoch": 1.8429871064860266,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 9.397045634168766e-07,
|
|
"loss": 1.043,
|
|
"mean_token_accuracy": 0.7299406290054321,
|
|
"num_tokens": 51154218.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 1.0603390594944357,
|
|
"epoch": 1.8482230512468094,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 8.78665232332998e-07,
|
|
"loss": 1.0571,
|
|
"mean_token_accuracy": 0.7298200543969869,
|
|
"num_tokens": 51299870.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 1.0776327732950448,
|
|
"epoch": 1.8534589960075922,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 8.196400257606207e-07,
|
|
"loss": 1.1094,
|
|
"mean_token_accuracy": 0.7307827772572637,
|
|
"num_tokens": 51449416.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 1.1012815684080124,
|
|
"epoch": 1.858694940768375,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 7.626338722875076e-07,
|
|
"loss": 1.135,
|
|
"mean_token_accuracy": 0.7262665273621678,
|
|
"num_tokens": 51589483.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 1.0277717508375646,
|
|
"epoch": 1.8639308855291576,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 7.076515319110688e-07,
|
|
"loss": 1.0289,
|
|
"mean_token_accuracy": 0.7407132972031831,
|
|
"num_tokens": 51743472.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 0.9922656198963523,
|
|
"epoch": 1.8691668302899405,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 6.54697595640899e-07,
|
|
"loss": 0.9894,
|
|
"mean_token_accuracy": 0.7477642957121133,
|
|
"num_tokens": 51890190.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 1.08709951415658,
|
|
"epoch": 1.874402775050723,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 6.037764851154426e-07,
|
|
"loss": 1.1142,
|
|
"mean_token_accuracy": 0.7220855403691531,
|
|
"num_tokens": 52036734.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 1.0564101081341506,
|
|
"epoch": 1.8796387198115059,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 5.548924522327747e-07,
|
|
"loss": 1.0584,
|
|
"mean_token_accuracy": 0.7324411410838365,
|
|
"num_tokens": 52169348.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 1.0335981843993067,
|
|
"epoch": 1.8848746645722887,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 5.080495787955691e-07,
|
|
"loss": 1.0412,
|
|
"mean_token_accuracy": 0.739568930119276,
|
|
"num_tokens": 52311669.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 1.044294580630958,
|
|
"epoch": 1.8901106093330715,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 4.632517761702815e-07,
|
|
"loss": 1.02,
|
|
"mean_token_accuracy": 0.7376545470207929,
|
|
"num_tokens": 52468463.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 1.0835541209205986,
|
|
"epoch": 1.8953465540938543,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 4.2050278496053587e-07,
|
|
"loss": 1.1012,
|
|
"mean_token_accuracy": 0.729495657980442,
|
|
"num_tokens": 52626044.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 1.0643612802028657,
|
|
"epoch": 1.9005824988546371,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 3.7980617469479953e-07,
|
|
"loss": 1.0706,
|
|
"mean_token_accuracy": 0.7347536141052842,
|
|
"num_tokens": 52772867.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 1.0667219148948788,
|
|
"epoch": 1.90581844361542,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 3.4116534352831576e-07,
|
|
"loss": 1.0627,
|
|
"mean_token_accuracy": 0.7293199263513088,
|
|
"num_tokens": 52908004.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 1.135395216010511,
|
|
"epoch": 1.9110543883762028,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 3.0458351795936703e-07,
|
|
"loss": 1.1365,
|
|
"mean_token_accuracy": 0.7202159762382507,
|
|
"num_tokens": 53044041.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 1.0965589692816138,
|
|
"epoch": 1.9162903331369854,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 2.7006375255985985e-07,
|
|
"loss": 1.0917,
|
|
"mean_token_accuracy": 0.7249834679067135,
|
|
"num_tokens": 53186581.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 1.032194511592388,
|
|
"epoch": 1.9215262778977682,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 2.3760892972027328e-07,
|
|
"loss": 1.0481,
|
|
"mean_token_accuracy": 0.7402662597596645,
|
|
"num_tokens": 53337558.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 1.1025461964309216,
|
|
"epoch": 1.9267622226585508,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 2.072217594089765e-07,
|
|
"loss": 1.1038,
|
|
"mean_token_accuracy": 0.7263487908989191,
|
|
"num_tokens": 53474564.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 1.0789904015138745,
|
|
"epoch": 1.9319981674193336,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 1.7890477894593748e-07,
|
|
"loss": 1.0865,
|
|
"mean_token_accuracy": 0.7280137140303851,
|
|
"num_tokens": 53614875.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 1.0704231640323996,
|
|
"epoch": 1.9372341121801164,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 1.5266035279088708e-07,
|
|
"loss": 1.0724,
|
|
"mean_token_accuracy": 0.7300405781716108,
|
|
"num_tokens": 53764538.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 1.0188416039571166,
|
|
"epoch": 1.9424700569408992,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 1.284906723458462e-07,
|
|
"loss": 1.0245,
|
|
"mean_token_accuracy": 0.738005406036973,
|
|
"num_tokens": 53908241.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 1.033022477477789,
|
|
"epoch": 1.947706001701682,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 1.0639775577218625e-07,
|
|
"loss": 1.0267,
|
|
"mean_token_accuracy": 0.7387443576008081,
|
|
"num_tokens": 54051083.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 1.0919482603669166,
|
|
"epoch": 1.9529419464624649,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 8.638344782207486e-08,
|
|
"loss": 1.0833,
|
|
"mean_token_accuracy": 0.7268906071782112,
|
|
"num_tokens": 54203892.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 1.0021182408556342,
|
|
"epoch": 1.9581778912232477,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 6.84494196844715e-08,
|
|
"loss": 0.9942,
|
|
"mean_token_accuracy": 0.7441616494208574,
|
|
"num_tokens": 54355138.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 1.0863986648619175,
|
|
"epoch": 1.9634138359840305,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 5.2597168845561206e-08,
|
|
"loss": 1.0861,
|
|
"mean_token_accuracy": 0.7288298228755593,
|
|
"num_tokens": 54503031.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 1.0323819531127811,
|
|
"epoch": 1.9686497807448131,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 3.882801896372967e-08,
|
|
"loss": 1.0337,
|
|
"mean_token_accuracy": 0.7384911965578794,
|
|
"num_tokens": 54656622.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 1.0538762006908655,
|
|
"epoch": 1.973885725505596,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 2.7143119759026613e-08,
|
|
"loss": 1.0898,
|
|
"mean_token_accuracy": 0.738424026966095,
|
|
"num_tokens": 54815724.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 1.1061828639358282,
|
|
"epoch": 1.9791216702663785,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 1.754344691717591e-08,
|
|
"loss": 1.0847,
|
|
"mean_token_accuracy": 0.7238223964348436,
|
|
"num_tokens": 54962285.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 1.062331521883607,
|
|
"epoch": 1.9843576150271613,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 1.0029802008096334e-08,
|
|
"loss": 1.0895,
|
|
"mean_token_accuracy": 0.7333987768739462,
|
|
"num_tokens": 55112914.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 1.0460052080452442,
|
|
"epoch": 1.9895935597879442,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 4.602812418974534e-09,
|
|
"loss": 1.0321,
|
|
"mean_token_accuracy": 0.7373423630371689,
|
|
"num_tokens": 55256115.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 1.0018283769488334,
|
|
"epoch": 1.994829504548727,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 1.2629313018819311e-09,
|
|
"loss": 0.9884,
|
|
"mean_token_accuracy": 0.7454275876283646,
|
|
"num_tokens": 55398525.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 1.0823518706462052,
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 1.0437535929996856e-11,
|
|
"loss": 1.1044,
|
|
"mean_token_accuracy": 0.7266742470143717,
|
|
"num_tokens": 55553296.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_entropy": 1.0679908826947213,
|
|
"eval_loss": 1.1749457120895386,
|
|
"eval_mean_token_accuracy": 0.707340413838625,
|
|
"eval_num_tokens": 55553296.0,
|
|
"eval_runtime": 60.9411,
|
|
"eval_samples_per_second": 32.819,
|
|
"eval_steps_per_second": 16.409,
|
|
"step": 3820
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 3820,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5.9198789959923e+17,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|