1364 lines
37 KiB
JSON
1364 lines
37 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 6.0,
|
|
"eval_steps": 500,
|
|
"global_step": 132,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.5756837734952569,
|
|
"epoch": 0.045454545454545456,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.4932,
|
|
"mean_token_accuracy": 0.6398156471550465,
|
|
"num_tokens": 213256.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.5991804897785187,
|
|
"epoch": 0.09090909090909091,
|
|
"grad_norm": 5.0,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"loss": 1.487,
|
|
"mean_token_accuracy": 0.6374303828924894,
|
|
"num_tokens": 427738.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.6032040221616626,
|
|
"epoch": 0.13636363636363635,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": 1.4726,
|
|
"mean_token_accuracy": 0.6417305879294872,
|
|
"num_tokens": 627248.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.5772652318701148,
|
|
"epoch": 0.18181818181818182,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 4.2857142857142855e-06,
|
|
"loss": 1.484,
|
|
"mean_token_accuracy": 0.6436302307993174,
|
|
"num_tokens": 863658.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.5723715648055077,
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 4.625,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": 1.4338,
|
|
"mean_token_accuracy": 0.647509815171361,
|
|
"num_tokens": 1086491.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.6264624744653702,
|
|
"epoch": 0.2727272727272727,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 7.1428571428571436e-06,
|
|
"loss": 1.4245,
|
|
"mean_token_accuracy": 0.6502398364245892,
|
|
"num_tokens": 1285338.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.5932183619588614,
|
|
"epoch": 0.3181818181818182,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 1.4026,
|
|
"mean_token_accuracy": 0.6524394080042839,
|
|
"num_tokens": 1518366.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.6039981953799725,
|
|
"epoch": 0.36363636363636365,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.3217,
|
|
"mean_token_accuracy": 0.6681927014142275,
|
|
"num_tokens": 1734339.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.5884840972721577,
|
|
"epoch": 0.4090909090909091,
|
|
"grad_norm": 3.140625,
|
|
"learning_rate": 9.9984209464165e-06,
|
|
"loss": 1.2699,
|
|
"mean_token_accuracy": 0.6729655731469393,
|
|
"num_tokens": 1958264.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.6048963023349643,
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 9.99368478303009e-06,
|
|
"loss": 1.237,
|
|
"mean_token_accuracy": 0.6799562647938728,
|
|
"num_tokens": 2166887.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.5761522091925144,
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 9.98579450130307e-06,
|
|
"loss": 1.2118,
|
|
"mean_token_accuracy": 0.6838508639484644,
|
|
"num_tokens": 2388908.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.5694795530289412,
|
|
"epoch": 0.5454545454545454,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 9.974755084906503e-06,
|
|
"loss": 1.1741,
|
|
"mean_token_accuracy": 0.6954156272113323,
|
|
"num_tokens": 2605601.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.5798916202038527,
|
|
"epoch": 0.5909090909090909,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 9.960573506572391e-06,
|
|
"loss": 1.1211,
|
|
"mean_token_accuracy": 0.6951853409409523,
|
|
"num_tokens": 2817700.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.5453419080004096,
|
|
"epoch": 0.6363636363636364,
|
|
"grad_norm": 3.1875,
|
|
"learning_rate": 9.94325872368957e-06,
|
|
"loss": 1.1015,
|
|
"mean_token_accuracy": 0.7093381285667419,
|
|
"num_tokens": 3040280.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.5564651843160391,
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 9.922821672646028e-06,
|
|
"loss": 1.081,
|
|
"mean_token_accuracy": 0.7106043919920921,
|
|
"num_tokens": 3247106.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.5601713042706251,
|
|
"epoch": 0.7272727272727273,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 9.899275261921236e-06,
|
|
"loss": 1.0923,
|
|
"mean_token_accuracy": 0.7116606514900923,
|
|
"num_tokens": 3458426.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.5526782963424921,
|
|
"epoch": 0.7727272727272727,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 9.872634363932887e-06,
|
|
"loss": 1.0756,
|
|
"mean_token_accuracy": 0.7068049628287554,
|
|
"num_tokens": 3676439.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.5471083605661988,
|
|
"epoch": 0.8181818181818182,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 9.842915805643156e-06,
|
|
"loss": 1.0221,
|
|
"mean_token_accuracy": 0.7173160128295422,
|
|
"num_tokens": 3894572.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.5807137787342072,
|
|
"epoch": 0.8636363636363636,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 9.81013835793043e-06,
|
|
"loss": 0.9976,
|
|
"mean_token_accuracy": 0.7263931501656771,
|
|
"num_tokens": 4098419.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.5447382191196084,
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 9.774322723733216e-06,
|
|
"loss": 0.9973,
|
|
"mean_token_accuracy": 0.721901087090373,
|
|
"num_tokens": 4317337.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.5411477498710155,
|
|
"epoch": 0.9545454545454546,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 9.735491524973723e-06,
|
|
"loss": 1.0118,
|
|
"mean_token_accuracy": 0.7198995053768158,
|
|
"num_tokens": 4541085.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.5523755457252264,
|
|
"epoch": 1.0,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 9.693669288269371e-06,
|
|
"loss": 0.981,
|
|
"mean_token_accuracy": 0.7275056149810553,
|
|
"num_tokens": 4761424.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.5443653706461191,
|
|
"epoch": 1.0454545454545454,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 9.648882429441258e-06,
|
|
"loss": 0.9525,
|
|
"mean_token_accuracy": 0.7372480425983667,
|
|
"num_tokens": 4972261.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.5743730887770653,
|
|
"epoch": 1.0909090909090908,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 9.601159236829353e-06,
|
|
"loss": 0.9451,
|
|
"mean_token_accuracy": 0.7314453404396772,
|
|
"num_tokens": 5175446.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.5375164104625583,
|
|
"epoch": 1.1363636363636362,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 9.550529853424979e-06,
|
|
"loss": 0.9413,
|
|
"mean_token_accuracy": 0.7362319473177195,
|
|
"num_tokens": 5383844.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.5443251971155405,
|
|
"epoch": 1.1818181818181819,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 9.497026257831856e-06,
|
|
"loss": 0.976,
|
|
"mean_token_accuracy": 0.7270661573857069,
|
|
"num_tokens": 5603658.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.517438679933548,
|
|
"epoch": 1.2272727272727273,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 9.440682244067724e-06,
|
|
"loss": 0.9339,
|
|
"mean_token_accuracy": 0.7412919718772173,
|
|
"num_tokens": 5833420.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.5209640683606267,
|
|
"epoch": 1.2727272727272727,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 9.381533400219319e-06,
|
|
"loss": 0.9073,
|
|
"mean_token_accuracy": 0.7471859473735094,
|
|
"num_tokens": 6049437.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.5130348689854145,
|
|
"epoch": 1.3181818181818181,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 9.319617085964177e-06,
|
|
"loss": 0.8903,
|
|
"mean_token_accuracy": 0.7493606805801392,
|
|
"num_tokens": 6272070.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.5314307110384107,
|
|
"epoch": 1.3636363636363638,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 9.25497240897346e-06,
|
|
"loss": 0.8725,
|
|
"mean_token_accuracy": 0.7509097009897232,
|
|
"num_tokens": 6480742.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.5306935114786029,
|
|
"epoch": 1.4090909090909092,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 9.18764020021071e-06,
|
|
"loss": 0.8547,
|
|
"mean_token_accuracy": 0.752067357301712,
|
|
"num_tokens": 6683773.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.5351670542731881,
|
|
"epoch": 1.4545454545454546,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 9.117662988142138e-06,
|
|
"loss": 0.8583,
|
|
"mean_token_accuracy": 0.7536429259926081,
|
|
"num_tokens": 6886687.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.5129835363477468,
|
|
"epoch": 1.5,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 9.045084971874738e-06,
|
|
"loss": 0.89,
|
|
"mean_token_accuracy": 0.7455399166792631,
|
|
"num_tokens": 7115408.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.5456664310768247,
|
|
"epoch": 1.5454545454545454,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 8.969951993239177e-06,
|
|
"loss": 0.8931,
|
|
"mean_token_accuracy": 0.7431404571980238,
|
|
"num_tokens": 7311605.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.5233238851651549,
|
|
"epoch": 1.5909090909090908,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 8.892311507835118e-06,
|
|
"loss": 0.894,
|
|
"mean_token_accuracy": 0.7455066256225109,
|
|
"num_tokens": 7536281.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.5304507119581103,
|
|
"epoch": 1.6363636363636362,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 8.81221255505724e-06,
|
|
"loss": 0.8802,
|
|
"mean_token_accuracy": 0.7477323599159718,
|
|
"num_tokens": 7750772.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.5277951331809163,
|
|
"epoch": 1.6818181818181817,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 8.729705727120911e-06,
|
|
"loss": 0.8726,
|
|
"mean_token_accuracy": 0.7508427072316408,
|
|
"num_tokens": 7974590.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.5125188445672393,
|
|
"epoch": 1.7272727272727273,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 8.644843137107058e-06,
|
|
"loss": 0.8935,
|
|
"mean_token_accuracy": 0.7437247112393379,
|
|
"num_tokens": 8201132.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.5137846125289798,
|
|
"epoch": 1.7727272727272727,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 8.557678386046429e-06,
|
|
"loss": 0.8641,
|
|
"mean_token_accuracy": 0.7521512098610401,
|
|
"num_tokens": 8422575.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.515689549036324,
|
|
"epoch": 1.8181818181818183,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 8.468266529064025e-06,
|
|
"loss": 0.8577,
|
|
"mean_token_accuracy": 0.752921536564827,
|
|
"num_tokens": 8635107.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.5162663543596864,
|
|
"epoch": 1.8636363636363638,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 8.376664040605122e-06,
|
|
"loss": 0.864,
|
|
"mean_token_accuracy": 0.7545531969517469,
|
|
"num_tokens": 8842212.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.514712393283844,
|
|
"epoch": 1.9090909090909092,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 8.282928778764783e-06,
|
|
"loss": 0.8809,
|
|
"mean_token_accuracy": 0.7484686318784952,
|
|
"num_tokens": 9070194.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.5233986722305417,
|
|
"epoch": 1.9545454545454546,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 8.18711994874345e-06,
|
|
"loss": 0.8139,
|
|
"mean_token_accuracy": 0.7604574281722307,
|
|
"num_tokens": 9267652.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.521757710725069,
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 8.089298065451673e-06,
|
|
"loss": 0.8508,
|
|
"mean_token_accuracy": 0.75594780780375,
|
|
"num_tokens": 9490017.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.5071435309946537,
|
|
"epoch": 2.0454545454545454,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 7.989524915287595e-06,
|
|
"loss": 0.8516,
|
|
"mean_token_accuracy": 0.7543456256389618,
|
|
"num_tokens": 9711919.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.5045623360201716,
|
|
"epoch": 2.090909090909091,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 7.887863517111337e-06,
|
|
"loss": 0.8573,
|
|
"mean_token_accuracy": 0.7523710802197456,
|
|
"num_tokens": 9949908.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.5323316175490618,
|
|
"epoch": 2.1363636363636362,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 7.78437808244094e-06,
|
|
"loss": 0.8448,
|
|
"mean_token_accuracy": 0.7550894934684038,
|
|
"num_tokens": 10155466.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.5101688215509057,
|
|
"epoch": 2.1818181818181817,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 7.679133974894984e-06,
|
|
"loss": 0.8411,
|
|
"mean_token_accuracy": 0.7548086270689964,
|
|
"num_tokens": 10371898.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.5281436312943697,
|
|
"epoch": 2.227272727272727,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 7.572197668907533e-06,
|
|
"loss": 0.8431,
|
|
"mean_token_accuracy": 0.7579620629549026,
|
|
"num_tokens": 10586035.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.5026519363746047,
|
|
"epoch": 2.2727272727272725,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 7.463636707741458e-06,
|
|
"loss": 0.8355,
|
|
"mean_token_accuracy": 0.7575494665652514,
|
|
"num_tokens": 10814372.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.5014037564396858,
|
|
"epoch": 2.3181818181818183,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 7.353519660826665e-06,
|
|
"loss": 0.8185,
|
|
"mean_token_accuracy": 0.7590322364121675,
|
|
"num_tokens": 11035168.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.5116605255752802,
|
|
"epoch": 2.3636363636363638,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 7.241916080450163e-06,
|
|
"loss": 0.8359,
|
|
"mean_token_accuracy": 0.7558552380651236,
|
|
"num_tokens": 11249229.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.5000560870394111,
|
|
"epoch": 2.409090909090909,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 7.128896457825364e-06,
|
|
"loss": 0.8081,
|
|
"mean_token_accuracy": 0.7626348324120045,
|
|
"num_tokens": 11462723.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.5258028572425246,
|
|
"epoch": 2.4545454545454546,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 7.014532178568314e-06,
|
|
"loss": 0.83,
|
|
"mean_token_accuracy": 0.7577290665358305,
|
|
"num_tokens": 11664590.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.49798215832561255,
|
|
"epoch": 2.5,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 6.898895477609007e-06,
|
|
"loss": 0.83,
|
|
"mean_token_accuracy": 0.757939899340272,
|
|
"num_tokens": 11894045.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.5007442878559232,
|
|
"epoch": 2.5454545454545454,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 6.782059393566254e-06,
|
|
"loss": 0.8315,
|
|
"mean_token_accuracy": 0.75820074044168,
|
|
"num_tokens": 12115353.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.5033395420759916,
|
|
"epoch": 2.590909090909091,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 6.664097722614934e-06,
|
|
"loss": 0.8273,
|
|
"mean_token_accuracy": 0.7577789463102818,
|
|
"num_tokens": 12332745.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.5042030932381749,
|
|
"epoch": 2.6363636363636362,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 6.545084971874738e-06,
|
|
"loss": 0.8343,
|
|
"mean_token_accuracy": 0.7549517750740051,
|
|
"num_tokens": 12554302.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.509183426387608,
|
|
"epoch": 2.6818181818181817,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 6.425096312349881e-06,
|
|
"loss": 0.796,
|
|
"mean_token_accuracy": 0.7663499284535646,
|
|
"num_tokens": 12764709.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.5077009173110127,
|
|
"epoch": 2.7272727272727275,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 6.304207531449486e-06,
|
|
"loss": 0.8174,
|
|
"mean_token_accuracy": 0.7621090263128281,
|
|
"num_tokens": 12983370.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.5197898102924228,
|
|
"epoch": 2.7727272727272725,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 6.182494985118625e-06,
|
|
"loss": 0.8088,
|
|
"mean_token_accuracy": 0.7662495765835047,
|
|
"num_tokens": 13195618.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.49907076358795166,
|
|
"epoch": 2.8181818181818183,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 6.060035549610275e-06,
|
|
"loss": 0.7853,
|
|
"mean_token_accuracy": 0.7684814091771841,
|
|
"num_tokens": 13409539.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.5073017841205001,
|
|
"epoch": 2.8636363636363638,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 5.936906572928625e-06,
|
|
"loss": 0.8065,
|
|
"mean_token_accuracy": 0.7650269959121943,
|
|
"num_tokens": 13620769.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.5056956252083182,
|
|
"epoch": 2.909090909090909,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 5.813185825974419e-06,
|
|
"loss": 0.8059,
|
|
"mean_token_accuracy": 0.7649325635284185,
|
|
"num_tokens": 13842038.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.5035996483638883,
|
|
"epoch": 2.9545454545454546,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 5.68895145342319e-06,
|
|
"loss": 0.8247,
|
|
"mean_token_accuracy": 0.7576907705515623,
|
|
"num_tokens": 14060587.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.5117500508204103,
|
|
"epoch": 3.0,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 5.5642819243674085e-06,
|
|
"loss": 0.7885,
|
|
"mean_token_accuracy": 0.7658284697681665,
|
|
"num_tokens": 14271664.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.5234015788882971,
|
|
"epoch": 3.0454545454545454,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 5.439255982753717e-06,
|
|
"loss": 0.8105,
|
|
"mean_token_accuracy": 0.763946671038866,
|
|
"num_tokens": 14482081.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.49382613878697157,
|
|
"epoch": 3.090909090909091,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 5.3139525976465675e-06,
|
|
"loss": 0.8101,
|
|
"mean_token_accuracy": 0.7636537831276655,
|
|
"num_tokens": 14716672.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.5024409759789705,
|
|
"epoch": 3.1363636363636362,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 5.188450913349674e-06,
|
|
"loss": 0.7897,
|
|
"mean_token_accuracy": 0.7669059839099646,
|
|
"num_tokens": 14933081.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.5273290555924177,
|
|
"epoch": 3.1818181818181817,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 5.062830199416764e-06,
|
|
"loss": 0.7848,
|
|
"mean_token_accuracy": 0.7632801961153746,
|
|
"num_tokens": 15136984.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.5051014283671975,
|
|
"epoch": 3.227272727272727,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 4.937169800583237e-06,
|
|
"loss": 0.7991,
|
|
"mean_token_accuracy": 0.7659726981073618,
|
|
"num_tokens": 15350959.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.5017923256382346,
|
|
"epoch": 3.2727272727272725,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 4.811549086650327e-06,
|
|
"loss": 0.7963,
|
|
"mean_token_accuracy": 0.7660163976252079,
|
|
"num_tokens": 15568699.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.5082337036728859,
|
|
"epoch": 3.3181818181818183,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 4.686047402353433e-06,
|
|
"loss": 0.8051,
|
|
"mean_token_accuracy": 0.7663215212523937,
|
|
"num_tokens": 15778760.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.49399478174746037,
|
|
"epoch": 3.3636363636363638,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 4.560744017246284e-06,
|
|
"loss": 0.787,
|
|
"mean_token_accuracy": 0.7704313322901726,
|
|
"num_tokens": 15997159.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.5041727554053068,
|
|
"epoch": 3.409090909090909,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 4.4357180756325915e-06,
|
|
"loss": 0.7801,
|
|
"mean_token_accuracy": 0.7677918504923582,
|
|
"num_tokens": 16215568.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.4995924336835742,
|
|
"epoch": 3.4545454545454546,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 4.31104854657681e-06,
|
|
"loss": 0.7761,
|
|
"mean_token_accuracy": 0.7708524893969297,
|
|
"num_tokens": 16425520.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.5208209808915854,
|
|
"epoch": 3.5,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 4.186814174025582e-06,
|
|
"loss": 0.7868,
|
|
"mean_token_accuracy": 0.7692476995289326,
|
|
"num_tokens": 16623924.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.5112534500658512,
|
|
"epoch": 3.5454545454545454,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 4.063093427071376e-06,
|
|
"loss": 0.8215,
|
|
"mean_token_accuracy": 0.7575955875217915,
|
|
"num_tokens": 16837670.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.48737939167767763,
|
|
"epoch": 3.590909090909091,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 3.939964450389728e-06,
|
|
"loss": 0.7774,
|
|
"mean_token_accuracy": 0.7722372729331255,
|
|
"num_tokens": 17055501.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.50137943867594,
|
|
"epoch": 3.6363636363636362,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 3.817505014881378e-06,
|
|
"loss": 0.7922,
|
|
"mean_token_accuracy": 0.7668295446783304,
|
|
"num_tokens": 17276503.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.4918771870434284,
|
|
"epoch": 3.6818181818181817,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 3.695792468550517e-06,
|
|
"loss": 0.797,
|
|
"mean_token_accuracy": 0.7658107988536358,
|
|
"num_tokens": 17500421.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.49470567237585783,
|
|
"epoch": 3.7272727272727275,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 3.5749036876501196e-06,
|
|
"loss": 0.7736,
|
|
"mean_token_accuracy": 0.770993497222662,
|
|
"num_tokens": 17726906.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.49783285334706306,
|
|
"epoch": 3.7727272727272725,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 3.4549150281252635e-06,
|
|
"loss": 0.8268,
|
|
"mean_token_accuracy": 0.759495971724391,
|
|
"num_tokens": 17936432.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.5077159395441413,
|
|
"epoch": 3.8181818181818183,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 3.3359022773850673e-06,
|
|
"loss": 0.787,
|
|
"mean_token_accuracy": 0.7670105397701263,
|
|
"num_tokens": 18151745.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.4940468706190586,
|
|
"epoch": 3.8636363636363638,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 3.217940606433747e-06,
|
|
"loss": 0.7725,
|
|
"mean_token_accuracy": 0.7713121753185987,
|
|
"num_tokens": 18368653.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.5109028052538633,
|
|
"epoch": 3.909090909090909,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 3.1011045223909954e-06,
|
|
"loss": 0.7891,
|
|
"mean_token_accuracy": 0.765845526009798,
|
|
"num_tokens": 18580544.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.5074712671339512,
|
|
"epoch": 3.9545454545454546,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 2.9854678214316875e-06,
|
|
"loss": 0.7784,
|
|
"mean_token_accuracy": 0.770260414108634,
|
|
"num_tokens": 18793479.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.5031467285007238,
|
|
"epoch": 4.0,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 2.871103542174637e-06,
|
|
"loss": 0.8017,
|
|
"mean_token_accuracy": 0.7639777194708586,
|
|
"num_tokens": 19017990.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.5008434914052486,
|
|
"epoch": 4.045454545454546,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 2.7580839195498397e-06,
|
|
"loss": 0.7876,
|
|
"mean_token_accuracy": 0.765197154134512,
|
|
"num_tokens": 19232836.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.5124122239649296,
|
|
"epoch": 4.090909090909091,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 2.646480339173337e-06,
|
|
"loss": 0.7836,
|
|
"mean_token_accuracy": 0.7690869830548763,
|
|
"num_tokens": 19439837.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.505384799093008,
|
|
"epoch": 4.136363636363637,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 2.536363292258543e-06,
|
|
"loss": 0.7888,
|
|
"mean_token_accuracy": 0.767447579652071,
|
|
"num_tokens": 19649689.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.5008498653769493,
|
|
"epoch": 4.181818181818182,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 2.4278023310924676e-06,
|
|
"loss": 0.7858,
|
|
"mean_token_accuracy": 0.7692194785922766,
|
|
"num_tokens": 19865021.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.49786832462996244,
|
|
"epoch": 4.2272727272727275,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 2.320866025105016e-06,
|
|
"loss": 0.7896,
|
|
"mean_token_accuracy": 0.7659407686442137,
|
|
"num_tokens": 20088549.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.4926151381805539,
|
|
"epoch": 4.2727272727272725,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 2.2156219175590623e-06,
|
|
"loss": 0.7791,
|
|
"mean_token_accuracy": 0.7721114605665207,
|
|
"num_tokens": 20314204.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.4964353507384658,
|
|
"epoch": 4.318181818181818,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 2.112136482888663e-06,
|
|
"loss": 0.7619,
|
|
"mean_token_accuracy": 0.7737916205078363,
|
|
"num_tokens": 20523190.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.5042525352910161,
|
|
"epoch": 4.363636363636363,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 2.0104750847124075e-06,
|
|
"loss": 0.7674,
|
|
"mean_token_accuracy": 0.7730825170874596,
|
|
"num_tokens": 20729046.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.4980954099446535,
|
|
"epoch": 4.409090909090909,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 1.910701934548329e-06,
|
|
"loss": 0.7684,
|
|
"mean_token_accuracy": 0.7743080649524927,
|
|
"num_tokens": 20925674.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.5134334182366729,
|
|
"epoch": 4.454545454545454,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 1.8128800512565514e-06,
|
|
"loss": 0.7852,
|
|
"mean_token_accuracy": 0.7661070600152016,
|
|
"num_tokens": 21149856.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.48121114261448383,
|
|
"epoch": 4.5,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 1.7170712212352187e-06,
|
|
"loss": 0.781,
|
|
"mean_token_accuracy": 0.7670229282230139,
|
|
"num_tokens": 21380719.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.4965139916166663,
|
|
"epoch": 4.545454545454545,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 1.6233359593948777e-06,
|
|
"loss": 0.7825,
|
|
"mean_token_accuracy": 0.7711650598794222,
|
|
"num_tokens": 21593512.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.49770417250692844,
|
|
"epoch": 4.590909090909091,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 1.531733470935976e-06,
|
|
"loss": 0.7974,
|
|
"mean_token_accuracy": 0.7630838695913553,
|
|
"num_tokens": 21813186.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.5052190851420164,
|
|
"epoch": 4.636363636363637,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.4423216139535735e-06,
|
|
"loss": 0.7991,
|
|
"mean_token_accuracy": 0.7624437268823385,
|
|
"num_tokens": 22026623.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.5221615890040994,
|
|
"epoch": 4.681818181818182,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 1.3551568628929434e-06,
|
|
"loss": 0.7913,
|
|
"mean_token_accuracy": 0.7647000271826982,
|
|
"num_tokens": 22244422.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.5008693430572748,
|
|
"epoch": 4.7272727272727275,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 1.2702942728790897e-06,
|
|
"loss": 0.7834,
|
|
"mean_token_accuracy": 0.7710994388908148,
|
|
"num_tokens": 22463984.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.48945672158151865,
|
|
"epoch": 4.7727272727272725,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 1.18778744494276e-06,
|
|
"loss": 0.7905,
|
|
"mean_token_accuracy": 0.7667012866586447,
|
|
"num_tokens": 22682817.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.48861549887806177,
|
|
"epoch": 4.818181818181818,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 1.1076884921648834e-06,
|
|
"loss": 0.8045,
|
|
"mean_token_accuracy": 0.7612419724464417,
|
|
"num_tokens": 22917878.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.49805399868637323,
|
|
"epoch": 4.863636363636363,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 1.0300480067608232e-06,
|
|
"loss": 0.7937,
|
|
"mean_token_accuracy": 0.7647292520850897,
|
|
"num_tokens": 23123886.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.5075469352304935,
|
|
"epoch": 4.909090909090909,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 9.549150281252633e-07,
|
|
"loss": 0.7662,
|
|
"mean_token_accuracy": 0.7728671338409185,
|
|
"num_tokens": 23326280.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.4884511986747384,
|
|
"epoch": 4.954545454545455,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 8.823370118578628e-07,
|
|
"loss": 0.7868,
|
|
"mean_token_accuracy": 0.7672145701944828,
|
|
"num_tokens": 23570721.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.4963974915444851,
|
|
"epoch": 5.0,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 8.123597997892918e-07,
|
|
"loss": 0.8003,
|
|
"mean_token_accuracy": 0.7601921837776899,
|
|
"num_tokens": 23797581.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.5236662002280354,
|
|
"epoch": 5.045454545454546,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 7.450275910265415e-07,
|
|
"loss": 0.7908,
|
|
"mean_token_accuracy": 0.7661727890372276,
|
|
"num_tokens": 23988050.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 0.5031342897564173,
|
|
"epoch": 5.090909090909091,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 6.803829140358237e-07,
|
|
"loss": 0.7843,
|
|
"mean_token_accuracy": 0.769377738237381,
|
|
"num_tokens": 24194925.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 0.49556155782192945,
|
|
"epoch": 5.136363636363637,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 6.184665997806832e-07,
|
|
"loss": 0.7705,
|
|
"mean_token_accuracy": 0.771408112719655,
|
|
"num_tokens": 24414755.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 0.4983775094151497,
|
|
"epoch": 5.181818181818182,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 5.593177559322776e-07,
|
|
"loss": 0.771,
|
|
"mean_token_accuracy": 0.7732552234083414,
|
|
"num_tokens": 24630985.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 0.509802995249629,
|
|
"epoch": 5.2272727272727275,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 5.029737421681446e-07,
|
|
"loss": 0.7953,
|
|
"mean_token_accuracy": 0.7667434271425009,
|
|
"num_tokens": 24854501.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 0.49124314822256565,
|
|
"epoch": 5.2727272727272725,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 4.494701465750217e-07,
|
|
"loss": 0.7816,
|
|
"mean_token_accuracy": 0.7701654080301523,
|
|
"num_tokens": 25085284.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 0.5039074392989278,
|
|
"epoch": 5.318181818181818,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 3.9884076317064813e-07,
|
|
"loss": 0.7772,
|
|
"mean_token_accuracy": 0.7688853479921818,
|
|
"num_tokens": 25297733.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 0.49862359277904034,
|
|
"epoch": 5.363636363636363,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 3.511175705587433e-07,
|
|
"loss": 0.7853,
|
|
"mean_token_accuracy": 0.7682102452963591,
|
|
"num_tokens": 25518971.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 0.4892881168052554,
|
|
"epoch": 5.409090909090909,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 3.0633071173062966e-07,
|
|
"loss": 0.788,
|
|
"mean_token_accuracy": 0.767511548474431,
|
|
"num_tokens": 25748568.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 0.5163639336824417,
|
|
"epoch": 5.454545454545454,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 2.6450847502627883e-07,
|
|
"loss": 0.7587,
|
|
"mean_token_accuracy": 0.7713187728077173,
|
|
"num_tokens": 25945280.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.49786319863051176,
|
|
"epoch": 5.5,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 2.2567727626678527e-07,
|
|
"loss": 0.7704,
|
|
"mean_token_accuracy": 0.7727676276117563,
|
|
"num_tokens": 26165085.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 0.48858580458909273,
|
|
"epoch": 5.545454545454545,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 1.8986164206957037e-07,
|
|
"loss": 0.7763,
|
|
"mean_token_accuracy": 0.7700363770127296,
|
|
"num_tokens": 26389656.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 0.49373806826770306,
|
|
"epoch": 5.590909090909091,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 1.5708419435684463e-07,
|
|
"loss": 0.7705,
|
|
"mean_token_accuracy": 0.770933760330081,
|
|
"num_tokens": 26612348.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 0.5205885702744126,
|
|
"epoch": 5.636363636363637,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 1.2736563606711384e-07,
|
|
"loss": 0.7881,
|
|
"mean_token_accuracy": 0.7669357471168041,
|
|
"num_tokens": 26814128.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 0.5114659816026688,
|
|
"epoch": 5.681818181818182,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 1.007247380787657e-07,
|
|
"loss": 0.7778,
|
|
"mean_token_accuracy": 0.7689048480242491,
|
|
"num_tokens": 27017995.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 0.5094327395781875,
|
|
"epoch": 5.7272727272727275,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 7.717832735397335e-08,
|
|
"loss": 0.8061,
|
|
"mean_token_accuracy": 0.7618958260864019,
|
|
"num_tokens": 27236093.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 0.49905121326446533,
|
|
"epoch": 5.7727272727272725,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 5.674127631043025e-08,
|
|
"loss": 0.7821,
|
|
"mean_token_accuracy": 0.765771547332406,
|
|
"num_tokens": 27444153.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 0.5201311567798257,
|
|
"epoch": 5.818181818181818,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 3.9426493427611177e-08,
|
|
"loss": 0.7799,
|
|
"mean_token_accuracy": 0.7713412661105394,
|
|
"num_tokens": 27648072.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 0.4998158114030957,
|
|
"epoch": 5.863636363636363,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 2.5244915093499134e-08,
|
|
"loss": 0.7712,
|
|
"mean_token_accuracy": 0.7739516459405422,
|
|
"num_tokens": 27855673.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 0.5035147462040186,
|
|
"epoch": 5.909090909090909,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 1.4205498696930332e-08,
|
|
"loss": 0.7693,
|
|
"mean_token_accuracy": 0.7732928432524204,
|
|
"num_tokens": 28069882.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.50436632335186,
|
|
"epoch": 5.954545454545455,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 6.315216969912663e-09,
|
|
"loss": 0.7765,
|
|
"mean_token_accuracy": 0.7761289775371552,
|
|
"num_tokens": 28286266.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 0.4877948518842459,
|
|
"epoch": 6.0,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 1.5790535835003006e-09,
|
|
"loss": 0.7997,
|
|
"mean_token_accuracy": 0.7679535001516342,
|
|
"num_tokens": 28515234.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"step": 132,
|
|
"total_flos": 6.216626585809981e+17,
|
|
"train_loss": 0.8825002660353979,
|
|
"train_runtime": 2572.3849,
|
|
"train_samples_per_second": 2.808,
|
|
"train_steps_per_second": 0.051
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 132,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 6,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.216626585809981e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|