3691 lines
99 KiB
JSON
3691 lines
99 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.0,
|
|
"eval_steps": 500,
|
|
"global_step": 456,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0043859649122807015,
|
|
"grad_norm": 100.72002410888672,
|
|
"learning_rate": 2.173913043478261e-07,
|
|
"loss": 2.8882,
|
|
"mean_token_accuracy": 0.5581395626068115,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.008771929824561403,
|
|
"grad_norm": 102.30889892578125,
|
|
"learning_rate": 4.347826086956522e-07,
|
|
"loss": 2.7578,
|
|
"mean_token_accuracy": 0.5808748602867126,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.013157894736842105,
|
|
"grad_norm": 95.59414672851562,
|
|
"learning_rate": 6.521739130434783e-07,
|
|
"loss": 2.8921,
|
|
"mean_token_accuracy": 0.5645005106925964,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.017543859649122806,
|
|
"grad_norm": 104.23493957519531,
|
|
"learning_rate": 8.695652173913044e-07,
|
|
"loss": 2.8879,
|
|
"mean_token_accuracy": 0.5589743852615356,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.021929824561403508,
|
|
"grad_norm": 104.50048065185547,
|
|
"learning_rate": 1.0869565217391306e-06,
|
|
"loss": 2.8202,
|
|
"mean_token_accuracy": 0.5714285969734192,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.02631578947368421,
|
|
"grad_norm": 92.29496765136719,
|
|
"learning_rate": 1.3043478260869566e-06,
|
|
"loss": 2.5391,
|
|
"mean_token_accuracy": 0.6177605986595154,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.03070175438596491,
|
|
"grad_norm": 97.3690414428711,
|
|
"learning_rate": 1.521739130434783e-06,
|
|
"loss": 2.5906,
|
|
"mean_token_accuracy": 0.6210092902183533,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.03508771929824561,
|
|
"grad_norm": 104.06405639648438,
|
|
"learning_rate": 1.7391304347826088e-06,
|
|
"loss": 2.1757,
|
|
"mean_token_accuracy": 0.7049999833106995,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.039473684210526314,
|
|
"grad_norm": 88.53056335449219,
|
|
"learning_rate": 1.956521739130435e-06,
|
|
"loss": 2.2199,
|
|
"mean_token_accuracy": 0.6958661675453186,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.043859649122807015,
|
|
"grad_norm": 61.665733337402344,
|
|
"learning_rate": 2.173913043478261e-06,
|
|
"loss": 1.6644,
|
|
"mean_token_accuracy": 0.7212614417076111,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.04824561403508772,
|
|
"grad_norm": 45.59661865234375,
|
|
"learning_rate": 2.391304347826087e-06,
|
|
"loss": 1.3645,
|
|
"mean_token_accuracy": 0.7565470337867737,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.05263157894736842,
|
|
"grad_norm": 42.23125076293945,
|
|
"learning_rate": 2.6086956521739132e-06,
|
|
"loss": 1.3545,
|
|
"mean_token_accuracy": 0.7615955471992493,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.05701754385964912,
|
|
"grad_norm": 41.3696403503418,
|
|
"learning_rate": 2.8260869565217393e-06,
|
|
"loss": 1.2655,
|
|
"mean_token_accuracy": 0.7810526490211487,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.06140350877192982,
|
|
"grad_norm": 20.49009895324707,
|
|
"learning_rate": 3.043478260869566e-06,
|
|
"loss": 0.8974,
|
|
"mean_token_accuracy": 0.8418847918510437,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.06578947368421052,
|
|
"grad_norm": 16.676067352294922,
|
|
"learning_rate": 3.2608695652173914e-06,
|
|
"loss": 0.7967,
|
|
"mean_token_accuracy": 0.8536585569381714,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.07017543859649122,
|
|
"grad_norm": 14.577007293701172,
|
|
"learning_rate": 3.4782608695652175e-06,
|
|
"loss": 0.7143,
|
|
"mean_token_accuracy": 0.8759920597076416,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.07456140350877193,
|
|
"grad_norm": 12.738288879394531,
|
|
"learning_rate": 3.6956521739130436e-06,
|
|
"loss": 0.628,
|
|
"mean_token_accuracy": 0.8964803218841553,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.07894736842105263,
|
|
"grad_norm": 12.496009826660156,
|
|
"learning_rate": 3.91304347826087e-06,
|
|
"loss": 0.6106,
|
|
"mean_token_accuracy": 0.902184247970581,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.08333333333333333,
|
|
"grad_norm": 12.505276679992676,
|
|
"learning_rate": 4.130434782608696e-06,
|
|
"loss": 0.5771,
|
|
"mean_token_accuracy": 0.9018287062644958,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.08771929824561403,
|
|
"grad_norm": 14.074338912963867,
|
|
"learning_rate": 4.347826086956522e-06,
|
|
"loss": 0.5896,
|
|
"mean_token_accuracy": 0.9002057909965515,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.09210526315789473,
|
|
"grad_norm": 11.973801612854004,
|
|
"learning_rate": 4.565217391304348e-06,
|
|
"loss": 0.5058,
|
|
"mean_token_accuracy": 0.9128630757331848,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.09649122807017543,
|
|
"grad_norm": 12.036285400390625,
|
|
"learning_rate": 4.782608695652174e-06,
|
|
"loss": 0.445,
|
|
"mean_token_accuracy": 0.9133782386779785,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.10087719298245613,
|
|
"grad_norm": 9.241310119628906,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.4018,
|
|
"mean_token_accuracy": 0.9313346147537231,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.10526315789473684,
|
|
"grad_norm": 11.00639533996582,
|
|
"learning_rate": 5.2173913043478265e-06,
|
|
"loss": 0.4168,
|
|
"mean_token_accuracy": 0.9190573692321777,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.10964912280701754,
|
|
"grad_norm": 10.37211799621582,
|
|
"learning_rate": 5.4347826086956525e-06,
|
|
"loss": 0.4274,
|
|
"mean_token_accuracy": 0.9305699467658997,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.11403508771929824,
|
|
"grad_norm": 9.478952407836914,
|
|
"learning_rate": 5.652173913043479e-06,
|
|
"loss": 0.3642,
|
|
"mean_token_accuracy": 0.9342916011810303,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.11842105263157894,
|
|
"grad_norm": 10.124141693115234,
|
|
"learning_rate": 5.8695652173913055e-06,
|
|
"loss": 0.4091,
|
|
"mean_token_accuracy": 0.9287148714065552,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.12280701754385964,
|
|
"grad_norm": 9.518803596496582,
|
|
"learning_rate": 6.086956521739132e-06,
|
|
"loss": 0.3748,
|
|
"mean_token_accuracy": 0.9275220632553101,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.12719298245614036,
|
|
"grad_norm": 9.434673309326172,
|
|
"learning_rate": 6.304347826086958e-06,
|
|
"loss": 0.3611,
|
|
"mean_token_accuracy": 0.9374359250068665,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.13157894736842105,
|
|
"grad_norm": 11.588494300842285,
|
|
"learning_rate": 6.521739130434783e-06,
|
|
"loss": 0.426,
|
|
"mean_token_accuracy": 0.9164086580276489,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.13596491228070176,
|
|
"grad_norm": 9.927175521850586,
|
|
"learning_rate": 6.739130434782609e-06,
|
|
"loss": 0.3641,
|
|
"mean_token_accuracy": 0.9309309124946594,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.14035087719298245,
|
|
"grad_norm": 9.435439109802246,
|
|
"learning_rate": 6.956521739130435e-06,
|
|
"loss": 0.3461,
|
|
"mean_token_accuracy": 0.9301868081092834,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.14473684210526316,
|
|
"grad_norm": 9.709479331970215,
|
|
"learning_rate": 7.173913043478261e-06,
|
|
"loss": 0.3551,
|
|
"mean_token_accuracy": 0.9335317611694336,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.14912280701754385,
|
|
"grad_norm": 9.065229415893555,
|
|
"learning_rate": 7.391304347826087e-06,
|
|
"loss": 0.3177,
|
|
"mean_token_accuracy": 0.9381918907165527,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.15350877192982457,
|
|
"grad_norm": 9.64979076385498,
|
|
"learning_rate": 7.608695652173914e-06,
|
|
"loss": 0.3644,
|
|
"mean_token_accuracy": 0.9416754841804504,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.15789473684210525,
|
|
"grad_norm": 9.246652603149414,
|
|
"learning_rate": 7.82608695652174e-06,
|
|
"loss": 0.3028,
|
|
"mean_token_accuracy": 0.9778012633323669,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.16228070175438597,
|
|
"grad_norm": 8.778101921081543,
|
|
"learning_rate": 8.043478260869566e-06,
|
|
"loss": 0.2831,
|
|
"mean_token_accuracy": 0.9801587462425232,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.16666666666666666,
|
|
"grad_norm": 9.583342552185059,
|
|
"learning_rate": 8.260869565217392e-06,
|
|
"loss": 0.337,
|
|
"mean_token_accuracy": 0.9642857313156128,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.17105263157894737,
|
|
"grad_norm": 9.742164611816406,
|
|
"learning_rate": 8.478260869565218e-06,
|
|
"loss": 0.292,
|
|
"mean_token_accuracy": 0.9780701994895935,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.17543859649122806,
|
|
"grad_norm": 9.522716522216797,
|
|
"learning_rate": 8.695652173913044e-06,
|
|
"loss": 0.2891,
|
|
"mean_token_accuracy": 0.9725118279457092,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.17982456140350878,
|
|
"grad_norm": 8.957447052001953,
|
|
"learning_rate": 8.91304347826087e-06,
|
|
"loss": 0.271,
|
|
"mean_token_accuracy": 0.9785056114196777,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.18421052631578946,
|
|
"grad_norm": 8.779328346252441,
|
|
"learning_rate": 9.130434782608697e-06,
|
|
"loss": 0.2596,
|
|
"mean_token_accuracy": 0.9759036302566528,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.18859649122807018,
|
|
"grad_norm": 8.714072227478027,
|
|
"learning_rate": 9.347826086956523e-06,
|
|
"loss": 0.2528,
|
|
"mean_token_accuracy": 0.9750249981880188,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.19298245614035087,
|
|
"grad_norm": 9.57017707824707,
|
|
"learning_rate": 9.565217391304349e-06,
|
|
"loss": 0.2737,
|
|
"mean_token_accuracy": 0.9671794772148132,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.19736842105263158,
|
|
"grad_norm": 8.689032554626465,
|
|
"learning_rate": 9.782608695652175e-06,
|
|
"loss": 0.2213,
|
|
"mean_token_accuracy": 0.9827935099601746,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.20175438596491227,
|
|
"grad_norm": 8.658659934997559,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.2289,
|
|
"mean_token_accuracy": 0.977505087852478,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.20614035087719298,
|
|
"grad_norm": 8.370903015136719,
|
|
"learning_rate": 9.999867897077623e-06,
|
|
"loss": 0.2087,
|
|
"mean_token_accuracy": 0.9791460037231445,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.21052631578947367,
|
|
"grad_norm": 8.627614974975586,
|
|
"learning_rate": 9.999471596066567e-06,
|
|
"loss": 0.2199,
|
|
"mean_token_accuracy": 0.9729458689689636,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.2149122807017544,
|
|
"grad_norm": 8.98874568939209,
|
|
"learning_rate": 9.998811120234624e-06,
|
|
"loss": 0.2204,
|
|
"mean_token_accuracy": 0.9805327653884888,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.21929824561403508,
|
|
"grad_norm": 8.562795639038086,
|
|
"learning_rate": 9.99788650835992e-06,
|
|
"loss": 0.194,
|
|
"mean_token_accuracy": 0.9865702390670776,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.2236842105263158,
|
|
"grad_norm": 8.354143142700195,
|
|
"learning_rate": 9.996697814728646e-06,
|
|
"loss": 0.2058,
|
|
"mean_token_accuracy": 0.9749276638031006,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.22807017543859648,
|
|
"grad_norm": 8.861373901367188,
|
|
"learning_rate": 9.99524510913187e-06,
|
|
"loss": 0.1785,
|
|
"mean_token_accuracy": 0.9775280952453613,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.2324561403508772,
|
|
"grad_norm": 8.467371940612793,
|
|
"learning_rate": 9.99352847686144e-06,
|
|
"loss": 0.1753,
|
|
"mean_token_accuracy": 0.9784172773361206,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.23684210526315788,
|
|
"grad_norm": 9.050318717956543,
|
|
"learning_rate": 9.991548018704971e-06,
|
|
"loss": 0.181,
|
|
"mean_token_accuracy": 0.9764453768730164,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.2412280701754386,
|
|
"grad_norm": 8.318038940429688,
|
|
"learning_rate": 9.989303850939937e-06,
|
|
"loss": 0.1424,
|
|
"mean_token_accuracy": 0.9843096137046814,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.24561403508771928,
|
|
"grad_norm": 8.480911254882812,
|
|
"learning_rate": 9.986796105326832e-06,
|
|
"loss": 0.1465,
|
|
"mean_token_accuracy": 0.9778246879577637,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 7.978242874145508,
|
|
"learning_rate": 9.98402492910145e-06,
|
|
"loss": 0.1582,
|
|
"mean_token_accuracy": 0.971230149269104,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.2543859649122807,
|
|
"grad_norm": 7.869690895080566,
|
|
"learning_rate": 9.98099048496622e-06,
|
|
"loss": 0.1625,
|
|
"mean_token_accuracy": 0.9744094610214233,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.25877192982456143,
|
|
"grad_norm": 7.296746253967285,
|
|
"learning_rate": 9.977692951080673e-06,
|
|
"loss": 0.0961,
|
|
"mean_token_accuracy": 0.9874759316444397,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.2631578947368421,
|
|
"grad_norm": 6.89448881149292,
|
|
"learning_rate": 9.97413252105097e-06,
|
|
"loss": 0.1193,
|
|
"mean_token_accuracy": 0.9717624187469482,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.2675438596491228,
|
|
"grad_norm": 6.936161994934082,
|
|
"learning_rate": 9.970309403918538e-06,
|
|
"loss": 0.1294,
|
|
"mean_token_accuracy": 0.9783037304878235,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.2719298245614035,
|
|
"grad_norm": 6.3501715660095215,
|
|
"learning_rate": 9.966223824147798e-06,
|
|
"loss": 0.1022,
|
|
"mean_token_accuracy": 0.9817629456520081,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.27631578947368424,
|
|
"grad_norm": 6.021699905395508,
|
|
"learning_rate": 9.961876021612984e-06,
|
|
"loss": 0.0861,
|
|
"mean_token_accuracy": 0.9848331809043884,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.2807017543859649,
|
|
"grad_norm": 5.777260780334473,
|
|
"learning_rate": 9.957266251584061e-06,
|
|
"loss": 0.0754,
|
|
"mean_token_accuracy": 0.9884817004203796,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.2850877192982456,
|
|
"grad_norm": 5.9405131340026855,
|
|
"learning_rate": 9.952394784711736e-06,
|
|
"loss": 0.1364,
|
|
"mean_token_accuracy": 0.9699481725692749,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.2894736842105263,
|
|
"grad_norm": 5.398240566253662,
|
|
"learning_rate": 9.94726190701157e-06,
|
|
"loss": 0.1021,
|
|
"mean_token_accuracy": 0.9772727489471436,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.29385964912280704,
|
|
"grad_norm": 4.3773932456970215,
|
|
"learning_rate": 9.94186791984718e-06,
|
|
"loss": 0.0721,
|
|
"mean_token_accuracy": 0.9819819927215576,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.2982456140350877,
|
|
"grad_norm": 7.893673896789551,
|
|
"learning_rate": 9.936213139912555e-06,
|
|
"loss": 0.0739,
|
|
"mean_token_accuracy": 0.9837892651557922,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.3026315789473684,
|
|
"grad_norm": 3.6643731594085693,
|
|
"learning_rate": 9.930297899213454e-06,
|
|
"loss": 0.0751,
|
|
"mean_token_accuracy": 0.9825462102890015,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.30701754385964913,
|
|
"grad_norm": 3.828979253768921,
|
|
"learning_rate": 9.924122545047908e-06,
|
|
"loss": 0.0995,
|
|
"mean_token_accuracy": 0.9770458936691284,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.31140350877192985,
|
|
"grad_norm": 3.4599835872650146,
|
|
"learning_rate": 9.917687439985848e-06,
|
|
"loss": 0.0729,
|
|
"mean_token_accuracy": 0.981500506401062,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.3157894736842105,
|
|
"grad_norm": 3.9461429119110107,
|
|
"learning_rate": 9.910992961847798e-06,
|
|
"loss": 0.0799,
|
|
"mean_token_accuracy": 0.9785177111625671,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.3201754385964912,
|
|
"grad_norm": 2.959583044052124,
|
|
"learning_rate": 9.904039503682701e-06,
|
|
"loss": 0.0668,
|
|
"mean_token_accuracy": 0.9801192879676819,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.32456140350877194,
|
|
"grad_norm": 3.5721518993377686,
|
|
"learning_rate": 9.896827473744848e-06,
|
|
"loss": 0.0691,
|
|
"mean_token_accuracy": 0.9830328822135925,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.32894736842105265,
|
|
"grad_norm": 4.679792404174805,
|
|
"learning_rate": 9.889357295469893e-06,
|
|
"loss": 0.0941,
|
|
"mean_token_accuracy": 0.9726997017860413,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 3.811170816421509,
|
|
"learning_rate": 9.881629407450007e-06,
|
|
"loss": 0.1065,
|
|
"mean_token_accuracy": 0.9740259647369385,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.33771929824561403,
|
|
"grad_norm": 2.699470281600952,
|
|
"learning_rate": 9.873644263408119e-06,
|
|
"loss": 0.0468,
|
|
"mean_token_accuracy": 0.9821228981018066,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.34210526315789475,
|
|
"grad_norm": 2.773603677749634,
|
|
"learning_rate": 9.86540233217128e-06,
|
|
"loss": 0.0804,
|
|
"mean_token_accuracy": 0.9749518036842346,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.34649122807017546,
|
|
"grad_norm": 2.799196243286133,
|
|
"learning_rate": 9.856904097643136e-06,
|
|
"loss": 0.0758,
|
|
"mean_token_accuracy": 0.9768844246864319,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.3508771929824561,
|
|
"grad_norm": 3.8493893146514893,
|
|
"learning_rate": 9.848150058775514e-06,
|
|
"loss": 0.0876,
|
|
"mean_token_accuracy": 0.9722222089767456,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.35526315789473684,
|
|
"grad_norm": 2.733839273452759,
|
|
"learning_rate": 9.839140729539135e-06,
|
|
"loss": 0.0707,
|
|
"mean_token_accuracy": 0.9825102686882019,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.35964912280701755,
|
|
"grad_norm": 2.1981351375579834,
|
|
"learning_rate": 9.829876638893432e-06,
|
|
"loss": 0.041,
|
|
"mean_token_accuracy": 0.9840085506439209,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.36403508771929827,
|
|
"grad_norm": 2.9656527042388916,
|
|
"learning_rate": 9.820358330755487e-06,
|
|
"loss": 0.0606,
|
|
"mean_token_accuracy": 0.97921222448349,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.3684210526315789,
|
|
"grad_norm": 4.108867645263672,
|
|
"learning_rate": 9.810586363968115e-06,
|
|
"loss": 0.084,
|
|
"mean_token_accuracy": 0.9738767147064209,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.37280701754385964,
|
|
"grad_norm": 3.470015048980713,
|
|
"learning_rate": 9.800561312267033e-06,
|
|
"loss": 0.0822,
|
|
"mean_token_accuracy": 0.9793713092803955,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.37719298245614036,
|
|
"grad_norm": 3.3155014514923096,
|
|
"learning_rate": 9.790283764247188e-06,
|
|
"loss": 0.0689,
|
|
"mean_token_accuracy": 0.9789999723434448,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.3815789473684211,
|
|
"grad_norm": 3.8658089637756348,
|
|
"learning_rate": 9.779754323328192e-06,
|
|
"loss": 0.0872,
|
|
"mean_token_accuracy": 0.9719334840774536,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.38596491228070173,
|
|
"grad_norm": 3.226743459701538,
|
|
"learning_rate": 9.768973607718896e-06,
|
|
"loss": 0.095,
|
|
"mean_token_accuracy": 0.9710578918457031,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.39035087719298245,
|
|
"grad_norm": 3.8896260261535645,
|
|
"learning_rate": 9.757942250381094e-06,
|
|
"loss": 0.0723,
|
|
"mean_token_accuracy": 0.977412760257721,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.39473684210526316,
|
|
"grad_norm": 3.5911498069763184,
|
|
"learning_rate": 9.746660898992362e-06,
|
|
"loss": 0.0869,
|
|
"mean_token_accuracy": 0.9721115827560425,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.3991228070175439,
|
|
"grad_norm": 2.7276499271392822,
|
|
"learning_rate": 9.735130215908027e-06,
|
|
"loss": 0.0732,
|
|
"mean_token_accuracy": 0.9744848012924194,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.40350877192982454,
|
|
"grad_norm": 3.3096349239349365,
|
|
"learning_rate": 9.723350878122283e-06,
|
|
"loss": 0.0701,
|
|
"mean_token_accuracy": 0.9788944721221924,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.40789473684210525,
|
|
"grad_norm": 2.4994590282440186,
|
|
"learning_rate": 9.711323577228433e-06,
|
|
"loss": 0.0917,
|
|
"mean_token_accuracy": 0.9757575988769531,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.41228070175438597,
|
|
"grad_norm": 3.2870211601257324,
|
|
"learning_rate": 9.699049019378303e-06,
|
|
"loss": 0.0891,
|
|
"mean_token_accuracy": 0.9718719720840454,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.4166666666666667,
|
|
"grad_norm": 2.1593267917633057,
|
|
"learning_rate": 9.686527925240763e-06,
|
|
"loss": 0.0488,
|
|
"mean_token_accuracy": 0.9822263717651367,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.42105263157894735,
|
|
"grad_norm": 2.629718542098999,
|
|
"learning_rate": 9.673761029959427e-06,
|
|
"loss": 0.0504,
|
|
"mean_token_accuracy": 0.9824742078781128,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.42543859649122806,
|
|
"grad_norm": 1.9966559410095215,
|
|
"learning_rate": 9.660749083109483e-06,
|
|
"loss": 0.0637,
|
|
"mean_token_accuracy": 0.976313054561615,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.4298245614035088,
|
|
"grad_norm": 2.178070545196533,
|
|
"learning_rate": 9.647492848653689e-06,
|
|
"loss": 0.0437,
|
|
"mean_token_accuracy": 0.9812304377555847,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.4342105263157895,
|
|
"grad_norm": 2.226331949234009,
|
|
"learning_rate": 9.633993104897516e-06,
|
|
"loss": 0.0525,
|
|
"mean_token_accuracy": 0.9882352948188782,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.43859649122807015,
|
|
"grad_norm": 3.1283533573150635,
|
|
"learning_rate": 9.620250644443454e-06,
|
|
"loss": 0.0688,
|
|
"mean_token_accuracy": 0.9809809923171997,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.44298245614035087,
|
|
"grad_norm": 2.914991855621338,
|
|
"learning_rate": 9.606266274144475e-06,
|
|
"loss": 0.067,
|
|
"mean_token_accuracy": 0.9824561476707458,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.4473684210526316,
|
|
"grad_norm": 2.58962082862854,
|
|
"learning_rate": 9.592040815056662e-06,
|
|
"loss": 0.0503,
|
|
"mean_token_accuracy": 0.9818181991577148,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.4517543859649123,
|
|
"grad_norm": 2.361135959625244,
|
|
"learning_rate": 9.577575102390999e-06,
|
|
"loss": 0.0605,
|
|
"mean_token_accuracy": 0.9817444086074829,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.45614035087719296,
|
|
"grad_norm": 2.864607334136963,
|
|
"learning_rate": 9.562869985464341e-06,
|
|
"loss": 0.0812,
|
|
"mean_token_accuracy": 0.9810924530029297,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.4605263157894737,
|
|
"grad_norm": 3.783154249191284,
|
|
"learning_rate": 9.547926327649535e-06,
|
|
"loss": 0.1249,
|
|
"mean_token_accuracy": 0.9709419012069702,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.4649122807017544,
|
|
"grad_norm": 2.7710580825805664,
|
|
"learning_rate": 9.53274500632475e-06,
|
|
"loss": 0.0613,
|
|
"mean_token_accuracy": 0.98591548204422,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.4692982456140351,
|
|
"grad_norm": 2.278409004211426,
|
|
"learning_rate": 9.517326912821948e-06,
|
|
"loss": 0.0571,
|
|
"mean_token_accuracy": 0.9854369163513184,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.47368421052631576,
|
|
"grad_norm": 2.6353845596313477,
|
|
"learning_rate": 9.501672952374551e-06,
|
|
"loss": 0.0494,
|
|
"mean_token_accuracy": 0.9879253506660461,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.4780701754385965,
|
|
"grad_norm": 2.9735920429229736,
|
|
"learning_rate": 9.485784044064305e-06,
|
|
"loss": 0.0702,
|
|
"mean_token_accuracy": 0.98103266954422,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.4824561403508772,
|
|
"grad_norm": 2.455169916152954,
|
|
"learning_rate": 9.469661120767308e-06,
|
|
"loss": 0.08,
|
|
"mean_token_accuracy": 0.9808428883552551,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.4868421052631579,
|
|
"grad_norm": 2.6597843170166016,
|
|
"learning_rate": 9.453305129099241e-06,
|
|
"loss": 0.0754,
|
|
"mean_token_accuracy": 0.9796791672706604,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.49122807017543857,
|
|
"grad_norm": 2.5103378295898438,
|
|
"learning_rate": 9.436717029359794e-06,
|
|
"loss": 0.0408,
|
|
"mean_token_accuracy": 0.9863013625144958,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.4956140350877193,
|
|
"grad_norm": 2.8557910919189453,
|
|
"learning_rate": 9.419897795476276e-06,
|
|
"loss": 0.0682,
|
|
"mean_token_accuracy": 0.9836065769195557,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.57025146484375,
|
|
"learning_rate": 9.402848414946445e-06,
|
|
"loss": 0.0571,
|
|
"mean_token_accuracy": 0.9806763529777527,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.5043859649122807,
|
|
"grad_norm": 2.4798214435577393,
|
|
"learning_rate": 9.385569888780517e-06,
|
|
"loss": 0.0725,
|
|
"mean_token_accuracy": 0.9782823324203491,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.5087719298245614,
|
|
"grad_norm": 3.3407113552093506,
|
|
"learning_rate": 9.368063231442406e-06,
|
|
"loss": 0.0828,
|
|
"mean_token_accuracy": 0.9743863344192505,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.5131578947368421,
|
|
"grad_norm": 2.729813814163208,
|
|
"learning_rate": 9.350329470790153e-06,
|
|
"loss": 0.0752,
|
|
"mean_token_accuracy": 0.98037189245224,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.5175438596491229,
|
|
"grad_norm": 2.1768548488616943,
|
|
"learning_rate": 9.332369648015583e-06,
|
|
"loss": 0.0566,
|
|
"mean_token_accuracy": 0.9804727435112,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.5219298245614035,
|
|
"grad_norm": 2.738077163696289,
|
|
"learning_rate": 9.314184817583176e-06,
|
|
"loss": 0.0873,
|
|
"mean_token_accuracy": 0.9718456864356995,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.5263157894736842,
|
|
"grad_norm": 2.769399404525757,
|
|
"learning_rate": 9.295776047168149e-06,
|
|
"loss": 0.051,
|
|
"mean_token_accuracy": 0.984329104423523,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.5307017543859649,
|
|
"grad_norm": 2.4384264945983887,
|
|
"learning_rate": 9.277144417593777e-06,
|
|
"loss": 0.056,
|
|
"mean_token_accuracy": 0.985029935836792,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.5350877192982456,
|
|
"grad_norm": 1.7621722221374512,
|
|
"learning_rate": 9.258291022767932e-06,
|
|
"loss": 0.0445,
|
|
"mean_token_accuracy": 0.9854318499565125,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.5394736842105263,
|
|
"grad_norm": 1.8469805717468262,
|
|
"learning_rate": 9.239216969618862e-06,
|
|
"loss": 0.0338,
|
|
"mean_token_accuracy": 0.9893719553947449,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.543859649122807,
|
|
"grad_norm": 1.7815845012664795,
|
|
"learning_rate": 9.219923378030197e-06,
|
|
"loss": 0.0531,
|
|
"mean_token_accuracy": 0.9894737005233765,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.5482456140350878,
|
|
"grad_norm": 1.8177460432052612,
|
|
"learning_rate": 9.200411380775192e-06,
|
|
"loss": 0.0463,
|
|
"mean_token_accuracy": 0.9866529703140259,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.5526315789473685,
|
|
"grad_norm": 1.940299153327942,
|
|
"learning_rate": 9.180682123450232e-06,
|
|
"loss": 0.0294,
|
|
"mean_token_accuracy": 0.9908722043037415,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.5570175438596491,
|
|
"grad_norm": 2.27999210357666,
|
|
"learning_rate": 9.160736764407555e-06,
|
|
"loss": 0.0511,
|
|
"mean_token_accuracy": 0.9812623262405396,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.5614035087719298,
|
|
"grad_norm": 1.8023228645324707,
|
|
"learning_rate": 9.140576474687263e-06,
|
|
"loss": 0.0373,
|
|
"mean_token_accuracy": 0.9895506501197815,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.5657894736842105,
|
|
"grad_norm": 2.3387715816497803,
|
|
"learning_rate": 9.120202437948551e-06,
|
|
"loss": 0.0406,
|
|
"mean_token_accuracy": 0.9860514998435974,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.5701754385964912,
|
|
"grad_norm": 1.9008742570877075,
|
|
"learning_rate": 9.099615850400214e-06,
|
|
"loss": 0.0484,
|
|
"mean_token_accuracy": 0.9894737005233765,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.5745614035087719,
|
|
"grad_norm": 2.219818353652954,
|
|
"learning_rate": 9.078817920730421e-06,
|
|
"loss": 0.0492,
|
|
"mean_token_accuracy": 0.9812304377555847,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.5789473684210527,
|
|
"grad_norm": 2.4068856239318848,
|
|
"learning_rate": 9.057809870035743e-06,
|
|
"loss": 0.0622,
|
|
"mean_token_accuracy": 0.9838362336158752,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.5833333333333334,
|
|
"grad_norm": 3.42411470413208,
|
|
"learning_rate": 9.036592931749463e-06,
|
|
"loss": 0.0811,
|
|
"mean_token_accuracy": 0.9856262803077698,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.5877192982456141,
|
|
"grad_norm": 2.5055322647094727,
|
|
"learning_rate": 9.015168351569165e-06,
|
|
"loss": 0.0614,
|
|
"mean_token_accuracy": 0.9802761077880859,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.5921052631578947,
|
|
"grad_norm": 2.5999653339385986,
|
|
"learning_rate": 8.993537387383579e-06,
|
|
"loss": 0.0571,
|
|
"mean_token_accuracy": 0.9836065769195557,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.5964912280701754,
|
|
"grad_norm": 1.6160295009613037,
|
|
"learning_rate": 8.971701309198744e-06,
|
|
"loss": 0.0345,
|
|
"mean_token_accuracy": 0.9874739050865173,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.6008771929824561,
|
|
"grad_norm": 1.483092188835144,
|
|
"learning_rate": 8.949661399063432e-06,
|
|
"loss": 0.026,
|
|
"mean_token_accuracy": 0.9906445145606995,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.6052631578947368,
|
|
"grad_norm": 2.4867641925811768,
|
|
"learning_rate": 8.927418950993885e-06,
|
|
"loss": 0.0674,
|
|
"mean_token_accuracy": 0.9795719981193542,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.6096491228070176,
|
|
"grad_norm": 2.1540322303771973,
|
|
"learning_rate": 8.90497527089783e-06,
|
|
"loss": 0.0647,
|
|
"mean_token_accuracy": 0.980079710483551,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.6140350877192983,
|
|
"grad_norm": 2.6197245121002197,
|
|
"learning_rate": 8.882331676497813e-06,
|
|
"loss": 0.0803,
|
|
"mean_token_accuracy": 0.9824561476707458,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.618421052631579,
|
|
"grad_norm": 3.1673412322998047,
|
|
"learning_rate": 8.859489497253833e-06,
|
|
"loss": 0.0944,
|
|
"mean_token_accuracy": 0.9688473343849182,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.6228070175438597,
|
|
"grad_norm": 3.3862664699554443,
|
|
"learning_rate": 8.83645007428528e-06,
|
|
"loss": 0.0543,
|
|
"mean_token_accuracy": 0.9792576432228088,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.6271929824561403,
|
|
"grad_norm": 2.722994327545166,
|
|
"learning_rate": 8.813214760292202e-06,
|
|
"loss": 0.0618,
|
|
"mean_token_accuracy": 0.9790794849395752,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.631578947368421,
|
|
"grad_norm": 2.353830337524414,
|
|
"learning_rate": 8.789784919475878e-06,
|
|
"loss": 0.0628,
|
|
"mean_token_accuracy": 0.9831013679504395,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.6359649122807017,
|
|
"grad_norm": 2.3788843154907227,
|
|
"learning_rate": 8.766161927458726e-06,
|
|
"loss": 0.0523,
|
|
"mean_token_accuracy": 0.9803729057312012,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.6403508771929824,
|
|
"grad_norm": 2.782294750213623,
|
|
"learning_rate": 8.742347171203542e-06,
|
|
"loss": 0.0635,
|
|
"mean_token_accuracy": 0.9749739170074463,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.6447368421052632,
|
|
"grad_norm": 1.9423679113388062,
|
|
"learning_rate": 8.718342048932054e-06,
|
|
"loss": 0.0422,
|
|
"mean_token_accuracy": 0.9858012199401855,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.6491228070175439,
|
|
"grad_norm": 2.4177629947662354,
|
|
"learning_rate": 8.694147970042842e-06,
|
|
"loss": 0.0492,
|
|
"mean_token_accuracy": 0.9847250580787659,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.6535087719298246,
|
|
"grad_norm": 3.1622345447540283,
|
|
"learning_rate": 8.669766355028584e-06,
|
|
"loss": 0.07,
|
|
"mean_token_accuracy": 0.9742063283920288,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.6578947368421053,
|
|
"grad_norm": 2.079878330230713,
|
|
"learning_rate": 8.645198635392659e-06,
|
|
"loss": 0.0496,
|
|
"mean_token_accuracy": 0.9880715608596802,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.6622807017543859,
|
|
"grad_norm": 1.5153844356536865,
|
|
"learning_rate": 8.620446253565088e-06,
|
|
"loss": 0.0359,
|
|
"mean_token_accuracy": 0.9877049326896667,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 2.652635335922241,
|
|
"learning_rate": 8.595510662817865e-06,
|
|
"loss": 0.0743,
|
|
"mean_token_accuracy": 0.9824368953704834,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.6710526315789473,
|
|
"grad_norm": 1.8495056629180908,
|
|
"learning_rate": 8.570393327179614e-06,
|
|
"loss": 0.0389,
|
|
"mean_token_accuracy": 0.9873816967010498,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.6754385964912281,
|
|
"grad_norm": 2.0945048332214355,
|
|
"learning_rate": 8.545095721349641e-06,
|
|
"loss": 0.0464,
|
|
"mean_token_accuracy": 0.9847763776779175,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.6798245614035088,
|
|
"grad_norm": 1.8139218091964722,
|
|
"learning_rate": 8.519619330611353e-06,
|
|
"loss": 0.055,
|
|
"mean_token_accuracy": 0.9879153966903687,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.6842105263157895,
|
|
"grad_norm": 1.7457425594329834,
|
|
"learning_rate": 8.493965650745043e-06,
|
|
"loss": 0.0367,
|
|
"mean_token_accuracy": 0.9838547110557556,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.6885964912280702,
|
|
"grad_norm": 2.1221344470977783,
|
|
"learning_rate": 8.468136187940087e-06,
|
|
"loss": 0.0479,
|
|
"mean_token_accuracy": 0.9841938614845276,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.6929824561403509,
|
|
"grad_norm": 1.792470932006836,
|
|
"learning_rate": 8.442132458706484e-06,
|
|
"loss": 0.0418,
|
|
"mean_token_accuracy": 0.9871794581413269,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.6973684210526315,
|
|
"grad_norm": 2.0108189582824707,
|
|
"learning_rate": 8.415955989785852e-06,
|
|
"loss": 0.0651,
|
|
"mean_token_accuracy": 0.9837563633918762,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.7017543859649122,
|
|
"grad_norm": 3.3567967414855957,
|
|
"learning_rate": 8.389608318061761e-06,
|
|
"loss": 0.0594,
|
|
"mean_token_accuracy": 0.9821615815162659,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.706140350877193,
|
|
"grad_norm": 1.998265027999878,
|
|
"learning_rate": 8.36309099046952e-06,
|
|
"loss": 0.0469,
|
|
"mean_token_accuracy": 0.9834024906158447,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.7105263157894737,
|
|
"grad_norm": 1.778838038444519,
|
|
"learning_rate": 8.336405563905333e-06,
|
|
"loss": 0.0467,
|
|
"mean_token_accuracy": 0.9862475395202637,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.7149122807017544,
|
|
"grad_norm": 2.1717565059661865,
|
|
"learning_rate": 8.309553605134904e-06,
|
|
"loss": 0.0599,
|
|
"mean_token_accuracy": 0.9841112494468689,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.7192982456140351,
|
|
"grad_norm": 1.5761713981628418,
|
|
"learning_rate": 8.282536690701446e-06,
|
|
"loss": 0.0368,
|
|
"mean_token_accuracy": 0.9864583611488342,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.7236842105263158,
|
|
"grad_norm": 2.007516384124756,
|
|
"learning_rate": 8.25535640683311e-06,
|
|
"loss": 0.0528,
|
|
"mean_token_accuracy": 0.9837398529052734,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.7280701754385965,
|
|
"grad_norm": 2.010671615600586,
|
|
"learning_rate": 8.228014349349872e-06,
|
|
"loss": 0.0387,
|
|
"mean_token_accuracy": 0.9889112710952759,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.7324561403508771,
|
|
"grad_norm": 2.147995710372925,
|
|
"learning_rate": 8.200512123569817e-06,
|
|
"loss": 0.0697,
|
|
"mean_token_accuracy": 0.9813725352287292,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.7368421052631579,
|
|
"grad_norm": 1.8654723167419434,
|
|
"learning_rate": 8.172851344214896e-06,
|
|
"loss": 0.0355,
|
|
"mean_token_accuracy": 0.9886947870254517,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.7412280701754386,
|
|
"grad_norm": 1.949415922164917,
|
|
"learning_rate": 8.14503363531613e-06,
|
|
"loss": 0.0425,
|
|
"mean_token_accuracy": 0.9887295365333557,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.7456140350877193,
|
|
"grad_norm": 2.284515857696533,
|
|
"learning_rate": 8.117060630118246e-06,
|
|
"loss": 0.0708,
|
|
"mean_token_accuracy": 0.981500506401062,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 1.6988317966461182,
|
|
"learning_rate": 8.088933970983793e-06,
|
|
"loss": 0.0411,
|
|
"mean_token_accuracy": 0.9900442361831665,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.7543859649122807,
|
|
"grad_norm": 2.115851640701294,
|
|
"learning_rate": 8.060655309296712e-06,
|
|
"loss": 0.0756,
|
|
"mean_token_accuracy": 0.9807886481285095,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.7587719298245614,
|
|
"grad_norm": 1.499506950378418,
|
|
"learning_rate": 8.032226305365383e-06,
|
|
"loss": 0.0352,
|
|
"mean_token_accuracy": 0.9842436909675598,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.7631578947368421,
|
|
"grad_norm": 1.6798099279403687,
|
|
"learning_rate": 8.003648628325136e-06,
|
|
"loss": 0.0349,
|
|
"mean_token_accuracy": 0.988095223903656,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.7675438596491229,
|
|
"grad_norm": 2.1273345947265625,
|
|
"learning_rate": 7.974923956040262e-06,
|
|
"loss": 0.0552,
|
|
"mean_token_accuracy": 0.9790836572647095,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.7719298245614035,
|
|
"grad_norm": 2.9694130420684814,
|
|
"learning_rate": 7.946053975005495e-06,
|
|
"loss": 0.0586,
|
|
"mean_token_accuracy": 0.9815950989723206,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.7763157894736842,
|
|
"grad_norm": 1.9922927618026733,
|
|
"learning_rate": 7.917040380247e-06,
|
|
"loss": 0.0384,
|
|
"mean_token_accuracy": 0.9907881021499634,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.7807017543859649,
|
|
"grad_norm": 1.1386786699295044,
|
|
"learning_rate": 7.887884875222841e-06,
|
|
"loss": 0.0269,
|
|
"mean_token_accuracy": 0.9908906817436218,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.7850877192982456,
|
|
"grad_norm": 2.196960687637329,
|
|
"learning_rate": 7.858589171722985e-06,
|
|
"loss": 0.044,
|
|
"mean_token_accuracy": 0.9834437370300293,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.7894736842105263,
|
|
"grad_norm": 1.7304738759994507,
|
|
"learning_rate": 7.829154989768784e-06,
|
|
"loss": 0.0344,
|
|
"mean_token_accuracy": 0.987590491771698,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.793859649122807,
|
|
"grad_norm": 2.542288303375244,
|
|
"learning_rate": 7.799584057511997e-06,
|
|
"loss": 0.049,
|
|
"mean_token_accuracy": 0.9837892651557922,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.7982456140350878,
|
|
"grad_norm": 1.137778639793396,
|
|
"learning_rate": 7.76987811113332e-06,
|
|
"loss": 0.0261,
|
|
"mean_token_accuracy": 0.9931372404098511,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.8026315789473685,
|
|
"grad_norm": 1.5463237762451172,
|
|
"learning_rate": 7.740038894740454e-06,
|
|
"loss": 0.036,
|
|
"mean_token_accuracy": 0.9913138151168823,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.8070175438596491,
|
|
"grad_norm": 2.4163358211517334,
|
|
"learning_rate": 7.710068160265705e-06,
|
|
"loss": 0.066,
|
|
"mean_token_accuracy": 0.9861407279968262,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.8114035087719298,
|
|
"grad_norm": 1.2295856475830078,
|
|
"learning_rate": 7.679967667363121e-06,
|
|
"loss": 0.0263,
|
|
"mean_token_accuracy": 0.9938461780548096,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.8157894736842105,
|
|
"grad_norm": 2.1350033283233643,
|
|
"learning_rate": 7.649739183305184e-06,
|
|
"loss": 0.0648,
|
|
"mean_token_accuracy": 0.9821073412895203,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.8201754385964912,
|
|
"grad_norm": 1.8890544176101685,
|
|
"learning_rate": 7.619384482879039e-06,
|
|
"loss": 0.039,
|
|
"mean_token_accuracy": 0.991769552230835,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.8245614035087719,
|
|
"grad_norm": 1.8316863775253296,
|
|
"learning_rate": 7.5889053482823015e-06,
|
|
"loss": 0.0364,
|
|
"mean_token_accuracy": 0.9871931672096252,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.8289473684210527,
|
|
"grad_norm": 1.6888874769210815,
|
|
"learning_rate": 7.558303569018417e-06,
|
|
"loss": 0.0373,
|
|
"mean_token_accuracy": 0.9896265268325806,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 2.052993059158325,
|
|
"learning_rate": 7.527580941791595e-06,
|
|
"loss": 0.0588,
|
|
"mean_token_accuracy": 0.9858155846595764,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.8377192982456141,
|
|
"grad_norm": 1.7255383729934692,
|
|
"learning_rate": 7.49673927040132e-06,
|
|
"loss": 0.0362,
|
|
"mean_token_accuracy": 0.9906250238418579,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.8421052631578947,
|
|
"grad_norm": 1.6616889238357544,
|
|
"learning_rate": 7.465780365636445e-06,
|
|
"loss": 0.0391,
|
|
"mean_token_accuracy": 0.9904153347015381,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.8464912280701754,
|
|
"grad_norm": 1.6150552034378052,
|
|
"learning_rate": 7.4347060451688805e-06,
|
|
"loss": 0.0328,
|
|
"mean_token_accuracy": 0.9899193644523621,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.8508771929824561,
|
|
"grad_norm": 1.7972464561462402,
|
|
"learning_rate": 7.403518133446866e-06,
|
|
"loss": 0.0631,
|
|
"mean_token_accuracy": 0.9849849939346313,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.8552631578947368,
|
|
"grad_norm": 2.3736069202423096,
|
|
"learning_rate": 7.37221846158786e-06,
|
|
"loss": 0.0716,
|
|
"mean_token_accuracy": 0.9841740727424622,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.8596491228070176,
|
|
"grad_norm": 1.8168158531188965,
|
|
"learning_rate": 7.340808867271031e-06,
|
|
"loss": 0.0572,
|
|
"mean_token_accuracy": 0.9852786660194397,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.8640350877192983,
|
|
"grad_norm": 2.351414203643799,
|
|
"learning_rate": 7.309291194629352e-06,
|
|
"loss": 0.0585,
|
|
"mean_token_accuracy": 0.9844617247581482,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.868421052631579,
|
|
"grad_norm": 1.9125292301177979,
|
|
"learning_rate": 7.277667294141345e-06,
|
|
"loss": 0.0449,
|
|
"mean_token_accuracy": 0.9858443140983582,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.8728070175438597,
|
|
"grad_norm": 1.656516194343567,
|
|
"learning_rate": 7.245939022522413e-06,
|
|
"loss": 0.0377,
|
|
"mean_token_accuracy": 0.9922394752502441,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.8771929824561403,
|
|
"grad_norm": 1.8018559217453003,
|
|
"learning_rate": 7.214108242615852e-06,
|
|
"loss": 0.0479,
|
|
"mean_token_accuracy": 0.9854227304458618,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.881578947368421,
|
|
"grad_norm": 2.188966989517212,
|
|
"learning_rate": 7.1821768232834595e-06,
|
|
"loss": 0.0439,
|
|
"mean_token_accuracy": 0.9846153855323792,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.8859649122807017,
|
|
"grad_norm": 2.883554697036743,
|
|
"learning_rate": 7.150146639295816e-06,
|
|
"loss": 0.0622,
|
|
"mean_token_accuracy": 0.9843912720680237,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.8903508771929824,
|
|
"grad_norm": 2.633408546447754,
|
|
"learning_rate": 7.118019571222216e-06,
|
|
"loss": 0.0701,
|
|
"mean_token_accuracy": 0.9815950989723206,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.8947368421052632,
|
|
"grad_norm": 2.7991299629211426,
|
|
"learning_rate": 7.0857975053202485e-06,
|
|
"loss": 0.0405,
|
|
"mean_token_accuracy": 0.9858155846595764,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.8991228070175439,
|
|
"grad_norm": 1.856713056564331,
|
|
"learning_rate": 7.053482333425057e-06,
|
|
"loss": 0.0295,
|
|
"mean_token_accuracy": 0.9893048405647278,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.9035087719298246,
|
|
"grad_norm": 2.318593740463257,
|
|
"learning_rate": 7.021075952838262e-06,
|
|
"loss": 0.0622,
|
|
"mean_token_accuracy": 0.9828282594680786,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.9078947368421053,
|
|
"grad_norm": 1.599013090133667,
|
|
"learning_rate": 6.988580266216566e-06,
|
|
"loss": 0.0302,
|
|
"mean_token_accuracy": 0.9893513917922974,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.9122807017543859,
|
|
"grad_norm": 3.1583027839660645,
|
|
"learning_rate": 6.955997181460041e-06,
|
|
"loss": 0.077,
|
|
"mean_token_accuracy": 0.9788732528686523,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.9166666666666666,
|
|
"grad_norm": 2.7289540767669678,
|
|
"learning_rate": 6.9233286116001194e-06,
|
|
"loss": 0.0492,
|
|
"mean_token_accuracy": 0.9878910183906555,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.9210526315789473,
|
|
"grad_norm": 3.2893893718719482,
|
|
"learning_rate": 6.890576474687264e-06,
|
|
"loss": 0.0353,
|
|
"mean_token_accuracy": 0.9867549538612366,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.9254385964912281,
|
|
"grad_norm": 1.6859092712402344,
|
|
"learning_rate": 6.857742693678367e-06,
|
|
"loss": 0.0275,
|
|
"mean_token_accuracy": 0.9899193644523621,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.9298245614035088,
|
|
"grad_norm": 4.221460342407227,
|
|
"learning_rate": 6.824829196323836e-06,
|
|
"loss": 0.0779,
|
|
"mean_token_accuracy": 0.9783315062522888,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.9342105263157895,
|
|
"grad_norm": 2.7743587493896484,
|
|
"learning_rate": 6.791837915054422e-06,
|
|
"loss": 0.0534,
|
|
"mean_token_accuracy": 0.9824561476707458,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.9385964912280702,
|
|
"grad_norm": 2.1949446201324463,
|
|
"learning_rate": 6.7587707868677566e-06,
|
|
"loss": 0.0397,
|
|
"mean_token_accuracy": 0.9856997132301331,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.9429824561403509,
|
|
"grad_norm": 1.6603456735610962,
|
|
"learning_rate": 6.725629753214624e-06,
|
|
"loss": 0.0382,
|
|
"mean_token_accuracy": 0.99170982837677,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.9473684210526315,
|
|
"grad_norm": 1.5715985298156738,
|
|
"learning_rate": 6.692416759884978e-06,
|
|
"loss": 0.0368,
|
|
"mean_token_accuracy": 0.9817258715629578,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.9517543859649122,
|
|
"grad_norm": 1.9068866968154907,
|
|
"learning_rate": 6.659133756893701e-06,
|
|
"loss": 0.0418,
|
|
"mean_token_accuracy": 0.9910537004470825,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.956140350877193,
|
|
"grad_norm": 1.595156192779541,
|
|
"learning_rate": 6.6257826983661044e-06,
|
|
"loss": 0.0243,
|
|
"mean_token_accuracy": 0.99028080701828,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.9605263157894737,
|
|
"grad_norm": 2.257960557937622,
|
|
"learning_rate": 6.592365542423213e-06,
|
|
"loss": 0.0546,
|
|
"mean_token_accuracy": 0.9854227304458618,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.9649122807017544,
|
|
"grad_norm": 1.503311276435852,
|
|
"learning_rate": 6.558884251066784e-06,
|
|
"loss": 0.0379,
|
|
"mean_token_accuracy": 0.9878172874450684,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.9692982456140351,
|
|
"grad_norm": 1.5574872493743896,
|
|
"learning_rate": 6.5253407900641195e-06,
|
|
"loss": 0.0304,
|
|
"mean_token_accuracy": 0.9918946027755737,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.9736842105263158,
|
|
"grad_norm": 1.161798357963562,
|
|
"learning_rate": 6.4917371288326554e-06,
|
|
"loss": 0.0259,
|
|
"mean_token_accuracy": 0.9921962022781372,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.9780701754385965,
|
|
"grad_norm": 1.8784377574920654,
|
|
"learning_rate": 6.458075240324324e-06,
|
|
"loss": 0.0419,
|
|
"mean_token_accuracy": 0.9917780160903931,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.9824561403508771,
|
|
"grad_norm": 1.6808606386184692,
|
|
"learning_rate": 6.424357100909724e-06,
|
|
"loss": 0.0448,
|
|
"mean_token_accuracy": 0.988095223903656,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.9868421052631579,
|
|
"grad_norm": 1.553889513015747,
|
|
"learning_rate": 6.390584690262079e-06,
|
|
"loss": 0.0274,
|
|
"mean_token_accuracy": 0.9935364723205566,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.9912280701754386,
|
|
"grad_norm": 1.9178175926208496,
|
|
"learning_rate": 6.356759991241008e-06,
|
|
"loss": 0.0525,
|
|
"mean_token_accuracy": 0.9872298836708069,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.9956140350877193,
|
|
"grad_norm": 1.2284913063049316,
|
|
"learning_rate": 6.3228849897761055e-06,
|
|
"loss": 0.0245,
|
|
"mean_token_accuracy": 0.989382266998291,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 3.0383493900299072,
|
|
"learning_rate": 6.288961674750346e-06,
|
|
"loss": 0.0677,
|
|
"mean_token_accuracy": 0.9824380278587341,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 1.0043859649122806,
|
|
"grad_norm": 1.9716832637786865,
|
|
"learning_rate": 6.2549920378833055e-06,
|
|
"loss": 0.0398,
|
|
"mean_token_accuracy": 0.9869346618652344,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 1.0087719298245614,
|
|
"grad_norm": 1.5856348276138306,
|
|
"learning_rate": 6.22097807361423e-06,
|
|
"loss": 0.0206,
|
|
"mean_token_accuracy": 0.9938837885856628,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 1.013157894736842,
|
|
"grad_norm": 1.0828473567962646,
|
|
"learning_rate": 6.186921778984936e-06,
|
|
"loss": 0.017,
|
|
"mean_token_accuracy": 0.9967741966247559,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 1.0175438596491229,
|
|
"grad_norm": 1.3558406829833984,
|
|
"learning_rate": 6.152825153522552e-06,
|
|
"loss": 0.0331,
|
|
"mean_token_accuracy": 0.9902912378311157,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 1.0219298245614035,
|
|
"grad_norm": 1.2174006700515747,
|
|
"learning_rate": 6.118690199122133e-06,
|
|
"loss": 0.0291,
|
|
"mean_token_accuracy": 0.9949392676353455,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 1.0263157894736843,
|
|
"grad_norm": 1.1833233833312988,
|
|
"learning_rate": 6.084518919929112e-06,
|
|
"loss": 0.0186,
|
|
"mean_token_accuracy": 0.9923830032348633,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 1.030701754385965,
|
|
"grad_norm": 1.1958109140396118,
|
|
"learning_rate": 6.050313322221645e-06,
|
|
"loss": 0.0238,
|
|
"mean_token_accuracy": 0.9938016533851624,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 1.0350877192982457,
|
|
"grad_norm": 2.4705111980438232,
|
|
"learning_rate": 6.016075414292804e-06,
|
|
"loss": 0.0591,
|
|
"mean_token_accuracy": 0.9824198484420776,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 1.0394736842105263,
|
|
"grad_norm": 0.6340071558952332,
|
|
"learning_rate": 5.981807206332674e-06,
|
|
"loss": 0.0089,
|
|
"mean_token_accuracy": 0.9979423880577087,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 1.043859649122807,
|
|
"grad_norm": 1.5429046154022217,
|
|
"learning_rate": 5.947510710310332e-06,
|
|
"loss": 0.015,
|
|
"mean_token_accuracy": 0.991919219493866,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 1.0482456140350878,
|
|
"grad_norm": 1.2308100461959839,
|
|
"learning_rate": 5.9131879398557125e-06,
|
|
"loss": 0.0154,
|
|
"mean_token_accuracy": 0.9931707382202148,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 1.0526315789473684,
|
|
"grad_norm": 2.413168430328369,
|
|
"learning_rate": 5.878840910141382e-06,
|
|
"loss": 0.0452,
|
|
"mean_token_accuracy": 0.989130437374115,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.0570175438596492,
|
|
"grad_norm": 1.1270220279693604,
|
|
"learning_rate": 5.844471637764232e-06,
|
|
"loss": 0.0154,
|
|
"mean_token_accuracy": 0.9969262480735779,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 1.0614035087719298,
|
|
"grad_norm": 1.9341720342636108,
|
|
"learning_rate": 5.810082140627069e-06,
|
|
"loss": 0.0201,
|
|
"mean_token_accuracy": 0.9940770268440247,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 1.0657894736842106,
|
|
"grad_norm": 0.9762948155403137,
|
|
"learning_rate": 5.77567443782015e-06,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9935622215270996,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 1.0701754385964912,
|
|
"grad_norm": 1.2798123359680176,
|
|
"learning_rate": 5.7412505495026265e-06,
|
|
"loss": 0.0204,
|
|
"mean_token_accuracy": 0.9909182786941528,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 1.0745614035087718,
|
|
"grad_norm": 1.5217316150665283,
|
|
"learning_rate": 5.70681249678394e-06,
|
|
"loss": 0.0249,
|
|
"mean_token_accuracy": 0.9922480583190918,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 1.0789473684210527,
|
|
"grad_norm": 1.015491008758545,
|
|
"learning_rate": 5.67236230160516e-06,
|
|
"loss": 0.0201,
|
|
"mean_token_accuracy": 0.995854914188385,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 1.0833333333333333,
|
|
"grad_norm": 1.6943317651748657,
|
|
"learning_rate": 5.63790198662027e-06,
|
|
"loss": 0.0299,
|
|
"mean_token_accuracy": 0.9893730282783508,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 1.087719298245614,
|
|
"grad_norm": 1.3557161092758179,
|
|
"learning_rate": 5.6034335750774086e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9929292798042297,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 1.0921052631578947,
|
|
"grad_norm": 1.4643901586532593,
|
|
"learning_rate": 5.568959090700085e-06,
|
|
"loss": 0.0185,
|
|
"mean_token_accuracy": 0.9926701784133911,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 1.0964912280701755,
|
|
"grad_norm": 1.6736350059509277,
|
|
"learning_rate": 5.534480557568358e-06,
|
|
"loss": 0.0438,
|
|
"mean_token_accuracy": 0.9869608879089355,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 1.1008771929824561,
|
|
"grad_norm": 1.204957127571106,
|
|
"learning_rate": 5.500000000000001e-06,
|
|
"loss": 0.0214,
|
|
"mean_token_accuracy": 0.9910891056060791,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 1.1052631578947367,
|
|
"grad_norm": 1.2613508701324463,
|
|
"learning_rate": 5.465519442431644e-06,
|
|
"loss": 0.0209,
|
|
"mean_token_accuracy": 0.9940416812896729,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 1.1096491228070176,
|
|
"grad_norm": 1.2684029340744019,
|
|
"learning_rate": 5.431040909299917e-06,
|
|
"loss": 0.0141,
|
|
"mean_token_accuracy": 0.9940357804298401,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 1.1140350877192982,
|
|
"grad_norm": 1.2532188892364502,
|
|
"learning_rate": 5.3965664249225945e-06,
|
|
"loss": 0.0256,
|
|
"mean_token_accuracy": 0.9927404522895813,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 1.118421052631579,
|
|
"grad_norm": 1.1357946395874023,
|
|
"learning_rate": 5.362098013379732e-06,
|
|
"loss": 0.0198,
|
|
"mean_token_accuracy": 0.9929078221321106,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 1.1228070175438596,
|
|
"grad_norm": 1.2526124715805054,
|
|
"learning_rate": 5.327637698394842e-06,
|
|
"loss": 0.0305,
|
|
"mean_token_accuracy": 0.9918450713157654,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 1.1271929824561404,
|
|
"grad_norm": 1.2404338121414185,
|
|
"learning_rate": 5.293187503216062e-06,
|
|
"loss": 0.0221,
|
|
"mean_token_accuracy": 0.9930555820465088,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 1.131578947368421,
|
|
"grad_norm": 1.745115041732788,
|
|
"learning_rate": 5.258749450497376e-06,
|
|
"loss": 0.0247,
|
|
"mean_token_accuracy": 0.9915878176689148,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 1.1359649122807018,
|
|
"grad_norm": 1.7782152891159058,
|
|
"learning_rate": 5.224325562179852e-06,
|
|
"loss": 0.0403,
|
|
"mean_token_accuracy": 0.9923737049102783,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 1.1403508771929824,
|
|
"grad_norm": 0.9483814835548401,
|
|
"learning_rate": 5.189917859372933e-06,
|
|
"loss": 0.0144,
|
|
"mean_token_accuracy": 0.9959514141082764,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.1447368421052633,
|
|
"grad_norm": 0.9481673240661621,
|
|
"learning_rate": 5.15552836223577e-06,
|
|
"loss": 0.008,
|
|
"mean_token_accuracy": 0.9958333373069763,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 1.1491228070175439,
|
|
"grad_norm": 1.904180884361267,
|
|
"learning_rate": 5.121159089858619e-06,
|
|
"loss": 0.0318,
|
|
"mean_token_accuracy": 0.9882978796958923,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 1.1535087719298245,
|
|
"grad_norm": 1.5921107530593872,
|
|
"learning_rate": 5.08681206014429e-06,
|
|
"loss": 0.0271,
|
|
"mean_token_accuracy": 0.9932692050933838,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 1.1578947368421053,
|
|
"grad_norm": 0.7929293513298035,
|
|
"learning_rate": 5.0524892896896685e-06,
|
|
"loss": 0.0077,
|
|
"mean_token_accuracy": 0.9979695677757263,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 1.162280701754386,
|
|
"grad_norm": 0.8570319414138794,
|
|
"learning_rate": 5.0181927936673265e-06,
|
|
"loss": 0.0132,
|
|
"mean_token_accuracy": 0.9949290156364441,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 1.1666666666666667,
|
|
"grad_norm": 1.7126463651657104,
|
|
"learning_rate": 4.983924585707199e-06,
|
|
"loss": 0.0254,
|
|
"mean_token_accuracy": 0.9899899959564209,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 1.1710526315789473,
|
|
"grad_norm": 1.0722357034683228,
|
|
"learning_rate": 4.949686677778357e-06,
|
|
"loss": 0.0195,
|
|
"mean_token_accuracy": 0.994908332824707,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 1.1754385964912282,
|
|
"grad_norm": 1.97144615650177,
|
|
"learning_rate": 4.915481080070887e-06,
|
|
"loss": 0.0277,
|
|
"mean_token_accuracy": 0.9918864369392395,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 1.1798245614035088,
|
|
"grad_norm": 1.4325839281082153,
|
|
"learning_rate": 4.8813098008778685e-06,
|
|
"loss": 0.0249,
|
|
"mean_token_accuracy": 0.9949135184288025,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 1.1842105263157894,
|
|
"grad_norm": 1.6865849494934082,
|
|
"learning_rate": 4.847174846477448e-06,
|
|
"loss": 0.024,
|
|
"mean_token_accuracy": 0.9917440414428711,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 1.1885964912280702,
|
|
"grad_norm": 1.0741541385650635,
|
|
"learning_rate": 4.813078221015065e-06,
|
|
"loss": 0.011,
|
|
"mean_token_accuracy": 0.9949698448181152,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 1.1929824561403508,
|
|
"grad_norm": 0.9860951900482178,
|
|
"learning_rate": 4.779021926385771e-06,
|
|
"loss": 0.0132,
|
|
"mean_token_accuracy": 0.9968119263648987,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 1.1973684210526316,
|
|
"grad_norm": 0.8894346952438354,
|
|
"learning_rate": 4.745007962116697e-06,
|
|
"loss": 0.0098,
|
|
"mean_token_accuracy": 0.996842086315155,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 1.2017543859649122,
|
|
"grad_norm": 1.0604538917541504,
|
|
"learning_rate": 4.711038325249655e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9912366271018982,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 1.206140350877193,
|
|
"grad_norm": 1.4495034217834473,
|
|
"learning_rate": 4.677115010223895e-06,
|
|
"loss": 0.0242,
|
|
"mean_token_accuracy": 0.9911330342292786,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 1.2105263157894737,
|
|
"grad_norm": 1.5287433862686157,
|
|
"learning_rate": 4.6432400087589925e-06,
|
|
"loss": 0.0239,
|
|
"mean_token_accuracy": 0.9931237697601318,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 1.2149122807017543,
|
|
"grad_norm": 0.8271854519844055,
|
|
"learning_rate": 4.609415309737922e-06,
|
|
"loss": 0.0118,
|
|
"mean_token_accuracy": 0.9979296326637268,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 1.219298245614035,
|
|
"grad_norm": 2.2274882793426514,
|
|
"learning_rate": 4.5756428990902765e-06,
|
|
"loss": 0.0347,
|
|
"mean_token_accuracy": 0.9863445162773132,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 1.2236842105263157,
|
|
"grad_norm": 1.3514066934585571,
|
|
"learning_rate": 4.541924759675677e-06,
|
|
"loss": 0.028,
|
|
"mean_token_accuracy": 0.9940179586410522,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 1.2280701754385965,
|
|
"grad_norm": 1.8070344924926758,
|
|
"learning_rate": 4.508262871167347e-06,
|
|
"loss": 0.0321,
|
|
"mean_token_accuracy": 0.9906928539276123,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.2324561403508771,
|
|
"grad_norm": 0.6789629459381104,
|
|
"learning_rate": 4.474659209935882e-06,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9977426528930664,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 1.236842105263158,
|
|
"grad_norm": 1.686585783958435,
|
|
"learning_rate": 4.441115748933219e-06,
|
|
"loss": 0.0238,
|
|
"mean_token_accuracy": 0.9934210777282715,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 1.2412280701754386,
|
|
"grad_norm": 2.098029375076294,
|
|
"learning_rate": 4.4076344575767895e-06,
|
|
"loss": 0.018,
|
|
"mean_token_accuracy": 0.9947643876075745,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 1.2456140350877192,
|
|
"grad_norm": 0.9982621669769287,
|
|
"learning_rate": 4.374217301633897e-06,
|
|
"loss": 0.0159,
|
|
"mean_token_accuracy": 0.9956803321838379,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 1.25,
|
|
"grad_norm": 1.0374560356140137,
|
|
"learning_rate": 4.340866243106302e-06,
|
|
"loss": 0.0211,
|
|
"mean_token_accuracy": 0.9930692911148071,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 1.2543859649122808,
|
|
"grad_norm": 1.416849970817566,
|
|
"learning_rate": 4.307583240115024e-06,
|
|
"loss": 0.0355,
|
|
"mean_token_accuracy": 0.9919517040252686,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 1.2587719298245614,
|
|
"grad_norm": 1.1965305805206299,
|
|
"learning_rate": 4.274370246785379e-06,
|
|
"loss": 0.0094,
|
|
"mean_token_accuracy": 0.9968051314353943,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 1.263157894736842,
|
|
"grad_norm": 0.9822368025779724,
|
|
"learning_rate": 4.241229213132245e-06,
|
|
"loss": 0.0133,
|
|
"mean_token_accuracy": 0.995708167552948,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 1.2675438596491229,
|
|
"grad_norm": 1.1890642642974854,
|
|
"learning_rate": 4.208162084945579e-06,
|
|
"loss": 0.0237,
|
|
"mean_token_accuracy": 0.994106113910675,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 1.2719298245614035,
|
|
"grad_norm": 0.9328476786613464,
|
|
"learning_rate": 4.175170803676166e-06,
|
|
"loss": 0.0177,
|
|
"mean_token_accuracy": 0.9957805871963501,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 1.2763157894736843,
|
|
"grad_norm": 1.1378854513168335,
|
|
"learning_rate": 4.142257306321635e-06,
|
|
"loss": 0.0119,
|
|
"mean_token_accuracy": 0.9957671761512756,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 1.280701754385965,
|
|
"grad_norm": 1.4593102931976318,
|
|
"learning_rate": 4.109423525312738e-06,
|
|
"loss": 0.0371,
|
|
"mean_token_accuracy": 0.9897330403327942,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 1.2850877192982457,
|
|
"grad_norm": 1.4562864303588867,
|
|
"learning_rate": 4.076671388399882e-06,
|
|
"loss": 0.0268,
|
|
"mean_token_accuracy": 0.9888211488723755,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 1.2894736842105263,
|
|
"grad_norm": 1.087637186050415,
|
|
"learning_rate": 4.044002818539959e-06,
|
|
"loss": 0.0126,
|
|
"mean_token_accuracy": 0.9948612451553345,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 1.293859649122807,
|
|
"grad_norm": 1.2622966766357422,
|
|
"learning_rate": 4.011419733783436e-06,
|
|
"loss": 0.0174,
|
|
"mean_token_accuracy": 0.9938587546348572,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 1.2982456140350878,
|
|
"grad_norm": 0.9577850103378296,
|
|
"learning_rate": 3.978924047161738e-06,
|
|
"loss": 0.0093,
|
|
"mean_token_accuracy": 0.9958677887916565,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 1.3026315789473684,
|
|
"grad_norm": 1.082053780555725,
|
|
"learning_rate": 3.946517666574944e-06,
|
|
"loss": 0.0215,
|
|
"mean_token_accuracy": 0.9957401752471924,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 1.3070175438596492,
|
|
"grad_norm": 1.4833756685256958,
|
|
"learning_rate": 3.914202494679753e-06,
|
|
"loss": 0.0253,
|
|
"mean_token_accuracy": 0.9936237931251526,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 1.3114035087719298,
|
|
"grad_norm": 1.4708722829818726,
|
|
"learning_rate": 3.8819804287777855e-06,
|
|
"loss": 0.0256,
|
|
"mean_token_accuracy": 0.9921104311943054,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 1.3157894736842106,
|
|
"grad_norm": 1.3255459070205688,
|
|
"learning_rate": 3.849853360704185e-06,
|
|
"loss": 0.0101,
|
|
"mean_token_accuracy": 0.9958974123001099,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.3201754385964912,
|
|
"grad_norm": 1.0774561166763306,
|
|
"learning_rate": 3.817823176716541e-06,
|
|
"loss": 0.0208,
|
|
"mean_token_accuracy": 0.997140109539032,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 1.3245614035087718,
|
|
"grad_norm": 0.8801477551460266,
|
|
"learning_rate": 3.785891757384148e-06,
|
|
"loss": 0.0129,
|
|
"mean_token_accuracy": 0.9960474371910095,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 1.3289473684210527,
|
|
"grad_norm": 1.8041269779205322,
|
|
"learning_rate": 3.7540609774775872e-06,
|
|
"loss": 0.0319,
|
|
"mean_token_accuracy": 0.9916054606437683,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 1.3232316970825195,
|
|
"learning_rate": 3.7223327058586566e-06,
|
|
"loss": 0.0249,
|
|
"mean_token_accuracy": 0.9913700222969055,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 1.337719298245614,
|
|
"grad_norm": 1.796228289604187,
|
|
"learning_rate": 3.6907088053706486e-06,
|
|
"loss": 0.0267,
|
|
"mean_token_accuracy": 0.9892683029174805,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 1.3421052631578947,
|
|
"grad_norm": 0.7775062918663025,
|
|
"learning_rate": 3.659191132728971e-06,
|
|
"loss": 0.0105,
|
|
"mean_token_accuracy": 0.997863233089447,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 1.3464912280701755,
|
|
"grad_norm": 1.0526679754257202,
|
|
"learning_rate": 3.6277815384121408e-06,
|
|
"loss": 0.0108,
|
|
"mean_token_accuracy": 0.9970443248748779,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 1.3508771929824561,
|
|
"grad_norm": 1.436594843864441,
|
|
"learning_rate": 3.5964818665531365e-06,
|
|
"loss": 0.0221,
|
|
"mean_token_accuracy": 0.9921645522117615,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 1.3552631578947367,
|
|
"grad_norm": 1.1766964197158813,
|
|
"learning_rate": 3.5652939548311217e-06,
|
|
"loss": 0.0156,
|
|
"mean_token_accuracy": 0.9943872690200806,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 1.3596491228070176,
|
|
"grad_norm": 1.0627126693725586,
|
|
"learning_rate": 3.534219634363557e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9959142208099365,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 1.3640350877192984,
|
|
"grad_norm": 1.667644739151001,
|
|
"learning_rate": 3.503260729598681e-06,
|
|
"loss": 0.0326,
|
|
"mean_token_accuracy": 0.9905857443809509,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 1.368421052631579,
|
|
"grad_norm": 2.0375561714172363,
|
|
"learning_rate": 3.4724190582084073e-06,
|
|
"loss": 0.0481,
|
|
"mean_token_accuracy": 0.9823834300041199,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 1.3728070175438596,
|
|
"grad_norm": 1.3789243698120117,
|
|
"learning_rate": 3.441696430981585e-06,
|
|
"loss": 0.0201,
|
|
"mean_token_accuracy": 0.9931034445762634,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 1.3771929824561404,
|
|
"grad_norm": 0.9940765500068665,
|
|
"learning_rate": 3.4110946517176995e-06,
|
|
"loss": 0.0178,
|
|
"mean_token_accuracy": 0.9958890080451965,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 1.381578947368421,
|
|
"grad_norm": 1.2933381795883179,
|
|
"learning_rate": 3.3806155171209632e-06,
|
|
"loss": 0.0252,
|
|
"mean_token_accuracy": 0.9917948842048645,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 1.3859649122807016,
|
|
"grad_norm": 1.69535493850708,
|
|
"learning_rate": 3.3502608166948166e-06,
|
|
"loss": 0.0376,
|
|
"mean_token_accuracy": 0.991062581539154,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 1.3903508771929824,
|
|
"grad_norm": 0.9709140658378601,
|
|
"learning_rate": 3.320032332636879e-06,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9969103932380676,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 1.3947368421052633,
|
|
"grad_norm": 1.2243115901947021,
|
|
"learning_rate": 3.2899318397342954e-06,
|
|
"loss": 0.015,
|
|
"mean_token_accuracy": 0.9939024448394775,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 1.3991228070175439,
|
|
"grad_norm": 2.733086347579956,
|
|
"learning_rate": 3.2599611052595474e-06,
|
|
"loss": 0.0081,
|
|
"mean_token_accuracy": 0.9990205764770508,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 1.4035087719298245,
|
|
"grad_norm": 0.9694793820381165,
|
|
"learning_rate": 3.2301218888666807e-06,
|
|
"loss": 0.0129,
|
|
"mean_token_accuracy": 0.9959142208099365,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.4078947368421053,
|
|
"grad_norm": 1.1064481735229492,
|
|
"learning_rate": 3.200415942488003e-06,
|
|
"loss": 0.0163,
|
|
"mean_token_accuracy": 0.995854914188385,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 1.412280701754386,
|
|
"grad_norm": 1.0130923986434937,
|
|
"learning_rate": 3.170845010231216e-06,
|
|
"loss": 0.0142,
|
|
"mean_token_accuracy": 0.9929789304733276,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 1.4166666666666667,
|
|
"grad_norm": 0.9644030928611755,
|
|
"learning_rate": 3.141410828277015e-06,
|
|
"loss": 0.0111,
|
|
"mean_token_accuracy": 0.9979209899902344,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 1.4210526315789473,
|
|
"grad_norm": 0.9800613522529602,
|
|
"learning_rate": 3.1121151247771595e-06,
|
|
"loss": 0.014,
|
|
"mean_token_accuracy": 0.9939637780189514,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 1.4254385964912282,
|
|
"grad_norm": 1.5895040035247803,
|
|
"learning_rate": 3.082959619753001e-06,
|
|
"loss": 0.0252,
|
|
"mean_token_accuracy": 0.9931906461715698,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 1.4298245614035088,
|
|
"grad_norm": 1.449415683746338,
|
|
"learning_rate": 3.053946024994506e-06,
|
|
"loss": 0.0261,
|
|
"mean_token_accuracy": 0.993062436580658,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 1.4342105263157894,
|
|
"grad_norm": 1.403334617614746,
|
|
"learning_rate": 3.025076043959739e-06,
|
|
"loss": 0.0311,
|
|
"mean_token_accuracy": 0.9867346882820129,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 1.4385964912280702,
|
|
"grad_norm": 0.661690354347229,
|
|
"learning_rate": 2.9963513716748656e-06,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9989304542541504,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 1.4429824561403508,
|
|
"grad_norm": 1.2454185485839844,
|
|
"learning_rate": 2.96777369463462e-06,
|
|
"loss": 0.0167,
|
|
"mean_token_accuracy": 0.9941291809082031,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 1.4473684210526316,
|
|
"grad_norm": 1.5708200931549072,
|
|
"learning_rate": 2.9393446907032886e-06,
|
|
"loss": 0.0275,
|
|
"mean_token_accuracy": 0.9918946027755737,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 1.4517543859649122,
|
|
"grad_norm": 1.4437129497528076,
|
|
"learning_rate": 2.911066029016208e-06,
|
|
"loss": 0.0185,
|
|
"mean_token_accuracy": 0.9933142066001892,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 1.456140350877193,
|
|
"grad_norm": 1.310154914855957,
|
|
"learning_rate": 2.8829393698817566e-06,
|
|
"loss": 0.0146,
|
|
"mean_token_accuracy": 0.9949392676353455,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 1.4605263157894737,
|
|
"grad_norm": 1.182435154914856,
|
|
"learning_rate": 2.854966364683872e-06,
|
|
"loss": 0.0145,
|
|
"mean_token_accuracy": 0.9941691160202026,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 1.4649122807017543,
|
|
"grad_norm": 1.2059062719345093,
|
|
"learning_rate": 2.827148655785107e-06,
|
|
"loss": 0.0204,
|
|
"mean_token_accuracy": 0.9931640625,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 1.469298245614035,
|
|
"grad_norm": 1.8456898927688599,
|
|
"learning_rate": 2.7994878764301857e-06,
|
|
"loss": 0.0249,
|
|
"mean_token_accuracy": 0.9907975196838379,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 1.4736842105263157,
|
|
"grad_norm": 0.9849246740341187,
|
|
"learning_rate": 2.771985650650131e-06,
|
|
"loss": 0.0084,
|
|
"mean_token_accuracy": 0.9968619346618652,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 1.4780701754385965,
|
|
"grad_norm": 0.9812407493591309,
|
|
"learning_rate": 2.7446435931668913e-06,
|
|
"loss": 0.0146,
|
|
"mean_token_accuracy": 0.9926624894142151,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 1.4824561403508771,
|
|
"grad_norm": 1.353092908859253,
|
|
"learning_rate": 2.717463309298557e-06,
|
|
"loss": 0.0243,
|
|
"mean_token_accuracy": 0.990981936454773,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 1.486842105263158,
|
|
"grad_norm": 1.1078283786773682,
|
|
"learning_rate": 2.6904463948650994e-06,
|
|
"loss": 0.0099,
|
|
"mean_token_accuracy": 0.9957716464996338,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 1.4912280701754386,
|
|
"grad_norm": 1.0464521646499634,
|
|
"learning_rate": 2.663594436094669e-06,
|
|
"loss": 0.0116,
|
|
"mean_token_accuracy": 0.9948132634162903,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.4956140350877192,
|
|
"grad_norm": 1.4984095096588135,
|
|
"learning_rate": 2.6369090095304824e-06,
|
|
"loss": 0.0168,
|
|
"mean_token_accuracy": 0.9929577708244324,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 1.5,
|
|
"grad_norm": 1.4189928770065308,
|
|
"learning_rate": 2.610391681938239e-06,
|
|
"loss": 0.0192,
|
|
"mean_token_accuracy": 0.9928789138793945,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 1.5043859649122808,
|
|
"grad_norm": 1.3250926733016968,
|
|
"learning_rate": 2.5840440102141506e-06,
|
|
"loss": 0.0171,
|
|
"mean_token_accuracy": 0.9968684911727905,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 1.5087719298245614,
|
|
"grad_norm": 1.1156545877456665,
|
|
"learning_rate": 2.5578675412935172e-06,
|
|
"loss": 0.012,
|
|
"mean_token_accuracy": 0.9957310557365417,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 1.513157894736842,
|
|
"grad_norm": 1.6553047895431519,
|
|
"learning_rate": 2.531863812059916e-06,
|
|
"loss": 0.0299,
|
|
"mean_token_accuracy": 0.9908722043037415,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 1.5175438596491229,
|
|
"grad_norm": 1.5688374042510986,
|
|
"learning_rate": 2.5060343492549567e-06,
|
|
"loss": 0.0273,
|
|
"mean_token_accuracy": 0.9896324276924133,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 1.5219298245614035,
|
|
"grad_norm": 0.9431729316711426,
|
|
"learning_rate": 2.480380669388648e-06,
|
|
"loss": 0.0113,
|
|
"mean_token_accuracy": 0.9944953918457031,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 1.526315789473684,
|
|
"grad_norm": 1.5625497102737427,
|
|
"learning_rate": 2.45490427865036e-06,
|
|
"loss": 0.0251,
|
|
"mean_token_accuracy": 0.991769552230835,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 1.530701754385965,
|
|
"grad_norm": 0.818818211555481,
|
|
"learning_rate": 2.429606672820387e-06,
|
|
"loss": 0.0083,
|
|
"mean_token_accuracy": 0.998031497001648,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 1.5350877192982457,
|
|
"grad_norm": 1.1630852222442627,
|
|
"learning_rate": 2.4044893371821373e-06,
|
|
"loss": 0.0156,
|
|
"mean_token_accuracy": 0.9918367266654968,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 1.5394736842105263,
|
|
"grad_norm": 1.1115548610687256,
|
|
"learning_rate": 2.379553746434913e-06,
|
|
"loss": 0.023,
|
|
"mean_token_accuracy": 0.9939209818840027,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 1.543859649122807,
|
|
"grad_norm": 1.636430025100708,
|
|
"learning_rate": 2.3548013646073427e-06,
|
|
"loss": 0.0279,
|
|
"mean_token_accuracy": 0.991623044013977,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 1.5482456140350878,
|
|
"grad_norm": 1.3159112930297852,
|
|
"learning_rate": 2.3302336449714166e-06,
|
|
"loss": 0.0231,
|
|
"mean_token_accuracy": 0.9938587546348572,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 1.5526315789473686,
|
|
"grad_norm": 1.300433874130249,
|
|
"learning_rate": 2.305852029957159e-06,
|
|
"loss": 0.0177,
|
|
"mean_token_accuracy": 0.9944812655448914,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 1.557017543859649,
|
|
"grad_norm": 0.9912800788879395,
|
|
"learning_rate": 2.281657951067948e-06,
|
|
"loss": 0.014,
|
|
"mean_token_accuracy": 0.9920477271080017,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 1.5614035087719298,
|
|
"grad_norm": 1.0530201196670532,
|
|
"learning_rate": 2.257652828796459e-06,
|
|
"loss": 0.0199,
|
|
"mean_token_accuracy": 0.9917184114456177,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 1.5657894736842106,
|
|
"grad_norm": 1.6638238430023193,
|
|
"learning_rate": 2.233838072541273e-06,
|
|
"loss": 0.0315,
|
|
"mean_token_accuracy": 0.9910714030265808,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 1.5701754385964912,
|
|
"grad_norm": 0.8437574505805969,
|
|
"learning_rate": 2.2102150805241233e-06,
|
|
"loss": 0.0093,
|
|
"mean_token_accuracy": 0.9969450235366821,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 1.5745614035087718,
|
|
"grad_norm": 0.6092004179954529,
|
|
"learning_rate": 2.186785239707799e-06,
|
|
"loss": 0.0062,
|
|
"mean_token_accuracy": 0.998971164226532,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 1.5789473684210527,
|
|
"grad_norm": 1.3415589332580566,
|
|
"learning_rate": 2.163549925714721e-06,
|
|
"loss": 0.0254,
|
|
"mean_token_accuracy": 0.9937824010848999,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.5833333333333335,
|
|
"grad_norm": 1.4913095235824585,
|
|
"learning_rate": 2.140510502746168e-06,
|
|
"loss": 0.0208,
|
|
"mean_token_accuracy": 0.9936237931251526,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 1.587719298245614,
|
|
"grad_norm": 1.1327183246612549,
|
|
"learning_rate": 2.1176683235021885e-06,
|
|
"loss": 0.0154,
|
|
"mean_token_accuracy": 0.9950690269470215,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 1.5921052631578947,
|
|
"grad_norm": 0.7053467631340027,
|
|
"learning_rate": 2.0950247291021713e-06,
|
|
"loss": 0.0081,
|
|
"mean_token_accuracy": 0.996999979019165,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 1.5964912280701755,
|
|
"grad_norm": 0.7855185270309448,
|
|
"learning_rate": 2.0725810490061156e-06,
|
|
"loss": 0.0195,
|
|
"mean_token_accuracy": 0.9970443248748779,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 1.6008771929824561,
|
|
"grad_norm": 1.0088672637939453,
|
|
"learning_rate": 2.0503386009365685e-06,
|
|
"loss": 0.0177,
|
|
"mean_token_accuracy": 0.9940298795700073,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 1.6052631578947367,
|
|
"grad_norm": 1.4785208702087402,
|
|
"learning_rate": 2.028298690801257e-06,
|
|
"loss": 0.0254,
|
|
"mean_token_accuracy": 0.9921645522117615,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 1.6096491228070176,
|
|
"grad_norm": 1.0246776342391968,
|
|
"learning_rate": 2.006462612616422e-06,
|
|
"loss": 0.0166,
|
|
"mean_token_accuracy": 0.9929364323616028,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 1.6140350877192984,
|
|
"grad_norm": 1.642774224281311,
|
|
"learning_rate": 1.984831648430836e-06,
|
|
"loss": 0.0268,
|
|
"mean_token_accuracy": 0.9900596141815186,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 1.618421052631579,
|
|
"grad_norm": 1.5654659271240234,
|
|
"learning_rate": 1.963407068250538e-06,
|
|
"loss": 0.0154,
|
|
"mean_token_accuracy": 0.991623044013977,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 1.6228070175438596,
|
|
"grad_norm": 1.4609651565551758,
|
|
"learning_rate": 1.9421901299642597e-06,
|
|
"loss": 0.0242,
|
|
"mean_token_accuracy": 0.9956569075584412,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 1.6271929824561404,
|
|
"grad_norm": 0.9023415446281433,
|
|
"learning_rate": 1.9211820792695808e-06,
|
|
"loss": 0.0117,
|
|
"mean_token_accuracy": 0.9969040155410767,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 1.631578947368421,
|
|
"grad_norm": 1.6618549823760986,
|
|
"learning_rate": 1.900384149599787e-06,
|
|
"loss": 0.0204,
|
|
"mean_token_accuracy": 0.9948979616165161,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 1.6359649122807016,
|
|
"grad_norm": 1.1040427684783936,
|
|
"learning_rate": 1.8797975620514497e-06,
|
|
"loss": 0.0156,
|
|
"mean_token_accuracy": 0.9969199299812317,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 1.6403508771929824,
|
|
"grad_norm": 0.9365503191947937,
|
|
"learning_rate": 1.8594235253127373e-06,
|
|
"loss": 0.0138,
|
|
"mean_token_accuracy": 0.9949596524238586,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 1.6447368421052633,
|
|
"grad_norm": 1.4119096994400024,
|
|
"learning_rate": 1.8392632355924454e-06,
|
|
"loss": 0.027,
|
|
"mean_token_accuracy": 0.9888888597488403,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 1.6491228070175439,
|
|
"grad_norm": 1.5925312042236328,
|
|
"learning_rate": 1.819317876549771e-06,
|
|
"loss": 0.0221,
|
|
"mean_token_accuracy": 0.9939698576927185,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 1.6535087719298245,
|
|
"grad_norm": 1.48525071144104,
|
|
"learning_rate": 1.7995886192248091e-06,
|
|
"loss": 0.0246,
|
|
"mean_token_accuracy": 0.9900990128517151,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 1.6578947368421053,
|
|
"grad_norm": 1.229453682899475,
|
|
"learning_rate": 1.7800766219698033e-06,
|
|
"loss": 0.024,
|
|
"mean_token_accuracy": 0.9931840300559998,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 1.662280701754386,
|
|
"grad_norm": 0.9059959053993225,
|
|
"learning_rate": 1.760783030381138e-06,
|
|
"loss": 0.0114,
|
|
"mean_token_accuracy": 0.99798184633255,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 1.1679930686950684,
|
|
"learning_rate": 1.74170897723207e-06,
|
|
"loss": 0.0271,
|
|
"mean_token_accuracy": 0.9939698576927185,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.6710526315789473,
|
|
"grad_norm": 1.0880180597305298,
|
|
"learning_rate": 1.7228555824062254e-06,
|
|
"loss": 0.0253,
|
|
"mean_token_accuracy": 0.9911330342292786,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 1.6754385964912282,
|
|
"grad_norm": 1.014100432395935,
|
|
"learning_rate": 1.7042239528318539e-06,
|
|
"loss": 0.0176,
|
|
"mean_token_accuracy": 0.9950099587440491,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 1.6798245614035088,
|
|
"grad_norm": 0.5912336707115173,
|
|
"learning_rate": 1.6858151824168254e-06,
|
|
"loss": 0.0094,
|
|
"mean_token_accuracy": 0.9989626407623291,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 1.6842105263157894,
|
|
"grad_norm": 0.47988444566726685,
|
|
"learning_rate": 1.6676303519844179e-06,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 0.9989888668060303,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 1.6885964912280702,
|
|
"grad_norm": 0.8239793181419373,
|
|
"learning_rate": 1.649670529209848e-06,
|
|
"loss": 0.0134,
|
|
"mean_token_accuracy": 0.9968085289001465,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 1.692982456140351,
|
|
"grad_norm": 0.7349763512611389,
|
|
"learning_rate": 1.631936768557596e-06,
|
|
"loss": 0.0158,
|
|
"mean_token_accuracy": 0.9979444742202759,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 1.6973684210526314,
|
|
"grad_norm": 0.9158388376235962,
|
|
"learning_rate": 1.6144301112194843e-06,
|
|
"loss": 0.018,
|
|
"mean_token_accuracy": 0.9950787425041199,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 1.7017543859649122,
|
|
"grad_norm": 0.8673951029777527,
|
|
"learning_rate": 1.5971515850535568e-06,
|
|
"loss": 0.0143,
|
|
"mean_token_accuracy": 0.9948024749755859,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 1.706140350877193,
|
|
"grad_norm": 1.3981205224990845,
|
|
"learning_rate": 1.5801022045237252e-06,
|
|
"loss": 0.0158,
|
|
"mean_token_accuracy": 0.9948822855949402,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 1.7105263157894737,
|
|
"grad_norm": 1.0226213932037354,
|
|
"learning_rate": 1.5632829706402076e-06,
|
|
"loss": 0.0157,
|
|
"mean_token_accuracy": 0.992790937423706,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 1.7149122807017543,
|
|
"grad_norm": 1.0367653369903564,
|
|
"learning_rate": 1.5466948709007604e-06,
|
|
"loss": 0.0128,
|
|
"mean_token_accuracy": 0.9968944191932678,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 1.719298245614035,
|
|
"grad_norm": 0.9219287633895874,
|
|
"learning_rate": 1.5303388792326934e-06,
|
|
"loss": 0.0091,
|
|
"mean_token_accuracy": 0.9959595799446106,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 1.723684210526316,
|
|
"grad_norm": 1.0634158849716187,
|
|
"learning_rate": 1.5142159559356961e-06,
|
|
"loss": 0.0217,
|
|
"mean_token_accuracy": 0.9946178793907166,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 1.7280701754385965,
|
|
"grad_norm": 0.6948128938674927,
|
|
"learning_rate": 1.4983270476254503e-06,
|
|
"loss": 0.0101,
|
|
"mean_token_accuracy": 0.9969666600227356,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 1.7324561403508771,
|
|
"grad_norm": 0.7936986088752747,
|
|
"learning_rate": 1.4826730871780534e-06,
|
|
"loss": 0.008,
|
|
"mean_token_accuracy": 0.9979550242424011,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 1.736842105263158,
|
|
"grad_norm": 1.4505982398986816,
|
|
"learning_rate": 1.4672549936752507e-06,
|
|
"loss": 0.0238,
|
|
"mean_token_accuracy": 0.9917269945144653,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 1.7412280701754386,
|
|
"grad_norm": 1.2325935363769531,
|
|
"learning_rate": 1.4520736723504658e-06,
|
|
"loss": 0.0295,
|
|
"mean_token_accuracy": 0.992790937423706,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 1.7456140350877192,
|
|
"grad_norm": 0.6576675772666931,
|
|
"learning_rate": 1.437130014535662e-06,
|
|
"loss": 0.0054,
|
|
"mean_token_accuracy": 0.9979166388511658,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 1.75,
|
|
"grad_norm": 1.0155889987945557,
|
|
"learning_rate": 1.4224248976090016e-06,
|
|
"loss": 0.017,
|
|
"mean_token_accuracy": 0.9958974123001099,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 1.7543859649122808,
|
|
"grad_norm": 1.2243722677230835,
|
|
"learning_rate": 1.4079591849433383e-06,
|
|
"loss": 0.0179,
|
|
"mean_token_accuracy": 0.9928352236747742,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.7587719298245614,
|
|
"grad_norm": 0.9892923831939697,
|
|
"learning_rate": 1.3937337258555252e-06,
|
|
"loss": 0.0109,
|
|
"mean_token_accuracy": 0.9957761168479919,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 1.763157894736842,
|
|
"grad_norm": 0.8138551115989685,
|
|
"learning_rate": 1.379749355556547e-06,
|
|
"loss": 0.0151,
|
|
"mean_token_accuracy": 0.9960435032844543,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 1.7675438596491229,
|
|
"grad_norm": 0.639743447303772,
|
|
"learning_rate": 1.3660068951024857e-06,
|
|
"loss": 0.0098,
|
|
"mean_token_accuracy": 0.9989506602287292,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 1.7719298245614035,
|
|
"grad_norm": 1.450443148612976,
|
|
"learning_rate": 1.3525071513463128e-06,
|
|
"loss": 0.0236,
|
|
"mean_token_accuracy": 0.9903948903083801,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 1.776315789473684,
|
|
"grad_norm": 1.0791845321655273,
|
|
"learning_rate": 1.339250916890519e-06,
|
|
"loss": 0.0215,
|
|
"mean_token_accuracy": 0.992790937423706,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 1.780701754385965,
|
|
"grad_norm": 1.399596929550171,
|
|
"learning_rate": 1.3262389700405746e-06,
|
|
"loss": 0.0186,
|
|
"mean_token_accuracy": 0.9940476417541504,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 1.7850877192982457,
|
|
"grad_norm": 0.9804695248603821,
|
|
"learning_rate": 1.3134720747592373e-06,
|
|
"loss": 0.0184,
|
|
"mean_token_accuracy": 0.9948506951332092,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 1.7894736842105263,
|
|
"grad_norm": 0.8765610456466675,
|
|
"learning_rate": 1.3009509806216986e-06,
|
|
"loss": 0.0074,
|
|
"mean_token_accuracy": 0.9968782663345337,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 1.793859649122807,
|
|
"grad_norm": 1.2848998308181763,
|
|
"learning_rate": 1.2886764227715679e-06,
|
|
"loss": 0.0178,
|
|
"mean_token_accuracy": 0.9906736016273499,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 1.7982456140350878,
|
|
"grad_norm": 1.2028905153274536,
|
|
"learning_rate": 1.2766491218777197e-06,
|
|
"loss": 0.0173,
|
|
"mean_token_accuracy": 0.9950835704803467,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.8026315789473686,
|
|
"grad_norm": 1.0921279191970825,
|
|
"learning_rate": 1.2648697840919732e-06,
|
|
"loss": 0.014,
|
|
"mean_token_accuracy": 0.9947368502616882,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 1.807017543859649,
|
|
"grad_norm": 0.8034150004386902,
|
|
"learning_rate": 1.2533391010076381e-06,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9960317611694336,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 1.8114035087719298,
|
|
"grad_norm": 1.2984596490859985,
|
|
"learning_rate": 1.2420577496189063e-06,
|
|
"loss": 0.0372,
|
|
"mean_token_accuracy": 0.9912366271018982,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 1.8157894736842106,
|
|
"grad_norm": 1.6242046356201172,
|
|
"learning_rate": 1.2310263922811048e-06,
|
|
"loss": 0.018,
|
|
"mean_token_accuracy": 0.9937499761581421,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 1.8201754385964912,
|
|
"grad_norm": 0.7923622131347656,
|
|
"learning_rate": 1.2202456766718092e-06,
|
|
"loss": 0.0183,
|
|
"mean_token_accuracy": 0.9956849813461304,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 1.8245614035087718,
|
|
"grad_norm": 0.9306057095527649,
|
|
"learning_rate": 1.2097162357528128e-06,
|
|
"loss": 0.0118,
|
|
"mean_token_accuracy": 0.9959555268287659,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 1.8289473684210527,
|
|
"grad_norm": 1.0107431411743164,
|
|
"learning_rate": 1.1994386877329678e-06,
|
|
"loss": 0.0182,
|
|
"mean_token_accuracy": 0.9948875308036804,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 1.8333333333333335,
|
|
"grad_norm": 1.1881065368652344,
|
|
"learning_rate": 1.189413636031886e-06,
|
|
"loss": 0.0172,
|
|
"mean_token_accuracy": 0.9960707426071167,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 1.837719298245614,
|
|
"grad_norm": 1.2647409439086914,
|
|
"learning_rate": 1.179641669244514e-06,
|
|
"loss": 0.0264,
|
|
"mean_token_accuracy": 0.990750253200531,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 1.8421052631578947,
|
|
"grad_norm": 1.1645160913467407,
|
|
"learning_rate": 1.1701233611065705e-06,
|
|
"loss": 0.0123,
|
|
"mean_token_accuracy": 0.9961464405059814,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.8464912280701755,
|
|
"grad_norm": 1.337747573852539,
|
|
"learning_rate": 1.1608592704608656e-06,
|
|
"loss": 0.0087,
|
|
"mean_token_accuracy": 0.995708167552948,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 1.8508771929824561,
|
|
"grad_norm": 1.1848918199539185,
|
|
"learning_rate": 1.1518499412244872e-06,
|
|
"loss": 0.0178,
|
|
"mean_token_accuracy": 0.9929006099700928,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 1.8552631578947367,
|
|
"grad_norm": 1.308233380317688,
|
|
"learning_rate": 1.1430959023568654e-06,
|
|
"loss": 0.0227,
|
|
"mean_token_accuracy": 0.9948186278343201,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 1.8596491228070176,
|
|
"grad_norm": 0.8815935254096985,
|
|
"learning_rate": 1.1345976678287216e-06,
|
|
"loss": 0.0095,
|
|
"mean_token_accuracy": 0.9958115220069885,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 1.8640350877192984,
|
|
"grad_norm": 1.0849359035491943,
|
|
"learning_rate": 1.126355736591882e-06,
|
|
"loss": 0.0213,
|
|
"mean_token_accuracy": 0.9948875308036804,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 1.868421052631579,
|
|
"grad_norm": 0.9567351937294006,
|
|
"learning_rate": 1.1183705925499948e-06,
|
|
"loss": 0.0096,
|
|
"mean_token_accuracy": 0.9960707426071167,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 1.8728070175438596,
|
|
"grad_norm": 0.8804600834846497,
|
|
"learning_rate": 1.1106427045301085e-06,
|
|
"loss": 0.0111,
|
|
"mean_token_accuracy": 0.99689120054245,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 1.8771929824561404,
|
|
"grad_norm": 1.2652560472488403,
|
|
"learning_rate": 1.1031725262551536e-06,
|
|
"loss": 0.0207,
|
|
"mean_token_accuracy": 0.9969574213027954,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 1.881578947368421,
|
|
"grad_norm": 0.8557901978492737,
|
|
"learning_rate": 1.0959604963172996e-06,
|
|
"loss": 0.0101,
|
|
"mean_token_accuracy": 0.9969879388809204,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 1.8859649122807016,
|
|
"grad_norm": 0.7391759753227234,
|
|
"learning_rate": 1.0890070381522038e-06,
|
|
"loss": 0.0109,
|
|
"mean_token_accuracy": 0.998993992805481,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 1.8903508771929824,
|
|
"grad_norm": 1.2102177143096924,
|
|
"learning_rate": 1.0823125600141529e-06,
|
|
"loss": 0.0222,
|
|
"mean_token_accuracy": 0.9951028227806091,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 1.8947368421052633,
|
|
"grad_norm": 1.0668492317199707,
|
|
"learning_rate": 1.0758774549520922e-06,
|
|
"loss": 0.0212,
|
|
"mean_token_accuracy": 0.9962581992149353,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 1.8991228070175439,
|
|
"grad_norm": 0.9030259847640991,
|
|
"learning_rate": 1.069702100786548e-06,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9958376884460449,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 1.9035087719298245,
|
|
"grad_norm": 1.0430710315704346,
|
|
"learning_rate": 1.0637868600874448e-06,
|
|
"loss": 0.017,
|
|
"mean_token_accuracy": 0.9969103932380676,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 1.9078947368421053,
|
|
"grad_norm": 1.3210341930389404,
|
|
"learning_rate": 1.0581320801528202e-06,
|
|
"loss": 0.0143,
|
|
"mean_token_accuracy": 0.9938207864761353,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 1.912280701754386,
|
|
"grad_norm": 0.8668481111526489,
|
|
"learning_rate": 1.0527380929884324e-06,
|
|
"loss": 0.0096,
|
|
"mean_token_accuracy": 0.9968119263648987,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 1.9166666666666665,
|
|
"grad_norm": 0.8239589333534241,
|
|
"learning_rate": 1.0476052152882653e-06,
|
|
"loss": 0.0103,
|
|
"mean_token_accuracy": 0.9959636926651001,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 1.9210526315789473,
|
|
"grad_norm": 1.4379644393920898,
|
|
"learning_rate": 1.0427337484159404e-06,
|
|
"loss": 0.0382,
|
|
"mean_token_accuracy": 0.9892367720603943,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 1.9254385964912282,
|
|
"grad_norm": 1.5320522785186768,
|
|
"learning_rate": 1.0381239783870168e-06,
|
|
"loss": 0.0294,
|
|
"mean_token_accuracy": 0.9927158951759338,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 1.9298245614035088,
|
|
"grad_norm": 0.6085926294326782,
|
|
"learning_rate": 1.0337761758522028e-06,
|
|
"loss": 0.0043,
|
|
"mean_token_accuracy": 0.9989583492279053,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.9342105263157894,
|
|
"grad_norm": 1.037218451499939,
|
|
"learning_rate": 1.0296905960814626e-06,
|
|
"loss": 0.0144,
|
|
"mean_token_accuracy": 0.9921348094940186,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 1.9385964912280702,
|
|
"grad_norm": 0.5523264408111572,
|
|
"learning_rate": 1.025867478949031e-06,
|
|
"loss": 0.0044,
|
|
"mean_token_accuracy": 0.9989429116249084,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 1.942982456140351,
|
|
"grad_norm": 0.9617688655853271,
|
|
"learning_rate": 1.0223070489193277e-06,
|
|
"loss": 0.0185,
|
|
"mean_token_accuracy": 0.9959058165550232,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 1.9473684210526314,
|
|
"grad_norm": 0.8199120163917542,
|
|
"learning_rate": 1.0190095150337812e-06,
|
|
"loss": 0.0086,
|
|
"mean_token_accuracy": 0.9969819188117981,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 1.9517543859649122,
|
|
"grad_norm": 0.6348384022712708,
|
|
"learning_rate": 1.015975070898552e-06,
|
|
"loss": 0.007,
|
|
"mean_token_accuracy": 0.9978540539741516,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 1.956140350877193,
|
|
"grad_norm": 1.40470552444458,
|
|
"learning_rate": 1.0132038946731682e-06,
|
|
"loss": 0.0218,
|
|
"mean_token_accuracy": 0.9930139780044556,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 1.9605263157894737,
|
|
"grad_norm": 1.2154960632324219,
|
|
"learning_rate": 1.0106961490600648e-06,
|
|
"loss": 0.0158,
|
|
"mean_token_accuracy": 0.9927158951759338,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 1.9649122807017543,
|
|
"grad_norm": 1.1413288116455078,
|
|
"learning_rate": 1.0084519812950302e-06,
|
|
"loss": 0.0191,
|
|
"mean_token_accuracy": 0.994301974773407,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 1.969298245614035,
|
|
"grad_norm": 0.8637524247169495,
|
|
"learning_rate": 1.0064715231385614e-06,
|
|
"loss": 0.0116,
|
|
"mean_token_accuracy": 0.9956803321838379,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 1.973684210526316,
|
|
"grad_norm": 0.9894522428512573,
|
|
"learning_rate": 1.0047548908681308e-06,
|
|
"loss": 0.01,
|
|
"mean_token_accuracy": 0.9977900385856628,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 1.9780701754385965,
|
|
"grad_norm": 0.6048401594161987,
|
|
"learning_rate": 1.003302185271355e-06,
|
|
"loss": 0.0048,
|
|
"mean_token_accuracy": 0.998971164226532,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 1.9824561403508771,
|
|
"grad_norm": 0.8870363831520081,
|
|
"learning_rate": 1.002113491640081e-06,
|
|
"loss": 0.013,
|
|
"mean_token_accuracy": 0.9958974123001099,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 1.986842105263158,
|
|
"grad_norm": 1.6314061880111694,
|
|
"learning_rate": 1.001188879765377e-06,
|
|
"loss": 0.0356,
|
|
"mean_token_accuracy": 0.9894958138465881,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 1.9912280701754386,
|
|
"grad_norm": 1.25128173828125,
|
|
"learning_rate": 1.000528403933433e-06,
|
|
"loss": 0.0161,
|
|
"mean_token_accuracy": 0.9958847761154175,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 1.9956140350877192,
|
|
"grad_norm": 1.4350489377975464,
|
|
"learning_rate": 1.0001321029223788e-06,
|
|
"loss": 0.0141,
|
|
"mean_token_accuracy": 0.9948875308036804,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.1879558563232422,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 0.0127,
|
|
"mean_token_accuracy": 0.9937952160835266,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"step": 456,
|
|
"total_flos": 1.0952542154391552e+17,
|
|
"train_loss": 0.13070940664814165,
|
|
"train_runtime": 1765.7438,
|
|
"train_samples_per_second": 8.246,
|
|
"train_steps_per_second": 0.258
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 456,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": false,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.0952542154391552e+17,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|