Model: Hyeongwon/PS_only_answer_Qwen3-4B-Base_0328-01-1e-5-seed43 Source: Original Platform
5564 lines
156 KiB
JSON
5564 lines
156 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 6.0,
|
|
"eval_steps": 500,
|
|
"global_step": 552,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.404266357421875,
|
|
"epoch": 0.010869565217391304,
|
|
"grad_norm": 313.00699229391836,
|
|
"learning_rate": 0.0,
|
|
"loss": 8.9634,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 737905.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.3954620361328125,
|
|
"epoch": 0.021739130434782608,
|
|
"grad_norm": 313.75057892481897,
|
|
"learning_rate": 3.5714285714285716e-07,
|
|
"loss": 8.9498,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 1470568.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.4025115966796875,
|
|
"epoch": 0.03260869565217391,
|
|
"grad_norm": 316.14542968509846,
|
|
"learning_rate": 7.142857142857143e-07,
|
|
"loss": 8.9467,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 2209158.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.397705078125,
|
|
"epoch": 0.043478260869565216,
|
|
"grad_norm": 319.02599092885146,
|
|
"learning_rate": 1.0714285714285714e-06,
|
|
"loss": 8.7856,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 2957787.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.4181060791015625,
|
|
"epoch": 0.05434782608695652,
|
|
"grad_norm": 340.7958215414108,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"loss": 8.3392,
|
|
"mean_token_accuracy": 0.0026041667442768812,
|
|
"num_tokens": 3682588.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.397796630859375,
|
|
"epoch": 0.06521739130434782,
|
|
"grad_norm": 343.8013784352059,
|
|
"learning_rate": 1.7857142857142859e-06,
|
|
"loss": 8.118,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 4425820.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.4186859130859375,
|
|
"epoch": 0.07608695652173914,
|
|
"grad_norm": 299.40163631283133,
|
|
"learning_rate": 2.1428571428571427e-06,
|
|
"loss": 6.2525,
|
|
"mean_token_accuracy": 0.02343750069849193,
|
|
"num_tokens": 5129532.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.401458740234375,
|
|
"epoch": 0.08695652173913043,
|
|
"grad_norm": 218.5263668401771,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 5.4662,
|
|
"mean_token_accuracy": 0.10156250302679837,
|
|
"num_tokens": 5858216.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.41046142578125,
|
|
"epoch": 0.09782608695652174,
|
|
"grad_norm": 98.01489994287654,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": 4.2172,
|
|
"mean_token_accuracy": 0.5156250153668225,
|
|
"num_tokens": 6594329.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.4373779296875,
|
|
"epoch": 0.10869565217391304,
|
|
"grad_norm": 86.34982647614594,
|
|
"learning_rate": 3.2142857142857147e-06,
|
|
"loss": 4.0132,
|
|
"mean_token_accuracy": 0.5104166818782687,
|
|
"num_tokens": 7328095.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.4120941162109375,
|
|
"epoch": 0.11956521739130435,
|
|
"grad_norm": 59.62130011360643,
|
|
"learning_rate": 3.5714285714285718e-06,
|
|
"loss": 3.3914,
|
|
"mean_token_accuracy": 0.49479168141260743,
|
|
"num_tokens": 8046632.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.3974456787109375,
|
|
"epoch": 0.13043478260869565,
|
|
"grad_norm": 59.0956607506056,
|
|
"learning_rate": 3.928571428571429e-06,
|
|
"loss": 3.3033,
|
|
"mean_token_accuracy": 0.505208348389715,
|
|
"num_tokens": 8797060.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.4252777099609375,
|
|
"epoch": 0.14130434782608695,
|
|
"grad_norm": 59.96554860625357,
|
|
"learning_rate": 4.2857142857142855e-06,
|
|
"loss": 3.2304,
|
|
"mean_token_accuracy": 0.48958334792405367,
|
|
"num_tokens": 9502387.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.3999481201171875,
|
|
"epoch": 0.15217391304347827,
|
|
"grad_norm": 56.601419569611856,
|
|
"learning_rate": 4.642857142857144e-06,
|
|
"loss": 3.1392,
|
|
"mean_token_accuracy": 0.5000000149011612,
|
|
"num_tokens": 10246541.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.4176025390625,
|
|
"epoch": 0.16304347826086957,
|
|
"grad_norm": 62.70185946311896,
|
|
"learning_rate": 5e-06,
|
|
"loss": 3.0106,
|
|
"mean_token_accuracy": 0.4973958481568843,
|
|
"num_tokens": 10966263.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.4071807861328125,
|
|
"epoch": 0.17391304347826086,
|
|
"grad_norm": 56.396286145104646,
|
|
"learning_rate": 5.357142857142857e-06,
|
|
"loss": 2.97,
|
|
"mean_token_accuracy": 0.5338541825767606,
|
|
"num_tokens": 11712181.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.4021759033203125,
|
|
"epoch": 0.18478260869565216,
|
|
"grad_norm": 55.930884056182215,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": 2.932,
|
|
"mean_token_accuracy": 0.5494791830424219,
|
|
"num_tokens": 12443600.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.404205322265625,
|
|
"epoch": 0.1956521739130435,
|
|
"grad_norm": 56.25248851415371,
|
|
"learning_rate": 6.071428571428571e-06,
|
|
"loss": 2.8755,
|
|
"mean_token_accuracy": 0.5625000167638063,
|
|
"num_tokens": 13174697.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.3925628662109375,
|
|
"epoch": 0.20652173913043478,
|
|
"grad_norm": 55.903558309130915,
|
|
"learning_rate": 6.4285714285714295e-06,
|
|
"loss": 2.8401,
|
|
"mean_token_accuracy": 0.5546875165309757,
|
|
"num_tokens": 13894994.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.396087646484375,
|
|
"epoch": 0.21739130434782608,
|
|
"grad_norm": 56.263861704834696,
|
|
"learning_rate": 6.785714285714287e-06,
|
|
"loss": 2.8299,
|
|
"mean_token_accuracy": 0.5625000167638063,
|
|
"num_tokens": 14640557.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.39532470703125,
|
|
"epoch": 0.22826086956521738,
|
|
"grad_norm": 66.40245470892708,
|
|
"learning_rate": 7.1428571428571436e-06,
|
|
"loss": 2.7688,
|
|
"mean_token_accuracy": 0.5572916832752526,
|
|
"num_tokens": 15339345.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.42333984375,
|
|
"epoch": 0.2391304347826087,
|
|
"grad_norm": 56.667229607783376,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 2.7188,
|
|
"mean_token_accuracy": 0.5755208504851907,
|
|
"num_tokens": 16064743.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.3929443359375,
|
|
"epoch": 0.25,
|
|
"grad_norm": 56.391055572954926,
|
|
"learning_rate": 7.857142857142858e-06,
|
|
"loss": 2.6847,
|
|
"mean_token_accuracy": 0.570312516996637,
|
|
"num_tokens": 16784070.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.3961334228515625,
|
|
"epoch": 0.2608695652173913,
|
|
"grad_norm": 57.02357138888174,
|
|
"learning_rate": 8.214285714285714e-06,
|
|
"loss": 2.6387,
|
|
"mean_token_accuracy": 0.5807291839737445,
|
|
"num_tokens": 17521654.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.4019775390625,
|
|
"epoch": 0.2717391304347826,
|
|
"grad_norm": 57.12254147822104,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 2.6204,
|
|
"mean_token_accuracy": 0.5625000167638063,
|
|
"num_tokens": 18236804.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.38588714599609375,
|
|
"epoch": 0.2826086956521739,
|
|
"grad_norm": 59.64148278800173,
|
|
"learning_rate": 8.92857142857143e-06,
|
|
"loss": 2.6181,
|
|
"mean_token_accuracy": 0.5598958500195295,
|
|
"num_tokens": 18962786.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.3935699462890625,
|
|
"epoch": 0.29347826086956524,
|
|
"grad_norm": 58.77246120811099,
|
|
"learning_rate": 9.285714285714288e-06,
|
|
"loss": 2.548,
|
|
"mean_token_accuracy": 0.5651041835080832,
|
|
"num_tokens": 19683948.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.404327392578125,
|
|
"epoch": 0.30434782608695654,
|
|
"grad_norm": 57.58499849661708,
|
|
"learning_rate": 9.642857142857144e-06,
|
|
"loss": 2.5077,
|
|
"mean_token_accuracy": 0.5546875165309757,
|
|
"num_tokens": 20414675.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.390533447265625,
|
|
"epoch": 0.31521739130434784,
|
|
"grad_norm": 57.246018838660014,
|
|
"learning_rate": 1e-05,
|
|
"loss": 2.4778,
|
|
"mean_token_accuracy": 0.5468750162981451,
|
|
"num_tokens": 21107497.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.39447021484375,
|
|
"epoch": 0.32608695652173914,
|
|
"grad_norm": 57.62520170460501,
|
|
"learning_rate": 9.999910138041584e-06,
|
|
"loss": 2.4387,
|
|
"mean_token_accuracy": 0.5807291839737445,
|
|
"num_tokens": 21831103.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.38617706298828125,
|
|
"epoch": 0.33695652173913043,
|
|
"grad_norm": 57.60510786680138,
|
|
"learning_rate": 9.999640555396404e-06,
|
|
"loss": 2.389,
|
|
"mean_token_accuracy": 0.5963541844394058,
|
|
"num_tokens": 22569070.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.3925933837890625,
|
|
"epoch": 0.34782608695652173,
|
|
"grad_norm": 57.53027394715068,
|
|
"learning_rate": 9.99919126175455e-06,
|
|
"loss": 2.3485,
|
|
"mean_token_accuracy": 0.5677083495538682,
|
|
"num_tokens": 23296410.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.3921966552734375,
|
|
"epoch": 0.358695652173913,
|
|
"grad_norm": 57.92112872538713,
|
|
"learning_rate": 9.998562273265786e-06,
|
|
"loss": 2.3043,
|
|
"mean_token_accuracy": 0.830729175824672,
|
|
"num_tokens": 24014340.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.3832855224609375,
|
|
"epoch": 0.3695652173913043,
|
|
"grad_norm": 57.82783533191108,
|
|
"learning_rate": 9.997753612538963e-06,
|
|
"loss": 2.2213,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 24771904.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.3818511962890625,
|
|
"epoch": 0.3804347826086957,
|
|
"grad_norm": 58.4348316031451,
|
|
"learning_rate": 9.996765308641218e-06,
|
|
"loss": 2.158,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 25514711.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.40313720703125,
|
|
"epoch": 0.391304347826087,
|
|
"grad_norm": 58.95955253419748,
|
|
"learning_rate": 9.995597397096923e-06,
|
|
"loss": 2.1357,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 26239512.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.4049835205078125,
|
|
"epoch": 0.40217391304347827,
|
|
"grad_norm": 59.445854902474665,
|
|
"learning_rate": 9.994249919886402e-06,
|
|
"loss": 2.1124,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 26958251.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.4022369384765625,
|
|
"epoch": 0.41304347826086957,
|
|
"grad_norm": 59.872017890023315,
|
|
"learning_rate": 9.992722925444434e-06,
|
|
"loss": 2.1031,
|
|
"mean_token_accuracy": 0.8723958409391344,
|
|
"num_tokens": 27685926.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.4070281982421875,
|
|
"epoch": 0.42391304347826086,
|
|
"grad_norm": 58.89159607531823,
|
|
"learning_rate": 9.9910164686585e-06,
|
|
"loss": 2.0126,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 28400037.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.3868255615234375,
|
|
"epoch": 0.43478260869565216,
|
|
"grad_norm": 59.10128789166655,
|
|
"learning_rate": 9.989130610866822e-06,
|
|
"loss": 1.9784,
|
|
"mean_token_accuracy": 0.8828125069849193,
|
|
"num_tokens": 29130447.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.3898162841796875,
|
|
"epoch": 0.44565217391304346,
|
|
"grad_norm": 58.788742536631304,
|
|
"learning_rate": 9.98706541985615e-06,
|
|
"loss": 1.9241,
|
|
"mean_token_accuracy": 0.8932291730307043,
|
|
"num_tokens": 29852981.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.3857574462890625,
|
|
"epoch": 0.45652173913043476,
|
|
"grad_norm": 58.265802853351445,
|
|
"learning_rate": 9.984820969859326e-06,
|
|
"loss": 1.8673,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 30610823.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.394622802734375,
|
|
"epoch": 0.4673913043478261,
|
|
"grad_norm": 58.33827701953295,
|
|
"learning_rate": 9.98239734155262e-06,
|
|
"loss": 1.8113,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 31352343.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.394317626953125,
|
|
"epoch": 0.4782608695652174,
|
|
"grad_norm": 59.83144529336275,
|
|
"learning_rate": 9.979794622052825e-06,
|
|
"loss": 1.7759,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 32077379.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.394744873046875,
|
|
"epoch": 0.4891304347826087,
|
|
"grad_norm": 59.628579098514365,
|
|
"learning_rate": 9.977012904914133e-06,
|
|
"loss": 1.7458,
|
|
"mean_token_accuracy": 0.8750000074505806,
|
|
"num_tokens": 32817982.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.3867645263671875,
|
|
"epoch": 0.5,
|
|
"grad_norm": 60.155439032681016,
|
|
"learning_rate": 9.97405229012476e-06,
|
|
"loss": 1.6497,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 33528777.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.4013519287109375,
|
|
"epoch": 0.5108695652173914,
|
|
"grad_norm": 58.18768229722744,
|
|
"learning_rate": 9.970912884103365e-06,
|
|
"loss": 1.5985,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 34250894.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.3849639892578125,
|
|
"epoch": 0.5217391304347826,
|
|
"grad_norm": 58.28644918886613,
|
|
"learning_rate": 9.967594799695218e-06,
|
|
"loss": 1.5554,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 34982448.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.388427734375,
|
|
"epoch": 0.532608695652174,
|
|
"grad_norm": 56.70252712779969,
|
|
"learning_rate": 9.964098156168143e-06,
|
|
"loss": 1.4927,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 35700246.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.3887176513671875,
|
|
"epoch": 0.5434782608695652,
|
|
"grad_norm": 58.595352893563565,
|
|
"learning_rate": 9.960423079208235e-06,
|
|
"loss": 1.4329,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 36435162.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.3783721923828125,
|
|
"epoch": 0.5543478260869565,
|
|
"grad_norm": 57.102356098204076,
|
|
"learning_rate": 9.956569700915338e-06,
|
|
"loss": 1.3891,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 37188304.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.3859405517578125,
|
|
"epoch": 0.5652173913043478,
|
|
"grad_norm": 56.705212667862135,
|
|
"learning_rate": 9.9525381597983e-06,
|
|
"loss": 1.3302,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 37917370.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.4050750732421875,
|
|
"epoch": 0.5760869565217391,
|
|
"grad_norm": 56.7397897280413,
|
|
"learning_rate": 9.948328600769996e-06,
|
|
"loss": 1.2835,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 38647541.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.392974853515625,
|
|
"epoch": 0.5869565217391305,
|
|
"grad_norm": 57.78691302950355,
|
|
"learning_rate": 9.943941175142109e-06,
|
|
"loss": 1.26,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 39380674.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.4039459228515625,
|
|
"epoch": 0.5978260869565217,
|
|
"grad_norm": 56.218356320519,
|
|
"learning_rate": 9.939376040619707e-06,
|
|
"loss": 1.1865,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 40070970.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.37506103515625,
|
|
"epoch": 0.6086956521739131,
|
|
"grad_norm": 57.04525550772644,
|
|
"learning_rate": 9.934633361295558e-06,
|
|
"loss": 1.1413,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 40845659.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.3908843994140625,
|
|
"epoch": 0.6195652173913043,
|
|
"grad_norm": 55.749337153225746,
|
|
"learning_rate": 9.929713307644245e-06,
|
|
"loss": 1.0913,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 41609437.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.3894805908203125,
|
|
"epoch": 0.6304347826086957,
|
|
"grad_norm": 55.22233207753206,
|
|
"learning_rate": 9.924616056516027e-06,
|
|
"loss": 1.0639,
|
|
"mean_token_accuracy": 0.890625006519258,
|
|
"num_tokens": 42352646.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.4128570556640625,
|
|
"epoch": 0.6413043478260869,
|
|
"grad_norm": 55.105924084996,
|
|
"learning_rate": 9.919341791130496e-06,
|
|
"loss": 1.0074,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 43077972.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.380615234375,
|
|
"epoch": 0.6521739130434783,
|
|
"grad_norm": 54.21147172022638,
|
|
"learning_rate": 9.91389070106998e-06,
|
|
"loss": 0.9486,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 43807767.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.3988494873046875,
|
|
"epoch": 0.6630434782608695,
|
|
"grad_norm": 54.07818048206698,
|
|
"learning_rate": 9.908262982272724e-06,
|
|
"loss": 0.8903,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 44520446.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.4084320068359375,
|
|
"epoch": 0.6739130434782609,
|
|
"grad_norm": 52.54750581121588,
|
|
"learning_rate": 9.902458837025865e-06,
|
|
"loss": 0.848,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 45275581.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.3879547119140625,
|
|
"epoch": 0.6847826086956522,
|
|
"grad_norm": 56.027093676059366,
|
|
"learning_rate": 9.896478473958147e-06,
|
|
"loss": 0.8663,
|
|
"mean_token_accuracy": 0.8984375060535967,
|
|
"num_tokens": 46026844.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.4013824462890625,
|
|
"epoch": 0.6956521739130435,
|
|
"grad_norm": 50.96307473661193,
|
|
"learning_rate": 9.890322108032423e-06,
|
|
"loss": 0.7745,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 46750916.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.3828887939453125,
|
|
"epoch": 0.7065217391304348,
|
|
"grad_norm": 50.389437492051776,
|
|
"learning_rate": 9.883989960537934e-06,
|
|
"loss": 0.7381,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 47486655.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.3843231201171875,
|
|
"epoch": 0.717391304347826,
|
|
"grad_norm": 47.17589413565191,
|
|
"learning_rate": 9.87748225908235e-06,
|
|
"loss": 0.664,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 48218385.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.37799072265625,
|
|
"epoch": 0.7282608695652174,
|
|
"grad_norm": 46.667895955085605,
|
|
"learning_rate": 9.870799237583586e-06,
|
|
"loss": 0.6395,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 48972792.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.3830108642578125,
|
|
"epoch": 0.7391304347826086,
|
|
"grad_norm": 44.115438060089616,
|
|
"learning_rate": 9.863941136261409e-06,
|
|
"loss": 0.6268,
|
|
"mean_token_accuracy": 0.890625006519258,
|
|
"num_tokens": 49692331.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.38470458984375,
|
|
"epoch": 0.75,
|
|
"grad_norm": 41.88531100195252,
|
|
"learning_rate": 9.85690820162878e-06,
|
|
"loss": 0.5393,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 50431563.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.3878631591796875,
|
|
"epoch": 0.7608695652173914,
|
|
"grad_norm": 40.513472398944074,
|
|
"learning_rate": 9.849700686483016e-06,
|
|
"loss": 0.5039,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 51170706.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.39067840576171875,
|
|
"epoch": 0.7717391304347826,
|
|
"grad_norm": 37.870616432973094,
|
|
"learning_rate": 9.842318849896679e-06,
|
|
"loss": 0.4897,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 51906750.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.3930816650390625,
|
|
"epoch": 0.782608695652174,
|
|
"grad_norm": 34.87448568873165,
|
|
"learning_rate": 9.834762957208293e-06,
|
|
"loss": 0.4491,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 52624885.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.3891143798828125,
|
|
"epoch": 0.7934782608695652,
|
|
"grad_norm": 33.876566943004526,
|
|
"learning_rate": 9.827033280012783e-06,
|
|
"loss": 0.4113,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 53355337.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.3836669921875,
|
|
"epoch": 0.8043478260869565,
|
|
"grad_norm": 30.968290270524257,
|
|
"learning_rate": 9.819130096151718e-06,
|
|
"loss": 0.4121,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 54085628.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.37982177734375,
|
|
"epoch": 0.8152173913043478,
|
|
"grad_norm": 28.551813432489794,
|
|
"learning_rate": 9.811053689703333e-06,
|
|
"loss": 0.344,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 54831609.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.3806304931640625,
|
|
"epoch": 0.8260869565217391,
|
|
"grad_norm": 27.351293029913823,
|
|
"learning_rate": 9.802804350972308e-06,
|
|
"loss": 0.3856,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 55591360.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.39495849609375,
|
|
"epoch": 0.8369565217391305,
|
|
"grad_norm": 27.96968705797433,
|
|
"learning_rate": 9.794382376479334e-06,
|
|
"loss": 0.3901,
|
|
"mean_token_accuracy": 0.8880208400078118,
|
|
"num_tokens": 56315901.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.407562255859375,
|
|
"epoch": 0.8478260869565217,
|
|
"grad_norm": 22.195809606606115,
|
|
"learning_rate": 9.785788068950463e-06,
|
|
"loss": 0.3129,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 57028542.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.4049224853515625,
|
|
"epoch": 0.8586956521739131,
|
|
"grad_norm": 20.691379692081384,
|
|
"learning_rate": 9.777021737306214e-06,
|
|
"loss": 0.3198,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 57754799.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.38623046875,
|
|
"epoch": 0.8695652173913043,
|
|
"grad_norm": 17.70222195746143,
|
|
"learning_rate": 9.768083696650481e-06,
|
|
"loss": 0.2826,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 58510466.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.404876708984375,
|
|
"epoch": 0.8804347826086957,
|
|
"grad_norm": 15.865923298269012,
|
|
"learning_rate": 9.7589742682592e-06,
|
|
"loss": 0.2596,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 59242528.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.3817596435546875,
|
|
"epoch": 0.8913043478260869,
|
|
"grad_norm": 14.378682257322561,
|
|
"learning_rate": 9.749693779568799e-06,
|
|
"loss": 0.2773,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 59962928.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.38201904296875,
|
|
"epoch": 0.9021739130434783,
|
|
"grad_norm": 19.516621900864383,
|
|
"learning_rate": 9.740242564164433e-06,
|
|
"loss": 0.3002,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 60703378.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.401397705078125,
|
|
"epoch": 0.9130434782608695,
|
|
"grad_norm": 17.30058721989812,
|
|
"learning_rate": 9.730620961767996e-06,
|
|
"loss": 0.2655,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 61442404.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.3871307373046875,
|
|
"epoch": 0.9239130434782609,
|
|
"grad_norm": 16.400403369174175,
|
|
"learning_rate": 9.720829318225897e-06,
|
|
"loss": 0.2453,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 62169057.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.3823089599609375,
|
|
"epoch": 0.9347826086956522,
|
|
"grad_norm": 9.792279028320644,
|
|
"learning_rate": 9.710867985496644e-06,
|
|
"loss": 0.2507,
|
|
"mean_token_accuracy": 0.8932291730307043,
|
|
"num_tokens": 62927387.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.4188385009765625,
|
|
"epoch": 0.9456521739130435,
|
|
"grad_norm": 13.109216552306195,
|
|
"learning_rate": 9.700737321638185e-06,
|
|
"loss": 0.2564,
|
|
"mean_token_accuracy": 0.890625006519258,
|
|
"num_tokens": 63647298.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.3993682861328125,
|
|
"epoch": 0.9565217391304348,
|
|
"grad_norm": 7.97485349566985,
|
|
"learning_rate": 9.690437690795038e-06,
|
|
"loss": 0.2339,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 64387267.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.4092254638671875,
|
|
"epoch": 0.967391304347826,
|
|
"grad_norm": 9.535127876476075,
|
|
"learning_rate": 9.6799694631852e-06,
|
|
"loss": 0.2246,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 65097138.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.39093017578125,
|
|
"epoch": 0.9782608695652174,
|
|
"grad_norm": 9.140928485660337,
|
|
"learning_rate": 9.669333015086847e-06,
|
|
"loss": 0.2337,
|
|
"mean_token_accuracy": 0.8958333395421505,
|
|
"num_tokens": 65843379.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.3945465087890625,
|
|
"epoch": 0.9891304347826086,
|
|
"grad_norm": 4.929134630182514,
|
|
"learning_rate": 9.658528728824799e-06,
|
|
"loss": 0.223,
|
|
"mean_token_accuracy": 0.8958333395421505,
|
|
"num_tokens": 66590566.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.4042205810546875,
|
|
"epoch": 1.0,
|
|
"grad_norm": 5.77765239310204,
|
|
"learning_rate": 9.647556992756789e-06,
|
|
"loss": 0.2142,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 67315445.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.4054107666015625,
|
|
"epoch": 1.0108695652173914,
|
|
"grad_norm": 4.857210309614276,
|
|
"learning_rate": 9.63641820125949e-06,
|
|
"loss": 0.2067,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 68031682.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.399932861328125,
|
|
"epoch": 1.0217391304347827,
|
|
"grad_norm": 5.240216929774036,
|
|
"learning_rate": 9.62511275471435e-06,
|
|
"loss": 0.1983,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 68751806.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.3963165283203125,
|
|
"epoch": 1.0326086956521738,
|
|
"grad_norm": 4.054301637090316,
|
|
"learning_rate": 9.613641059493197e-06,
|
|
"loss": 0.1974,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 69477676.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.4044189453125,
|
|
"epoch": 1.0434782608695652,
|
|
"grad_norm": 7.145114180373915,
|
|
"learning_rate": 9.602003527943629e-06,
|
|
"loss": 0.1832,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 70197694.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.408233642578125,
|
|
"epoch": 1.0543478260869565,
|
|
"grad_norm": 5.430331027918885,
|
|
"learning_rate": 9.590200578374198e-06,
|
|
"loss": 0.1772,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 70945779.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.4096832275390625,
|
|
"epoch": 1.065217391304348,
|
|
"grad_norm": 4.270232507634772,
|
|
"learning_rate": 9.578232635039368e-06,
|
|
"loss": 0.1775,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 71664028.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.421051025390625,
|
|
"epoch": 1.0760869565217392,
|
|
"grad_norm": 3.9695370856932826,
|
|
"learning_rate": 9.56610012812427e-06,
|
|
"loss": 0.1678,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 72408794.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.4118804931640625,
|
|
"epoch": 1.0869565217391304,
|
|
"grad_norm": 17.29045184129257,
|
|
"learning_rate": 9.553803493729237e-06,
|
|
"loss": 0.284,
|
|
"mean_token_accuracy": 0.8489583423361182,
|
|
"num_tokens": 73141726.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.3996429443359375,
|
|
"epoch": 1.0978260869565217,
|
|
"grad_norm": 10.629649374093658,
|
|
"learning_rate": 9.541343173854128e-06,
|
|
"loss": 0.2217,
|
|
"mean_token_accuracy": 0.8671875079162419,
|
|
"num_tokens": 73897605.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.409454345703125,
|
|
"epoch": 1.108695652173913,
|
|
"grad_norm": 15.72927603948126,
|
|
"learning_rate": 9.528719616382443e-06,
|
|
"loss": 0.25,
|
|
"mean_token_accuracy": 0.8776041739620268,
|
|
"num_tokens": 74642203.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.41534423828125,
|
|
"epoch": 1.1195652173913044,
|
|
"grad_norm": 10.042919208808142,
|
|
"learning_rate": 9.515933275065218e-06,
|
|
"loss": 0.1974,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 75394309.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.4244537353515625,
|
|
"epoch": 1.1304347826086956,
|
|
"grad_norm": 2.176286292791564,
|
|
"learning_rate": 9.502984609504724e-06,
|
|
"loss": 0.2024,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 76151322.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.415985107421875,
|
|
"epoch": 1.141304347826087,
|
|
"grad_norm": 4.280092657053722,
|
|
"learning_rate": 9.48987408513794e-06,
|
|
"loss": 0.1836,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 76879458.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.420654296875,
|
|
"epoch": 1.1521739130434783,
|
|
"grad_norm": 3.525336295473361,
|
|
"learning_rate": 9.476602173219822e-06,
|
|
"loss": 0.1736,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 77591958.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.424224853515625,
|
|
"epoch": 1.1630434782608696,
|
|
"grad_norm": 1.9021196035214551,
|
|
"learning_rate": 9.463169350806369e-06,
|
|
"loss": 0.1594,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 78337677.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.41119384765625,
|
|
"epoch": 1.1739130434782608,
|
|
"grad_norm": 1.8119460568467807,
|
|
"learning_rate": 9.449576100737474e-06,
|
|
"loss": 0.1463,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 79100370.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.4214324951171875,
|
|
"epoch": 1.184782608695652,
|
|
"grad_norm": 2.8374372935351366,
|
|
"learning_rate": 9.435822911619564e-06,
|
|
"loss": 0.1466,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 79841371.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.4080810546875,
|
|
"epoch": 1.1956521739130435,
|
|
"grad_norm": 3.7043981274926305,
|
|
"learning_rate": 9.421910277808044e-06,
|
|
"loss": 0.1787,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 80592159.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.411712646484375,
|
|
"epoch": 1.2065217391304348,
|
|
"grad_norm": 3.3918465126188138,
|
|
"learning_rate": 9.407838699389525e-06,
|
|
"loss": 0.166,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 81336000.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 0.4184722900390625,
|
|
"epoch": 1.2173913043478262,
|
|
"grad_norm": 2.5386167667201716,
|
|
"learning_rate": 9.39360868216384e-06,
|
|
"loss": 0.1836,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 82082498.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 0.440460205078125,
|
|
"epoch": 1.2282608695652173,
|
|
"grad_norm": 10.088789989811675,
|
|
"learning_rate": 9.379220737625877e-06,
|
|
"loss": 0.198,
|
|
"mean_token_accuracy": 0.8619791748933494,
|
|
"num_tokens": 82811025.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 0.4620513916015625,
|
|
"epoch": 1.2391304347826086,
|
|
"grad_norm": 8.542279297131682,
|
|
"learning_rate": 9.364675382947185e-06,
|
|
"loss": 0.213,
|
|
"mean_token_accuracy": 0.8619791748933494,
|
|
"num_tokens": 83546094.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 0.4711151123046875,
|
|
"epoch": 1.25,
|
|
"grad_norm": 3.6285525129452836,
|
|
"learning_rate": 9.349973140957392e-06,
|
|
"loss": 0.2057,
|
|
"mean_token_accuracy": 0.8750000074505806,
|
|
"num_tokens": 84280315.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 0.489990234375,
|
|
"epoch": 1.2608695652173914,
|
|
"grad_norm": 1.6769267236014311,
|
|
"learning_rate": 9.335114540125393e-06,
|
|
"loss": 0.2081,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 85016816.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 0.4967041015625,
|
|
"epoch": 1.2717391304347827,
|
|
"grad_norm": 3.7889439259474567,
|
|
"learning_rate": 9.320100114540382e-06,
|
|
"loss": 0.2001,
|
|
"mean_token_accuracy": 0.8984375060535967,
|
|
"num_tokens": 85731976.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 0.4990386962890625,
|
|
"epoch": 1.2826086956521738,
|
|
"grad_norm": 1.6118784226504503,
|
|
"learning_rate": 9.304930403892633e-06,
|
|
"loss": 0.1784,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 86483313.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 0.493865966796875,
|
|
"epoch": 1.2934782608695652,
|
|
"grad_norm": 1.8947761283891513,
|
|
"learning_rate": 9.289605953454108e-06,
|
|
"loss": 0.1972,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 87207481.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 0.50048828125,
|
|
"epoch": 1.3043478260869565,
|
|
"grad_norm": 5.003093687217722,
|
|
"learning_rate": 9.274127314058857e-06,
|
|
"loss": 0.1919,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 87948741.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.507080078125,
|
|
"epoch": 1.315217391304348,
|
|
"grad_norm": 3.712545493228915,
|
|
"learning_rate": 9.258495042083222e-06,
|
|
"loss": 0.1873,
|
|
"mean_token_accuracy": 0.8984375060535967,
|
|
"num_tokens": 88699059.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 0.5162506103515625,
|
|
"epoch": 1.3260869565217392,
|
|
"grad_norm": 6.409166648683115,
|
|
"learning_rate": 9.242709699425833e-06,
|
|
"loss": 0.2108,
|
|
"mean_token_accuracy": 0.8958333395421505,
|
|
"num_tokens": 89411784.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 0.52899169921875,
|
|
"epoch": 1.3369565217391304,
|
|
"grad_norm": 4.833778906865952,
|
|
"learning_rate": 9.226771853487411e-06,
|
|
"loss": 0.2008,
|
|
"mean_token_accuracy": 0.8880208400078118,
|
|
"num_tokens": 90116299.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 0.53887939453125,
|
|
"epoch": 1.3478260869565217,
|
|
"grad_norm": 1.7021028270137335,
|
|
"learning_rate": 9.210682077150375e-06,
|
|
"loss": 0.1857,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 90847082.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 0.549591064453125,
|
|
"epoch": 1.358695652173913,
|
|
"grad_norm": 4.963878722743053,
|
|
"learning_rate": 9.19444094875825e-06,
|
|
"loss": 0.1893,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 91575210.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 0.51055908203125,
|
|
"epoch": 1.3695652173913042,
|
|
"grad_norm": 2.0086235135787263,
|
|
"learning_rate": 9.178049052094881e-06,
|
|
"loss": 0.1672,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 92312235.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 0.485198974609375,
|
|
"epoch": 1.3804347826086958,
|
|
"grad_norm": 6.826339807432627,
|
|
"learning_rate": 9.161506976363438e-06,
|
|
"loss": 0.1678,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 93063809.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 0.470733642578125,
|
|
"epoch": 1.391304347826087,
|
|
"grad_norm": 7.95081315348721,
|
|
"learning_rate": 9.144815316165251e-06,
|
|
"loss": 0.1848,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 93771837.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 0.452362060546875,
|
|
"epoch": 1.4021739130434783,
|
|
"grad_norm": 1.8105762849680864,
|
|
"learning_rate": 9.127974671478432e-06,
|
|
"loss": 0.1694,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 94501830.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 0.4649658203125,
|
|
"epoch": 1.4130434782608696,
|
|
"grad_norm": 11.50576473375785,
|
|
"learning_rate": 9.110985647636303e-06,
|
|
"loss": 0.2392,
|
|
"mean_token_accuracy": 0.8880208400078118,
|
|
"num_tokens": 95243219.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.471923828125,
|
|
"epoch": 1.4239130434782608,
|
|
"grad_norm": 9.578726003089912,
|
|
"learning_rate": 9.09384885530565e-06,
|
|
"loss": 0.2086,
|
|
"mean_token_accuracy": 0.8880208400078118,
|
|
"num_tokens": 95991533.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 0.471954345703125,
|
|
"epoch": 1.434782608695652,
|
|
"grad_norm": 2.6118919501859477,
|
|
"learning_rate": 9.076564910464753e-06,
|
|
"loss": 0.161,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 96734722.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"entropy": 0.4808197021484375,
|
|
"epoch": 1.4456521739130435,
|
|
"grad_norm": 8.297350921612512,
|
|
"learning_rate": 9.059134434381274e-06,
|
|
"loss": 0.2269,
|
|
"mean_token_accuracy": 0.8802083404734731,
|
|
"num_tokens": 97452664.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"entropy": 0.472747802734375,
|
|
"epoch": 1.4565217391304348,
|
|
"grad_norm": 8.968811717529865,
|
|
"learning_rate": 9.041558053589894e-06,
|
|
"loss": 0.2247,
|
|
"mean_token_accuracy": 0.890625006519258,
|
|
"num_tokens": 98168742.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"entropy": 0.48162841796875,
|
|
"epoch": 1.4673913043478262,
|
|
"grad_norm": 8.180768911421971,
|
|
"learning_rate": 9.023836399869814e-06,
|
|
"loss": 0.2334,
|
|
"mean_token_accuracy": 0.8828125069849193,
|
|
"num_tokens": 98889361.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 0.495330810546875,
|
|
"epoch": 1.4782608695652173,
|
|
"grad_norm": 5.34659130045578,
|
|
"learning_rate": 9.00597011022204e-06,
|
|
"loss": 0.186,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 99614835.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"entropy": 0.51507568359375,
|
|
"epoch": 1.4891304347826086,
|
|
"grad_norm": 3.7240151526908787,
|
|
"learning_rate": 8.987959826846479e-06,
|
|
"loss": 0.1788,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 100323099.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"entropy": 0.5226287841796875,
|
|
"epoch": 1.5,
|
|
"grad_norm": 4.539517974586188,
|
|
"learning_rate": 8.96980619711887e-06,
|
|
"loss": 0.1861,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 101051404.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"entropy": 0.5224456787109375,
|
|
"epoch": 1.5108695652173914,
|
|
"grad_norm": 4.274006110895741,
|
|
"learning_rate": 8.951509873567498e-06,
|
|
"loss": 0.1876,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 101758696.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"entropy": 0.5308685302734375,
|
|
"epoch": 1.5217391304347827,
|
|
"grad_norm": 2.6002897584716926,
|
|
"learning_rate": 8.93307151384975e-06,
|
|
"loss": 0.1922,
|
|
"mean_token_accuracy": 0.890625006519258,
|
|
"num_tokens": 102492263.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.522216796875,
|
|
"epoch": 1.5326086956521738,
|
|
"grad_norm": 1.9710418581447129,
|
|
"learning_rate": 8.914491780728471e-06,
|
|
"loss": 0.1841,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 103222679.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"entropy": 0.534393310546875,
|
|
"epoch": 1.5434782608695652,
|
|
"grad_norm": 2.7269029924493116,
|
|
"learning_rate": 8.895771342048145e-06,
|
|
"loss": 0.1756,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 103942719.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"entropy": 0.5534515380859375,
|
|
"epoch": 1.5543478260869565,
|
|
"grad_norm": 1.66164632557191,
|
|
"learning_rate": 8.876910870710885e-06,
|
|
"loss": 0.1655,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 104664485.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"entropy": 0.54241943359375,
|
|
"epoch": 1.5652173913043477,
|
|
"grad_norm": 1.6078483336178693,
|
|
"learning_rate": 8.857911044652244e-06,
|
|
"loss": 0.1524,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 105388628.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"entropy": 0.5342254638671875,
|
|
"epoch": 1.5760869565217392,
|
|
"grad_norm": 0.8442636098424123,
|
|
"learning_rate": 8.838772546816857e-06,
|
|
"loss": 0.1666,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 106131226.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 0.5332489013671875,
|
|
"epoch": 1.5869565217391304,
|
|
"grad_norm": 1.2684095234762156,
|
|
"learning_rate": 8.819496065133879e-06,
|
|
"loss": 0.1576,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 106879153.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"entropy": 0.5341796875,
|
|
"epoch": 1.5978260869565217,
|
|
"grad_norm": 1.0436852092197715,
|
|
"learning_rate": 8.800082292492274e-06,
|
|
"loss": 0.1806,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 107633473.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"entropy": 0.5566558837890625,
|
|
"epoch": 1.608695652173913,
|
|
"grad_norm": 6.134709463777617,
|
|
"learning_rate": 8.780531926715888e-06,
|
|
"loss": 0.1921,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 108351755.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"entropy": 0.5587158203125,
|
|
"epoch": 1.6195652173913042,
|
|
"grad_norm": 3.5112432848045243,
|
|
"learning_rate": 8.760845670538387e-06,
|
|
"loss": 0.1624,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 109083015.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"entropy": 0.5608673095703125,
|
|
"epoch": 1.6304347826086958,
|
|
"grad_norm": 0.7326893619330642,
|
|
"learning_rate": 8.741024231577983e-06,
|
|
"loss": 0.163,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 109815647.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.5925140380859375,
|
|
"epoch": 1.641304347826087,
|
|
"grad_norm": 5.584570276458169,
|
|
"learning_rate": 8.721068322312007e-06,
|
|
"loss": 0.1959,
|
|
"mean_token_accuracy": 0.8958333395421505,
|
|
"num_tokens": 110528968.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"entropy": 0.5667266845703125,
|
|
"epoch": 1.6521739130434783,
|
|
"grad_norm": 6.838200374579168,
|
|
"learning_rate": 8.700978660051293e-06,
|
|
"loss": 0.1903,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 111248060.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"entropy": 0.56097412109375,
|
|
"epoch": 1.6630434782608696,
|
|
"grad_norm": 0.6972842221810776,
|
|
"learning_rate": 8.6807559669144e-06,
|
|
"loss": 0.17,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 111967132.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"entropy": 0.5479583740234375,
|
|
"epoch": 1.6739130434782608,
|
|
"grad_norm": 3.4383678459213978,
|
|
"learning_rate": 8.660400969801653e-06,
|
|
"loss": 0.1764,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 112716217.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"entropy": 0.5639801025390625,
|
|
"epoch": 1.6847826086956523,
|
|
"grad_norm": 3.1852463610484794,
|
|
"learning_rate": 8.63991440036901e-06,
|
|
"loss": 0.2071,
|
|
"mean_token_accuracy": 0.8880208400078118,
|
|
"num_tokens": 113431374.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 0.53643798828125,
|
|
"epoch": 1.6956521739130435,
|
|
"grad_norm": 3.6175794497486238,
|
|
"learning_rate": 8.619296995001773e-06,
|
|
"loss": 0.1696,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 114167816.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"entropy": 0.54644775390625,
|
|
"epoch": 1.7065217391304348,
|
|
"grad_norm": 1.3865803355664044,
|
|
"learning_rate": 8.598549494788111e-06,
|
|
"loss": 0.186,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 114903413.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"entropy": 0.548004150390625,
|
|
"epoch": 1.7173913043478262,
|
|
"grad_norm": 2.653688275138002,
|
|
"learning_rate": 8.577672645492426e-06,
|
|
"loss": 0.165,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 115623366.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"entropy": 0.5385894775390625,
|
|
"epoch": 1.7282608695652173,
|
|
"grad_norm": 0.7822811850260336,
|
|
"learning_rate": 8.556667197528543e-06,
|
|
"loss": 0.17,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 116362166.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"entropy": 0.536895751953125,
|
|
"epoch": 1.7391304347826086,
|
|
"grad_norm": 1.7739805888910092,
|
|
"learning_rate": 8.535533905932739e-06,
|
|
"loss": 0.1561,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 117112454.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.53466796875,
|
|
"epoch": 1.75,
|
|
"grad_norm": 2.101867707489836,
|
|
"learning_rate": 8.5142735303366e-06,
|
|
"loss": 0.1541,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 117847256.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"entropy": 0.533599853515625,
|
|
"epoch": 1.7608695652173914,
|
|
"grad_norm": 0.9328284759331247,
|
|
"learning_rate": 8.492886834939722e-06,
|
|
"loss": 0.1535,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 118584448.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"entropy": 0.533416748046875,
|
|
"epoch": 1.7717391304347827,
|
|
"grad_norm": 2.4281166916544574,
|
|
"learning_rate": 8.47137458848224e-06,
|
|
"loss": 0.1666,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 119322567.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"entropy": 0.50384521484375,
|
|
"epoch": 1.7826086956521738,
|
|
"grad_norm": 3.1494649384809628,
|
|
"learning_rate": 8.44973756421719e-06,
|
|
"loss": 0.1728,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 120088916.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"entropy": 0.5146026611328125,
|
|
"epoch": 1.7934782608695652,
|
|
"grad_norm": 2.585527261367522,
|
|
"learning_rate": 8.427976539882725e-06,
|
|
"loss": 0.1841,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 120837637.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 0.526214599609375,
|
|
"epoch": 1.8043478260869565,
|
|
"grad_norm": 1.9511400388042828,
|
|
"learning_rate": 8.406092297674146e-06,
|
|
"loss": 0.1855,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 121570699.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"entropy": 0.5091094970703125,
|
|
"epoch": 1.8152173913043477,
|
|
"grad_norm": 3.5895588960689797,
|
|
"learning_rate": 8.384085624215801e-06,
|
|
"loss": 0.185,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 122325290.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"entropy": 0.5145416259765625,
|
|
"epoch": 1.8260869565217392,
|
|
"grad_norm": 0.6005583022342595,
|
|
"learning_rate": 8.3619573105328e-06,
|
|
"loss": 0.151,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 123055627.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"entropy": 0.517425537109375,
|
|
"epoch": 1.8369565217391304,
|
|
"grad_norm": 1.1304923014978465,
|
|
"learning_rate": 8.339708152022586e-06,
|
|
"loss": 0.1263,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 123803364.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"entropy": 0.5149078369140625,
|
|
"epoch": 1.8478260869565217,
|
|
"grad_norm": 1.2929179934538764,
|
|
"learning_rate": 8.317338948426338e-06,
|
|
"loss": 0.1754,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 124551899.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.5270538330078125,
|
|
"epoch": 1.858695652173913,
|
|
"grad_norm": 4.205343650835889,
|
|
"learning_rate": 8.294850503800237e-06,
|
|
"loss": 0.1863,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 125265394.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"entropy": 0.50408935546875,
|
|
"epoch": 1.8695652173913042,
|
|
"grad_norm": 0.7021881484160751,
|
|
"learning_rate": 8.272243626486553e-06,
|
|
"loss": 0.162,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 126017217.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"entropy": 0.507598876953125,
|
|
"epoch": 1.8804347826086958,
|
|
"grad_norm": 5.287780971096949,
|
|
"learning_rate": 8.24951912908459e-06,
|
|
"loss": 0.1671,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 126721530.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"entropy": 0.4849395751953125,
|
|
"epoch": 1.891304347826087,
|
|
"grad_norm": 6.0933045006759645,
|
|
"learning_rate": 8.22667782842149e-06,
|
|
"loss": 0.1826,
|
|
"mean_token_accuracy": 0.8880208400078118,
|
|
"num_tokens": 127455753.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"entropy": 0.494964599609375,
|
|
"epoch": 1.9021739130434783,
|
|
"grad_norm": 3.326184049417288,
|
|
"learning_rate": 8.203720545522852e-06,
|
|
"loss": 0.2036,
|
|
"mean_token_accuracy": 0.8984375060535967,
|
|
"num_tokens": 128186194.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 0.4926910400390625,
|
|
"epoch": 1.9130434782608696,
|
|
"grad_norm": 3.237403541900177,
|
|
"learning_rate": 8.18064810558324e-06,
|
|
"loss": 0.163,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 128916881.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"entropy": 0.4944610595703125,
|
|
"epoch": 1.9239130434782608,
|
|
"grad_norm": 4.22187747727222,
|
|
"learning_rate": 8.157461337936506e-06,
|
|
"loss": 0.157,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 129643214.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"entropy": 0.490142822265625,
|
|
"epoch": 1.9347826086956523,
|
|
"grad_norm": 1.2461685125932693,
|
|
"learning_rate": 8.134161076025992e-06,
|
|
"loss": 0.1553,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 130388861.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"entropy": 0.4827728271484375,
|
|
"epoch": 1.9456521739130435,
|
|
"grad_norm": 3.337799556349266,
|
|
"learning_rate": 8.110748157374566e-06,
|
|
"loss": 0.1524,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 131127069.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"entropy": 0.50323486328125,
|
|
"epoch": 1.9565217391304348,
|
|
"grad_norm": 6.704083628456024,
|
|
"learning_rate": 8.087223423554513e-06,
|
|
"loss": 0.224,
|
|
"mean_token_accuracy": 0.890625006519258,
|
|
"num_tokens": 131832947.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.4977569580078125,
|
|
"epoch": 1.9673913043478262,
|
|
"grad_norm": 4.739180816273611,
|
|
"learning_rate": 8.063587720157298e-06,
|
|
"loss": 0.1916,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 132554681.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"entropy": 0.5055694580078125,
|
|
"epoch": 1.9782608695652173,
|
|
"grad_norm": 3.527467220261128,
|
|
"learning_rate": 8.039841896763157e-06,
|
|
"loss": 0.1989,
|
|
"mean_token_accuracy": 0.8958333395421505,
|
|
"num_tokens": 133277156.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"entropy": 0.5235443115234375,
|
|
"epoch": 1.9891304347826086,
|
|
"grad_norm": 2.9914772490261488,
|
|
"learning_rate": 8.01598680691057e-06,
|
|
"loss": 0.1742,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 134024358.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"entropy": 0.531524658203125,
|
|
"epoch": 2.0,
|
|
"grad_norm": 5.194618648022684,
|
|
"learning_rate": 7.99202330806557e-06,
|
|
"loss": 0.1828,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 134767516.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"entropy": 0.546051025390625,
|
|
"epoch": 2.010869565217391,
|
|
"grad_norm": 5.5457492209724295,
|
|
"learning_rate": 7.967952261590936e-06,
|
|
"loss": 0.183,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 135493659.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 0.548583984375,
|
|
"epoch": 2.0217391304347827,
|
|
"grad_norm": 3.641106844026026,
|
|
"learning_rate": 7.943774532715215e-06,
|
|
"loss": 0.1528,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 136213244.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"entropy": 0.5458221435546875,
|
|
"epoch": 2.032608695652174,
|
|
"grad_norm": 1.118188823587022,
|
|
"learning_rate": 7.919490990501636e-06,
|
|
"loss": 0.155,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 136955915.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"entropy": 0.5222015380859375,
|
|
"epoch": 2.0434782608695654,
|
|
"grad_norm": 2.7699906551383804,
|
|
"learning_rate": 7.895102507816866e-06,
|
|
"loss": 0.1874,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 137688426.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"entropy": 0.517578125,
|
|
"epoch": 2.0543478260869565,
|
|
"grad_norm": 4.895036542301294,
|
|
"learning_rate": 7.870609961299627e-06,
|
|
"loss": 0.1581,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 138409916.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"entropy": 0.515869140625,
|
|
"epoch": 2.0652173913043477,
|
|
"grad_norm": 3.7445462521408293,
|
|
"learning_rate": 7.8460142313292e-06,
|
|
"loss": 0.1754,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 139134932.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.518218994140625,
|
|
"epoch": 2.0760869565217392,
|
|
"grad_norm": 2.7788989684966174,
|
|
"learning_rate": 7.821316201993768e-06,
|
|
"loss": 0.15,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 139839540.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"entropy": 0.5122528076171875,
|
|
"epoch": 2.0869565217391304,
|
|
"grad_norm": 0.6530392998319222,
|
|
"learning_rate": 7.796516761058649e-06,
|
|
"loss": 0.1595,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 140591809.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"entropy": 0.495574951171875,
|
|
"epoch": 2.097826086956522,
|
|
"grad_norm": 2.6546781136836883,
|
|
"learning_rate": 7.771616799934372e-06,
|
|
"loss": 0.1876,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 141330633.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"entropy": 0.5094757080078125,
|
|
"epoch": 2.108695652173913,
|
|
"grad_norm": 2.6438443009151964,
|
|
"learning_rate": 7.746617213644646e-06,
|
|
"loss": 0.1567,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 142047594.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"entropy": 0.4922332763671875,
|
|
"epoch": 2.119565217391304,
|
|
"grad_norm": 2.6532052341807173,
|
|
"learning_rate": 7.721518900794186e-06,
|
|
"loss": 0.1391,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 142790782.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 0.4940185546875,
|
|
"epoch": 2.130434782608696,
|
|
"grad_norm": 1.276179935634721,
|
|
"learning_rate": 7.696322763536408e-06,
|
|
"loss": 0.163,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 143536404.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"entropy": 0.4945526123046875,
|
|
"epoch": 2.141304347826087,
|
|
"grad_norm": 2.0119946644457554,
|
|
"learning_rate": 7.67102970754101e-06,
|
|
"loss": 0.1554,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 144266027.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"entropy": 0.4831085205078125,
|
|
"epoch": 2.1521739130434785,
|
|
"grad_norm": 0.6621962707718312,
|
|
"learning_rate": 7.645640641961407e-06,
|
|
"loss": 0.1368,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 145021594.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"entropy": 0.4850311279296875,
|
|
"epoch": 2.1630434782608696,
|
|
"grad_norm": 0.8570157470009787,
|
|
"learning_rate": 7.620156479402066e-06,
|
|
"loss": 0.1328,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 145738624.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"entropy": 0.4987640380859375,
|
|
"epoch": 2.1739130434782608,
|
|
"grad_norm": 4.3782220025551615,
|
|
"learning_rate": 7.594578135885684e-06,
|
|
"loss": 0.1889,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 146462473.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.4835052490234375,
|
|
"epoch": 2.1847826086956523,
|
|
"grad_norm": 5.944807995176715,
|
|
"learning_rate": 7.568906530820281e-06,
|
|
"loss": 0.1802,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 147187579.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"entropy": 0.4782562255859375,
|
|
"epoch": 2.1956521739130435,
|
|
"grad_norm": 2.2566778910394256,
|
|
"learning_rate": 7.543142586966139e-06,
|
|
"loss": 0.1589,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 147916518.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"entropy": 0.4561614990234375,
|
|
"epoch": 2.2065217391304346,
|
|
"grad_norm": 0.794342992945764,
|
|
"learning_rate": 7.517287230402639e-06,
|
|
"loss": 0.1729,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 148644862.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"entropy": 0.4606170654296875,
|
|
"epoch": 2.217391304347826,
|
|
"grad_norm": 5.350983414719361,
|
|
"learning_rate": 7.491341390494971e-06,
|
|
"loss": 0.1636,
|
|
"mean_token_accuracy": 0.901041672565043,
|
|
"num_tokens": 149341540.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"entropy": 0.4589080810546875,
|
|
"epoch": 2.2282608695652173,
|
|
"grad_norm": 6.355166885946308,
|
|
"learning_rate": 7.465305999860728e-06,
|
|
"loss": 0.2003,
|
|
"mean_token_accuracy": 0.8932291730307043,
|
|
"num_tokens": 150049563.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 0.460845947265625,
|
|
"epoch": 2.239130434782609,
|
|
"grad_norm": 3.840950671995982,
|
|
"learning_rate": 7.439181994336389e-06,
|
|
"loss": 0.1792,
|
|
"mean_token_accuracy": 0.8984375060535967,
|
|
"num_tokens": 150761751.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"entropy": 0.45330810546875,
|
|
"epoch": 2.25,
|
|
"grad_norm": 1.2688825517392195,
|
|
"learning_rate": 7.412970312943672e-06,
|
|
"loss": 0.1403,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 151509386.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"entropy": 0.4748992919921875,
|
|
"epoch": 2.260869565217391,
|
|
"grad_norm": 1.2529155699739036,
|
|
"learning_rate": 7.386671897855786e-06,
|
|
"loss": 0.1431,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 152232093.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"entropy": 0.4568023681640625,
|
|
"epoch": 2.2717391304347827,
|
|
"grad_norm": 1.2060669741264025,
|
|
"learning_rate": 7.360287694363566e-06,
|
|
"loss": 0.1784,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 152973716.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"entropy": 0.4611968994140625,
|
|
"epoch": 2.282608695652174,
|
|
"grad_norm": 3.1096597044394776,
|
|
"learning_rate": 7.333818650841489e-06,
|
|
"loss": 0.1412,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 153685710.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.48712158203125,
|
|
"epoch": 2.2934782608695654,
|
|
"grad_norm": 0.998414516080045,
|
|
"learning_rate": 7.3072657187135895e-06,
|
|
"loss": 0.1305,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 154392422.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"entropy": 0.4686279296875,
|
|
"epoch": 2.3043478260869565,
|
|
"grad_norm": 1.0574009599691976,
|
|
"learning_rate": 7.280629852419263e-06,
|
|
"loss": 0.1182,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 155133112.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"entropy": 0.4687652587890625,
|
|
"epoch": 2.3152173913043477,
|
|
"grad_norm": 1.5434590340882235,
|
|
"learning_rate": 7.253912009378953e-06,
|
|
"loss": 0.1277,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 155870419.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"entropy": 0.46307373046875,
|
|
"epoch": 2.3260869565217392,
|
|
"grad_norm": 2.4974463200242663,
|
|
"learning_rate": 7.227113149959738e-06,
|
|
"loss": 0.1418,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 156632182.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"entropy": 0.4817657470703125,
|
|
"epoch": 2.3369565217391304,
|
|
"grad_norm": 1.1720076114545368,
|
|
"learning_rate": 7.200234237440815e-06,
|
|
"loss": 0.1434,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 157367132.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 0.5074920654296875,
|
|
"epoch": 2.3478260869565215,
|
|
"grad_norm": 2.3529053622735763,
|
|
"learning_rate": 7.173276237978872e-06,
|
|
"loss": 0.1396,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 158082511.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"entropy": 0.5045166015625,
|
|
"epoch": 2.358695652173913,
|
|
"grad_norm": 1.1876401406675843,
|
|
"learning_rate": 7.146240120573358e-06,
|
|
"loss": 0.1596,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 158836791.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"entropy": 0.5250396728515625,
|
|
"epoch": 2.369565217391304,
|
|
"grad_norm": 5.319868227670781,
|
|
"learning_rate": 7.1191268570316575e-06,
|
|
"loss": 0.1433,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 159572495.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"entropy": 0.5190887451171875,
|
|
"epoch": 2.380434782608696,
|
|
"grad_norm": 5.411052996724338,
|
|
"learning_rate": 7.091937421934158e-06,
|
|
"loss": 0.1652,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 160312284.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"entropy": 0.5269622802734375,
|
|
"epoch": 2.391304347826087,
|
|
"grad_norm": 1.4445703655873523,
|
|
"learning_rate": 7.064672792599208e-06,
|
|
"loss": 0.1559,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 161043235.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.5228729248046875,
|
|
"epoch": 2.4021739130434785,
|
|
"grad_norm": 2.3829973031255713,
|
|
"learning_rate": 7.037333949048005e-06,
|
|
"loss": 0.1501,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 161752920.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"entropy": 0.5212249755859375,
|
|
"epoch": 2.4130434782608696,
|
|
"grad_norm": 1.1343541485405513,
|
|
"learning_rate": 7.009921873969359e-06,
|
|
"loss": 0.1378,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 162507939.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"entropy": 0.520965576171875,
|
|
"epoch": 2.4239130434782608,
|
|
"grad_norm": 0.831032023339617,
|
|
"learning_rate": 6.9824375526843705e-06,
|
|
"loss": 0.1281,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 163219122.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"entropy": 0.5089874267578125,
|
|
"epoch": 2.4347826086956523,
|
|
"grad_norm": 3.8034131684837558,
|
|
"learning_rate": 6.954881973111013e-06,
|
|
"loss": 0.1657,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 163962018.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"entropy": 0.528564453125,
|
|
"epoch": 2.4456521739130435,
|
|
"grad_norm": 2.1845844175686495,
|
|
"learning_rate": 6.927256125728624e-06,
|
|
"loss": 0.12,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 164670228.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 0.5126800537109375,
|
|
"epoch": 2.4565217391304346,
|
|
"grad_norm": 3.8712279594237646,
|
|
"learning_rate": 6.8995610035423044e-06,
|
|
"loss": 0.1654,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 165399091.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"entropy": 0.5189971923828125,
|
|
"epoch": 2.467391304347826,
|
|
"grad_norm": 3.26297349877328,
|
|
"learning_rate": 6.871797602047221e-06,
|
|
"loss": 0.1762,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 166114567.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"entropy": 0.500518798828125,
|
|
"epoch": 2.4782608695652173,
|
|
"grad_norm": 1.0627902710963428,
|
|
"learning_rate": 6.843966919192827e-06,
|
|
"loss": 0.1617,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 166864530.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"entropy": 0.5054779052734375,
|
|
"epoch": 2.489130434782609,
|
|
"grad_norm": 3.3587017419864695,
|
|
"learning_rate": 6.816069955346986e-06,
|
|
"loss": 0.1583,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 167594819.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"entropy": 0.515777587890625,
|
|
"epoch": 2.5,
|
|
"grad_norm": 4.988915841711047,
|
|
"learning_rate": 6.788107713260023e-06,
|
|
"loss": 0.1586,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 168322196.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.5275115966796875,
|
|
"epoch": 2.5108695652173916,
|
|
"grad_norm": 1.8907326981464134,
|
|
"learning_rate": 6.760081198028671e-06,
|
|
"loss": 0.122,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 169068640.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"entropy": 0.528472900390625,
|
|
"epoch": 2.5217391304347827,
|
|
"grad_norm": 2.223142070781773,
|
|
"learning_rate": 6.731991417059947e-06,
|
|
"loss": 0.1532,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 169812602.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"entropy": 0.5394134521484375,
|
|
"epoch": 2.532608695652174,
|
|
"grad_norm": 2.509078456447013,
|
|
"learning_rate": 6.703839380034945e-06,
|
|
"loss": 0.1342,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 170573503.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"entropy": 0.542877197265625,
|
|
"epoch": 2.5434782608695654,
|
|
"grad_norm": 2.1739642646057544,
|
|
"learning_rate": 6.675626098872536e-06,
|
|
"loss": 0.1491,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 171308789.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"entropy": 0.552459716796875,
|
|
"epoch": 2.5543478260869565,
|
|
"grad_norm": 0.5785511391669176,
|
|
"learning_rate": 6.647352587693001e-06,
|
|
"loss": 0.1588,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 172036762.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 0.5586700439453125,
|
|
"epoch": 2.5652173913043477,
|
|
"grad_norm": 1.2352613961801158,
|
|
"learning_rate": 6.619019862781571e-06,
|
|
"loss": 0.1616,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 172770310.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"entropy": 0.542572021484375,
|
|
"epoch": 2.5760869565217392,
|
|
"grad_norm": 1.2904070581616358,
|
|
"learning_rate": 6.590628942551909e-06,
|
|
"loss": 0.1478,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 173503232.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"entropy": 0.5292510986328125,
|
|
"epoch": 2.5869565217391304,
|
|
"grad_norm": 0.796470258682214,
|
|
"learning_rate": 6.5621808475094904e-06,
|
|
"loss": 0.1745,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 174249314.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"entropy": 0.5342559814453125,
|
|
"epoch": 2.5978260869565215,
|
|
"grad_norm": 2.936582284327684,
|
|
"learning_rate": 6.533676600214929e-06,
|
|
"loss": 0.135,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 174986430.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"entropy": 0.525421142578125,
|
|
"epoch": 2.608695652173913,
|
|
"grad_norm": 0.9552105594989622,
|
|
"learning_rate": 6.505117225247218e-06,
|
|
"loss": 0.1593,
|
|
"mean_token_accuracy": 0.9062500055879354,
|
|
"num_tokens": 175707045.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.532073974609375,
|
|
"epoch": 2.619565217391304,
|
|
"grad_norm": 1.784451773688197,
|
|
"learning_rate": 6.476503749166903e-06,
|
|
"loss": 0.1371,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 176425496.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"entropy": 0.5127410888671875,
|
|
"epoch": 2.630434782608696,
|
|
"grad_norm": 1.799298324582578,
|
|
"learning_rate": 6.447837200479187e-06,
|
|
"loss": 0.1374,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 177156941.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 0.49639892578125,
|
|
"epoch": 2.641304347826087,
|
|
"grad_norm": 0.7415170847026947,
|
|
"learning_rate": 6.419118609596948e-06,
|
|
"loss": 0.1602,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 177917197.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 0.5039825439453125,
|
|
"epoch": 2.6521739130434785,
|
|
"grad_norm": 0.9990170572769999,
|
|
"learning_rate": 6.390349008803717e-06,
|
|
"loss": 0.1483,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 178653479.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 0.497955322265625,
|
|
"epoch": 2.6630434782608696,
|
|
"grad_norm": 1.6579494628042188,
|
|
"learning_rate": 6.36152943221656e-06,
|
|
"loss": 0.1311,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 179390930.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 0.4999237060546875,
|
|
"epoch": 2.6739130434782608,
|
|
"grad_norm": 1.6421221898848504,
|
|
"learning_rate": 6.332660915748915e-06,
|
|
"loss": 0.1289,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 180127958.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 0.50103759765625,
|
|
"epoch": 2.6847826086956523,
|
|
"grad_norm": 1.481349030513425,
|
|
"learning_rate": 6.303744497073352e-06,
|
|
"loss": 0.1355,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 180864479.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 0.4853515625,
|
|
"epoch": 2.6956521739130435,
|
|
"grad_norm": 2.96890316283001,
|
|
"learning_rate": 6.274781215584277e-06,
|
|
"loss": 0.1168,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 181612806.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 0.482879638671875,
|
|
"epoch": 2.7065217391304346,
|
|
"grad_norm": 2.4263839451945075,
|
|
"learning_rate": 6.245772112360568e-06,
|
|
"loss": 0.1276,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 182357173.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 0.4721527099609375,
|
|
"epoch": 2.717391304347826,
|
|
"grad_norm": 3.72858753698793,
|
|
"learning_rate": 6.216718230128156e-06,
|
|
"loss": 0.1267,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 183084411.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.4805145263671875,
|
|
"epoch": 2.7282608695652173,
|
|
"grad_norm": 3.0150680355573325,
|
|
"learning_rate": 6.187620613222544e-06,
|
|
"loss": 0.1613,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 183808513.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 0.47900390625,
|
|
"epoch": 2.7391304347826084,
|
|
"grad_norm": 1.4526145850715015,
|
|
"learning_rate": 6.158480307551269e-06,
|
|
"loss": 0.1452,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 184542988.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 0.4749755859375,
|
|
"epoch": 2.75,
|
|
"grad_norm": 2.2105893493030266,
|
|
"learning_rate": 6.129298360556304e-06,
|
|
"loss": 0.1194,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 185269527.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 0.482421875,
|
|
"epoch": 2.7608695652173916,
|
|
"grad_norm": 2.1548081199268996,
|
|
"learning_rate": 6.100075821176412e-06,
|
|
"loss": 0.1433,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 186011532.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 0.487548828125,
|
|
"epoch": 2.7717391304347827,
|
|
"grad_norm": 1.764179651210478,
|
|
"learning_rate": 6.070813739809443e-06,
|
|
"loss": 0.1701,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 186754434.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 0.4849853515625,
|
|
"epoch": 2.782608695652174,
|
|
"grad_norm": 2.079298729971693,
|
|
"learning_rate": 6.041513168274568e-06,
|
|
"loss": 0.1651,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 187472103.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 0.503082275390625,
|
|
"epoch": 2.7934782608695654,
|
|
"grad_norm": 1.452865577962535,
|
|
"learning_rate": 6.012175159774488e-06,
|
|
"loss": 0.1362,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 188189773.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 0.5058135986328125,
|
|
"epoch": 2.8043478260869565,
|
|
"grad_norm": 1.1574478631255485,
|
|
"learning_rate": 5.982800768857561e-06,
|
|
"loss": 0.1297,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 188936311.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 0.515655517578125,
|
|
"epoch": 2.8152173913043477,
|
|
"grad_norm": 1.007212330363737,
|
|
"learning_rate": 5.953391051379904e-06,
|
|
"loss": 0.1373,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 189665264.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 0.510162353515625,
|
|
"epoch": 2.8260869565217392,
|
|
"grad_norm": 2.523598667392452,
|
|
"learning_rate": 5.9239470644674425e-06,
|
|
"loss": 0.1467,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 190390024.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.5079803466796875,
|
|
"epoch": 2.8369565217391304,
|
|
"grad_norm": 1.6030024836662948,
|
|
"learning_rate": 5.894469866477905e-06,
|
|
"loss": 0.148,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 191114098.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 0.4966888427734375,
|
|
"epoch": 2.8478260869565215,
|
|
"grad_norm": 1.259610196820505,
|
|
"learning_rate": 5.864960516962791e-06,
|
|
"loss": 0.1277,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 191853725.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 0.487640380859375,
|
|
"epoch": 2.858695652173913,
|
|
"grad_norm": 1.8997961528888037,
|
|
"learning_rate": 5.835420076629273e-06,
|
|
"loss": 0.1352,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 192592834.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 0.485992431640625,
|
|
"epoch": 2.869565217391304,
|
|
"grad_norm": 1.1311799882762155,
|
|
"learning_rate": 5.805849607302081e-06,
|
|
"loss": 0.1186,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 193318463.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 0.493988037109375,
|
|
"epoch": 2.880434782608696,
|
|
"grad_norm": 2.9598280446115353,
|
|
"learning_rate": 5.776250171885329e-06,
|
|
"loss": 0.1499,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 194060408.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 0.49200439453125,
|
|
"epoch": 2.891304347826087,
|
|
"grad_norm": 2.6509383827600352,
|
|
"learning_rate": 5.74662283432431e-06,
|
|
"loss": 0.1357,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 194789863.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 0.5071868896484375,
|
|
"epoch": 2.9021739130434785,
|
|
"grad_norm": 2.6570677752593497,
|
|
"learning_rate": 5.716968659567256e-06,
|
|
"loss": 0.1317,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 195513083.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 0.4799957275390625,
|
|
"epoch": 2.9130434782608696,
|
|
"grad_norm": 1.435852867141667,
|
|
"learning_rate": 5.687288713527051e-06,
|
|
"loss": 0.1348,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 196289975.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 0.50189208984375,
|
|
"epoch": 2.9239130434782608,
|
|
"grad_norm": 4.493817088046351,
|
|
"learning_rate": 5.6575840630429295e-06,
|
|
"loss": 0.1374,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 197020989.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 0.5019073486328125,
|
|
"epoch": 2.9347826086956523,
|
|
"grad_norm": 3.9144060621986734,
|
|
"learning_rate": 5.627855775842116e-06,
|
|
"loss": 0.1688,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 197757220.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.4936065673828125,
|
|
"epoch": 2.9456521739130435,
|
|
"grad_norm": 1.4156310056826011,
|
|
"learning_rate": 5.598104920501455e-06,
|
|
"loss": 0.127,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 198499571.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 0.51116943359375,
|
|
"epoch": 2.9565217391304346,
|
|
"grad_norm": 4.02324822227324,
|
|
"learning_rate": 5.568332566408995e-06,
|
|
"loss": 0.1521,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 199231680.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 0.4932098388671875,
|
|
"epoch": 2.967391304347826,
|
|
"grad_norm": 4.07784250425575,
|
|
"learning_rate": 5.538539783725556e-06,
|
|
"loss": 0.1294,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 199988378.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 0.485107421875,
|
|
"epoch": 2.9782608695652173,
|
|
"grad_norm": 3.163834975563216,
|
|
"learning_rate": 5.508727643346257e-06,
|
|
"loss": 0.1521,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 200747091.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 0.5088043212890625,
|
|
"epoch": 2.9891304347826084,
|
|
"grad_norm": 2.2016312803750906,
|
|
"learning_rate": 5.478897216862026e-06,
|
|
"loss": 0.1755,
|
|
"mean_token_accuracy": 0.9244791711680591,
|
|
"num_tokens": 201471197.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 0.5051727294921875,
|
|
"epoch": 3.0,
|
|
"grad_norm": 4.694614298581807,
|
|
"learning_rate": 5.4490495765210795e-06,
|
|
"loss": 0.1552,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 202206514.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 0.50433349609375,
|
|
"epoch": 3.010869565217391,
|
|
"grad_norm": 3.06769665618971,
|
|
"learning_rate": 5.4191857951903825e-06,
|
|
"loss": 0.1167,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 202942563.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 0.4972076416015625,
|
|
"epoch": 3.0217391304347827,
|
|
"grad_norm": 1.6397565472662876,
|
|
"learning_rate": 5.389306946317089e-06,
|
|
"loss": 0.1181,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 203670291.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 0.509796142578125,
|
|
"epoch": 3.032608695652174,
|
|
"grad_norm": 1.907212437154809,
|
|
"learning_rate": 5.359414103889947e-06,
|
|
"loss": 0.1175,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 204425814.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 0.5008544921875,
|
|
"epoch": 3.0434782608695654,
|
|
"grad_norm": 2.8679083943500014,
|
|
"learning_rate": 5.329508342400702e-06,
|
|
"loss": 0.1219,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 205186257.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.52252197265625,
|
|
"epoch": 3.0543478260869565,
|
|
"grad_norm": 2.0857163642942864,
|
|
"learning_rate": 5.29959073680547e-06,
|
|
"loss": 0.0953,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 205912664.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 0.513763427734375,
|
|
"epoch": 3.0652173913043477,
|
|
"grad_norm": 1.6757912933946717,
|
|
"learning_rate": 5.2696623624861065e-06,
|
|
"loss": 0.1115,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 206651702.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 0.5019683837890625,
|
|
"epoch": 3.0760869565217392,
|
|
"grad_norm": 1.8995652736397595,
|
|
"learning_rate": 5.239724295211541e-06,
|
|
"loss": 0.0742,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 207394471.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 0.51483154296875,
|
|
"epoch": 3.0869565217391304,
|
|
"grad_norm": 1.34259531697699,
|
|
"learning_rate": 5.209777611099117e-06,
|
|
"loss": 0.1067,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 208127516.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 0.515350341796875,
|
|
"epoch": 3.097826086956522,
|
|
"grad_norm": 3.1316206853954216,
|
|
"learning_rate": 5.179823386575908e-06,
|
|
"loss": 0.1374,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 208858656.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 0.51507568359375,
|
|
"epoch": 3.108695652173913,
|
|
"grad_norm": 1.9562947247931248,
|
|
"learning_rate": 5.1498626983400215e-06,
|
|
"loss": 0.1345,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 209600412.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 0.517486572265625,
|
|
"epoch": 3.119565217391304,
|
|
"grad_norm": 2.8209203009540316,
|
|
"learning_rate": 5.11989662332191e-06,
|
|
"loss": 0.1009,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 210309190.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 0.5090179443359375,
|
|
"epoch": 3.130434782608696,
|
|
"grad_norm": 4.20234994537978,
|
|
"learning_rate": 5.089926238645645e-06,
|
|
"loss": 0.1374,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 211015804.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 0.5020599365234375,
|
|
"epoch": 3.141304347826087,
|
|
"grad_norm": 2.6598109660263467,
|
|
"learning_rate": 5.059952621590216e-06,
|
|
"loss": 0.1255,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 211763802.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 0.495635986328125,
|
|
"epoch": 3.1521739130434785,
|
|
"grad_norm": 3.168928505472299,
|
|
"learning_rate": 5.029976849550789e-06,
|
|
"loss": 0.091,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 212526112.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.5042724609375,
|
|
"epoch": 3.1630434782608696,
|
|
"grad_norm": 2.9638392449681628,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1358,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 213230739.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 0.491973876953125,
|
|
"epoch": 3.1739130434782608,
|
|
"grad_norm": 2.287280064117479,
|
|
"learning_rate": 4.970023150449212e-06,
|
|
"loss": 0.0909,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 213988716.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 0.51019287109375,
|
|
"epoch": 3.1847826086956523,
|
|
"grad_norm": 1.7009219485144782,
|
|
"learning_rate": 4.940047378409786e-06,
|
|
"loss": 0.1136,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 214708259.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 0.5054168701171875,
|
|
"epoch": 3.1956521739130435,
|
|
"grad_norm": 1.5102170961098071,
|
|
"learning_rate": 4.910073761354354e-06,
|
|
"loss": 0.0846,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 215444925.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 0.4853057861328125,
|
|
"epoch": 3.2065217391304346,
|
|
"grad_norm": 1.9840102733725111,
|
|
"learning_rate": 4.880103376678092e-06,
|
|
"loss": 0.1281,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 216183251.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 0.4936370849609375,
|
|
"epoch": 3.217391304347826,
|
|
"grad_norm": 4.2232183265569665,
|
|
"learning_rate": 4.85013730165998e-06,
|
|
"loss": 0.1202,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 216919643.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 0.5063629150390625,
|
|
"epoch": 3.2282608695652173,
|
|
"grad_norm": 2.4607267304782607,
|
|
"learning_rate": 4.820176613424095e-06,
|
|
"loss": 0.1283,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 217620794.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 0.4898681640625,
|
|
"epoch": 3.239130434782609,
|
|
"grad_norm": 3.443753153014187,
|
|
"learning_rate": 4.790222388900884e-06,
|
|
"loss": 0.1133,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 218352923.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 0.4887847900390625,
|
|
"epoch": 3.25,
|
|
"grad_norm": 2.8767809849140384,
|
|
"learning_rate": 4.76027570478846e-06,
|
|
"loss": 0.1247,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 219082938.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 0.4860992431640625,
|
|
"epoch": 3.260869565217391,
|
|
"grad_norm": 2.006610702080693,
|
|
"learning_rate": 4.730337637513895e-06,
|
|
"loss": 0.1283,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 219814270.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.482025146484375,
|
|
"epoch": 3.2717391304347827,
|
|
"grad_norm": 2.633067097224965,
|
|
"learning_rate": 4.7004092631945315e-06,
|
|
"loss": 0.1039,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 220546102.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 0.5012969970703125,
|
|
"epoch": 3.282608695652174,
|
|
"grad_norm": 2.983542365323401,
|
|
"learning_rate": 4.6704916575993005e-06,
|
|
"loss": 0.1081,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 221258742.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 0.4999542236328125,
|
|
"epoch": 3.2934782608695654,
|
|
"grad_norm": 2.0398338831031237,
|
|
"learning_rate": 4.640585896110054e-06,
|
|
"loss": 0.1357,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 221967994.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 0.4818572998046875,
|
|
"epoch": 3.3043478260869565,
|
|
"grad_norm": 2.4365160834765134,
|
|
"learning_rate": 4.610693053682912e-06,
|
|
"loss": 0.1172,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 222724712.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 0.4759521484375,
|
|
"epoch": 3.3152173913043477,
|
|
"grad_norm": 2.1986887965299036,
|
|
"learning_rate": 4.580814204809618e-06,
|
|
"loss": 0.0791,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 223439890.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 0.4654693603515625,
|
|
"epoch": 3.3260869565217392,
|
|
"grad_norm": 1.8859034330884508,
|
|
"learning_rate": 4.550950423478923e-06,
|
|
"loss": 0.108,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 224172122.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 0.4481048583984375,
|
|
"epoch": 3.3369565217391304,
|
|
"grad_norm": 4.751940431862742,
|
|
"learning_rate": 4.521102783137976e-06,
|
|
"loss": 0.1312,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 224894838.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 0.4436187744140625,
|
|
"epoch": 3.3478260869565215,
|
|
"grad_norm": 4.425254145904883,
|
|
"learning_rate": 4.491272356653744e-06,
|
|
"loss": 0.1064,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 225615725.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 0.4452667236328125,
|
|
"epoch": 3.358695652173913,
|
|
"grad_norm": 3.3389210580266178,
|
|
"learning_rate": 4.4614602162744455e-06,
|
|
"loss": 0.1129,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 226340619.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 0.454864501953125,
|
|
"epoch": 3.369565217391304,
|
|
"grad_norm": 4.471417255794463,
|
|
"learning_rate": 4.431667433591006e-06,
|
|
"loss": 0.1204,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 227064708.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.4317169189453125,
|
|
"epoch": 3.380434782608696,
|
|
"grad_norm": 5.650025362027991,
|
|
"learning_rate": 4.401895079498547e-06,
|
|
"loss": 0.1075,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 227815323.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 0.4376068115234375,
|
|
"epoch": 3.391304347826087,
|
|
"grad_norm": 3.994918766466525,
|
|
"learning_rate": 4.372144224157886e-06,
|
|
"loss": 0.1231,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 228557986.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 0.4536895751953125,
|
|
"epoch": 3.4021739130434785,
|
|
"grad_norm": 1.9762459953580778,
|
|
"learning_rate": 4.342415936957073e-06,
|
|
"loss": 0.0872,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 229288342.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 0.432769775390625,
|
|
"epoch": 3.4130434782608696,
|
|
"grad_norm": 2.460713403150911,
|
|
"learning_rate": 4.312711286472951e-06,
|
|
"loss": 0.0964,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 230004736.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 0.433319091796875,
|
|
"epoch": 3.4239130434782608,
|
|
"grad_norm": 3.045778631143028,
|
|
"learning_rate": 4.2830313404327475e-06,
|
|
"loss": 0.0895,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 230744082.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 0.439117431640625,
|
|
"epoch": 3.4347826086956523,
|
|
"grad_norm": 2.2475691741164696,
|
|
"learning_rate": 4.253377165675691e-06,
|
|
"loss": 0.0813,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 231488919.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 0.434326171875,
|
|
"epoch": 3.4456521739130435,
|
|
"grad_norm": 3.367606483147211,
|
|
"learning_rate": 4.223749828114672e-06,
|
|
"loss": 0.0875,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 232248463.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 0.43182373046875,
|
|
"epoch": 3.4565217391304346,
|
|
"grad_norm": 3.3155618921146934,
|
|
"learning_rate": 4.19415039269792e-06,
|
|
"loss": 0.08,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 232990840.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 0.431884765625,
|
|
"epoch": 3.467391304347826,
|
|
"grad_norm": 4.1854446314250575,
|
|
"learning_rate": 4.1645799233707286e-06,
|
|
"loss": 0.1066,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 233701397.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 0.4250335693359375,
|
|
"epoch": 3.4782608695652173,
|
|
"grad_norm": 2.8961742939589756,
|
|
"learning_rate": 4.1350394830372106e-06,
|
|
"loss": 0.0808,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 234455710.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.4479217529296875,
|
|
"epoch": 3.489130434782609,
|
|
"grad_norm": 2.6974319050787794,
|
|
"learning_rate": 4.105530133522096e-06,
|
|
"loss": 0.0951,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 235150020.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 0.4250335693359375,
|
|
"epoch": 3.5,
|
|
"grad_norm": 2.9049558679735297,
|
|
"learning_rate": 4.076052935532559e-06,
|
|
"loss": 0.1139,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 235900334.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 0.4369659423828125,
|
|
"epoch": 3.5108695652173916,
|
|
"grad_norm": 2.7880576269094193,
|
|
"learning_rate": 4.046608948620098e-06,
|
|
"loss": 0.0687,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 236630311.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 0.451568603515625,
|
|
"epoch": 3.5217391304347827,
|
|
"grad_norm": 3.0094934257098225,
|
|
"learning_rate": 4.017199231142441e-06,
|
|
"loss": 0.1004,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 237340824.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 0.4543304443359375,
|
|
"epoch": 3.532608695652174,
|
|
"grad_norm": 2.4388474991031996,
|
|
"learning_rate": 3.987824840225512e-06,
|
|
"loss": 0.0858,
|
|
"mean_token_accuracy": 0.9713541683740914,
|
|
"num_tokens": 238057730.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 0.4311676025390625,
|
|
"epoch": 3.5434782608695654,
|
|
"grad_norm": 2.4558151962901924,
|
|
"learning_rate": 3.9584868317254325e-06,
|
|
"loss": 0.0848,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 238829262.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 0.4520111083984375,
|
|
"epoch": 3.5543478260869565,
|
|
"grad_norm": 2.6639668813954804,
|
|
"learning_rate": 3.92918626019056e-06,
|
|
"loss": 0.0877,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 239548363.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 0.436553955078125,
|
|
"epoch": 3.5652173913043477,
|
|
"grad_norm": 3.9839519201774207,
|
|
"learning_rate": 3.8999241788235896e-06,
|
|
"loss": 0.1037,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 240284884.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 0.4551239013671875,
|
|
"epoch": 3.5760869565217392,
|
|
"grad_norm": 3.0033678445642864,
|
|
"learning_rate": 3.8707016394436985e-06,
|
|
"loss": 0.1392,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 241006167.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 0.4543304443359375,
|
|
"epoch": 3.5869565217391304,
|
|
"grad_norm": 2.5275259027806,
|
|
"learning_rate": 3.841519692448732e-06,
|
|
"loss": 0.1127,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 241714934.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.456878662109375,
|
|
"epoch": 3.5978260869565215,
|
|
"grad_norm": 4.185455396688406,
|
|
"learning_rate": 3.8123793867774573e-06,
|
|
"loss": 0.122,
|
|
"mean_token_accuracy": 0.9609375023283064,
|
|
"num_tokens": 242442425.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 0.451507568359375,
|
|
"epoch": 3.608695652173913,
|
|
"grad_norm": 1.9364274254882414,
|
|
"learning_rate": 3.7832817698718456e-06,
|
|
"loss": 0.0957,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 243181556.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 0.4564666748046875,
|
|
"epoch": 3.619565217391304,
|
|
"grad_norm": 3.0536084728321464,
|
|
"learning_rate": 3.754227887639434e-06,
|
|
"loss": 0.1134,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 243902910.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 0.4476776123046875,
|
|
"epoch": 3.630434782608696,
|
|
"grad_norm": 2.4345681594231317,
|
|
"learning_rate": 3.725218784415723e-06,
|
|
"loss": 0.1075,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 244647088.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 0.4428558349609375,
|
|
"epoch": 3.641304347826087,
|
|
"grad_norm": 2.30753438899568,
|
|
"learning_rate": 3.6962555029266488e-06,
|
|
"loss": 0.0732,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 245408005.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 0.450592041015625,
|
|
"epoch": 3.6521739130434785,
|
|
"grad_norm": 2.2552988728251138,
|
|
"learning_rate": 3.667339084251087e-06,
|
|
"loss": 0.0866,
|
|
"mean_token_accuracy": 0.9609375023283064,
|
|
"num_tokens": 246148504.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 0.4475250244140625,
|
|
"epoch": 3.6630434782608696,
|
|
"grad_norm": 1.8821968454721285,
|
|
"learning_rate": 3.638470567783442e-06,
|
|
"loss": 0.0713,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 246890120.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 0.4526214599609375,
|
|
"epoch": 3.6739130434782608,
|
|
"grad_norm": 2.6874202524065938,
|
|
"learning_rate": 3.609650991196285e-06,
|
|
"loss": 0.1105,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 247638948.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 0.43621826171875,
|
|
"epoch": 3.6847826086956523,
|
|
"grad_norm": 4.23700000906774,
|
|
"learning_rate": 3.5808813904030517e-06,
|
|
"loss": 0.1126,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 248379322.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 0.46246337890625,
|
|
"epoch": 3.6956521739130435,
|
|
"grad_norm": 4.506170167517848,
|
|
"learning_rate": 3.5521627995208146e-06,
|
|
"loss": 0.0856,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 249093858.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.4495086669921875,
|
|
"epoch": 3.7065217391304346,
|
|
"grad_norm": 3.743666500227273,
|
|
"learning_rate": 3.523496250833098e-06,
|
|
"loss": 0.0841,
|
|
"mean_token_accuracy": 0.9713541683740914,
|
|
"num_tokens": 249805318.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 0.440765380859375,
|
|
"epoch": 3.717391304347826,
|
|
"grad_norm": 2.9823469258416524,
|
|
"learning_rate": 3.4948827747527846e-06,
|
|
"loss": 0.0663,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 250535140.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 0.42999267578125,
|
|
"epoch": 3.7282608695652173,
|
|
"grad_norm": 3.9873086228575962,
|
|
"learning_rate": 3.466323399785072e-06,
|
|
"loss": 0.0842,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 251257332.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 0.423736572265625,
|
|
"epoch": 3.7391304347826084,
|
|
"grad_norm": 2.701122835614665,
|
|
"learning_rate": 3.4378191524905104e-06,
|
|
"loss": 0.0762,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 252028117.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 0.41546630859375,
|
|
"epoch": 3.75,
|
|
"grad_norm": 2.2704454244519887,
|
|
"learning_rate": 3.4093710574480926e-06,
|
|
"loss": 0.0723,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 252790365.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 0.4179229736328125,
|
|
"epoch": 3.7608695652173916,
|
|
"grad_norm": 4.147397062235881,
|
|
"learning_rate": 3.3809801372184305e-06,
|
|
"loss": 0.0921,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 253547279.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 0.413238525390625,
|
|
"epoch": 3.7717391304347827,
|
|
"grad_norm": 3.9374726861982765,
|
|
"learning_rate": 3.352647412307002e-06,
|
|
"loss": 0.0949,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 254269311.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 0.4278717041015625,
|
|
"epoch": 3.782608695652174,
|
|
"grad_norm": 4.518046928950192,
|
|
"learning_rate": 3.3243739011274645e-06,
|
|
"loss": 0.0655,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 254970333.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 0.4228973388671875,
|
|
"epoch": 3.7934782608695654,
|
|
"grad_norm": 2.9316072032323266,
|
|
"learning_rate": 3.296160619965056e-06,
|
|
"loss": 0.1006,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 255683339.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 0.4070281982421875,
|
|
"epoch": 3.8043478260869565,
|
|
"grad_norm": 3.833452072486913,
|
|
"learning_rate": 3.2680085829400553e-06,
|
|
"loss": 0.1031,
|
|
"mean_token_accuracy": 0.9609375023283064,
|
|
"num_tokens": 256436983.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.411865234375,
|
|
"epoch": 3.8152173913043477,
|
|
"grad_norm": 3.688263127157705,
|
|
"learning_rate": 3.2399188019713325e-06,
|
|
"loss": 0.0734,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 257174761.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 0.4146881103515625,
|
|
"epoch": 3.8260869565217392,
|
|
"grad_norm": 3.2549612757619224,
|
|
"learning_rate": 3.2118922867399776e-06,
|
|
"loss": 0.0779,
|
|
"mean_token_accuracy": 0.9713541683740914,
|
|
"num_tokens": 257935780.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 0.4317626953125,
|
|
"epoch": 3.8369565217391304,
|
|
"grad_norm": 2.7264910531997737,
|
|
"learning_rate": 3.183930044653014e-06,
|
|
"loss": 0.0617,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 258646120.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 0.4184112548828125,
|
|
"epoch": 3.8478260869565215,
|
|
"grad_norm": 3.8142241673763424,
|
|
"learning_rate": 3.156033080807175e-06,
|
|
"loss": 0.0686,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 259380556.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 0.4139862060546875,
|
|
"epoch": 3.858695652173913,
|
|
"grad_norm": 4.372949658189445,
|
|
"learning_rate": 3.128202397952781e-06,
|
|
"loss": 0.0898,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 260128604.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 0.4131927490234375,
|
|
"epoch": 3.869565217391304,
|
|
"grad_norm": 4.924894871810679,
|
|
"learning_rate": 3.1004389964576976e-06,
|
|
"loss": 0.0641,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 260863582.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 0.4102783203125,
|
|
"epoch": 3.880434782608696,
|
|
"grad_norm": 3.3646960100734975,
|
|
"learning_rate": 3.0727438742713766e-06,
|
|
"loss": 0.074,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 261578362.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 0.4154815673828125,
|
|
"epoch": 3.891304347826087,
|
|
"grad_norm": 4.250173020317946,
|
|
"learning_rate": 3.045118026888988e-06,
|
|
"loss": 0.107,
|
|
"mean_token_accuracy": 0.9609375023283064,
|
|
"num_tokens": 262345214.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 0.4351654052734375,
|
|
"epoch": 3.9021739130434785,
|
|
"grad_norm": 4.809460955061771,
|
|
"learning_rate": 3.0175624473156315e-06,
|
|
"loss": 0.0811,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 263064032.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 0.425384521484375,
|
|
"epoch": 3.9130434782608696,
|
|
"grad_norm": 3.948282663788381,
|
|
"learning_rate": 2.9900781260306427e-06,
|
|
"loss": 0.0574,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 263781270.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.4114227294921875,
|
|
"epoch": 3.9239130434782608,
|
|
"grad_norm": 2.5593132878486293,
|
|
"learning_rate": 2.962666050951997e-06,
|
|
"loss": 0.0566,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 264508431.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 0.420318603515625,
|
|
"epoch": 3.9347826086956523,
|
|
"grad_norm": 4.587850195646838,
|
|
"learning_rate": 2.9353272074007933e-06,
|
|
"loss": 0.074,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 265239134.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 0.4097137451171875,
|
|
"epoch": 3.9456521739130435,
|
|
"grad_norm": 3.3057653009544894,
|
|
"learning_rate": 2.9080625780658455e-06,
|
|
"loss": 0.0722,
|
|
"mean_token_accuracy": 0.9713541683740914,
|
|
"num_tokens": 265973420.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 0.4185333251953125,
|
|
"epoch": 3.9565217391304346,
|
|
"grad_norm": 3.5263173852376037,
|
|
"learning_rate": 2.8808731429683433e-06,
|
|
"loss": 0.0705,
|
|
"mean_token_accuracy": 0.9713541683740914,
|
|
"num_tokens": 266700866.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 0.4109649658203125,
|
|
"epoch": 3.967391304347826,
|
|
"grad_norm": 2.650506973888306,
|
|
"learning_rate": 2.853759879426644e-06,
|
|
"loss": 0.0366,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 267442558.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 0.42547607421875,
|
|
"epoch": 3.9782608695652173,
|
|
"grad_norm": 1.754893030032466,
|
|
"learning_rate": 2.8267237620211296e-06,
|
|
"loss": 0.0335,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 268190029.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 0.4144439697265625,
|
|
"epoch": 3.9891304347826084,
|
|
"grad_norm": 3.5718543066413067,
|
|
"learning_rate": 2.7997657625591866e-06,
|
|
"loss": 0.0732,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 268914832.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 0.415283203125,
|
|
"epoch": 4.0,
|
|
"grad_norm": 3.1929269768363184,
|
|
"learning_rate": 2.772886850040264e-06,
|
|
"loss": 0.0734,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 269653395.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 0.4150848388671875,
|
|
"epoch": 4.010869565217392,
|
|
"grad_norm": 1.5132263544632103,
|
|
"learning_rate": 2.7460879906210485e-06,
|
|
"loss": 0.0415,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 270395812.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 0.400604248046875,
|
|
"epoch": 4.021739130434782,
|
|
"grad_norm": 2.0865150177061604,
|
|
"learning_rate": 2.7193701475807376e-06,
|
|
"loss": 0.0429,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 271129526.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.409576416015625,
|
|
"epoch": 4.032608695652174,
|
|
"grad_norm": 5.199496474721622,
|
|
"learning_rate": 2.6927342812864117e-06,
|
|
"loss": 0.0627,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 271841243.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 0.416473388671875,
|
|
"epoch": 4.043478260869565,
|
|
"grad_norm": 4.051577221690835,
|
|
"learning_rate": 2.6661813491585133e-06,
|
|
"loss": 0.0418,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 272566659.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 0.4016876220703125,
|
|
"epoch": 4.054347826086956,
|
|
"grad_norm": 2.81380429615059,
|
|
"learning_rate": 2.6397123056364364e-06,
|
|
"loss": 0.0284,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 273300743.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 0.4071807861328125,
|
|
"epoch": 4.065217391304348,
|
|
"grad_norm": 2.8965302353341973,
|
|
"learning_rate": 2.613328102144216e-06,
|
|
"loss": 0.0462,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 274044022.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 0.40545654296875,
|
|
"epoch": 4.076086956521739,
|
|
"grad_norm": 3.603123410756133,
|
|
"learning_rate": 2.5870296870563287e-06,
|
|
"loss": 0.0414,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 274784986.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 0.4001007080078125,
|
|
"epoch": 4.086956521739131,
|
|
"grad_norm": 3.544406566843901,
|
|
"learning_rate": 2.5608180056636123e-06,
|
|
"loss": 0.0466,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 275515764.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 0.401123046875,
|
|
"epoch": 4.0978260869565215,
|
|
"grad_norm": 2.259644357036766,
|
|
"learning_rate": 2.534694000139273e-06,
|
|
"loss": 0.0283,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 276246351.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 0.4036865234375,
|
|
"epoch": 4.108695652173913,
|
|
"grad_norm": 4.043357354945065,
|
|
"learning_rate": 2.5086586095050314e-06,
|
|
"loss": 0.0507,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 276988802.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 0.4044189453125,
|
|
"epoch": 4.119565217391305,
|
|
"grad_norm": 3.819632385443738,
|
|
"learning_rate": 2.482712769597363e-06,
|
|
"loss": 0.0353,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 277721084.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 0.391021728515625,
|
|
"epoch": 4.130434782608695,
|
|
"grad_norm": 3.310735020294903,
|
|
"learning_rate": 2.4568574130338624e-06,
|
|
"loss": 0.0297,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 278469985.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.3829345703125,
|
|
"epoch": 4.141304347826087,
|
|
"grad_norm": 4.44347136546685,
|
|
"learning_rate": 2.4310934691797207e-06,
|
|
"loss": 0.0654,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 279239322.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 0.4026641845703125,
|
|
"epoch": 4.1521739130434785,
|
|
"grad_norm": 4.3151807279472925,
|
|
"learning_rate": 2.405421864114318e-06,
|
|
"loss": 0.039,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 279961803.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 0.39117431640625,
|
|
"epoch": 4.163043478260869,
|
|
"grad_norm": 9.99095027218017,
|
|
"learning_rate": 2.379843520597937e-06,
|
|
"loss": 0.076,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 280711242.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 0.4055023193359375,
|
|
"epoch": 4.173913043478261,
|
|
"grad_norm": 9.104578252695514,
|
|
"learning_rate": 2.3543593580385925e-06,
|
|
"loss": 0.0682,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 281441066.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 0.38916015625,
|
|
"epoch": 4.184782608695652,
|
|
"grad_norm": 5.080775795998749,
|
|
"learning_rate": 2.3289702924589914e-06,
|
|
"loss": 0.0559,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 282192750.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 0.402587890625,
|
|
"epoch": 4.195652173913044,
|
|
"grad_norm": 5.025507520079099,
|
|
"learning_rate": 2.303677236463593e-06,
|
|
"loss": 0.0496,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 282922610.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 0.397308349609375,
|
|
"epoch": 4.206521739130435,
|
|
"grad_norm": 3.3035189510563234,
|
|
"learning_rate": 2.2784810992058155e-06,
|
|
"loss": 0.0489,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 283665363.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 0.3996124267578125,
|
|
"epoch": 4.217391304347826,
|
|
"grad_norm": 4.65180929633092,
|
|
"learning_rate": 2.2533827863553552e-06,
|
|
"loss": 0.0833,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 284417865.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 0.3952789306640625,
|
|
"epoch": 4.228260869565218,
|
|
"grad_norm": 2.9641983428680714,
|
|
"learning_rate": 2.2283832000656304e-06,
|
|
"loss": 0.0495,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 285186616.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 0.4062652587890625,
|
|
"epoch": 4.239130434782608,
|
|
"grad_norm": 3.2948923015425096,
|
|
"learning_rate": 2.2034832389413536e-06,
|
|
"loss": 0.0315,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 285917914.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.410980224609375,
|
|
"epoch": 4.25,
|
|
"grad_norm": 3.3042561012123453,
|
|
"learning_rate": 2.178683798006234e-06,
|
|
"loss": 0.0264,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 286666533.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 0.408477783203125,
|
|
"epoch": 4.260869565217392,
|
|
"grad_norm": 3.297590206291124,
|
|
"learning_rate": 2.153985768670803e-06,
|
|
"loss": 0.0395,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 287400442.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 0.42022705078125,
|
|
"epoch": 4.271739130434782,
|
|
"grad_norm": 3.7110171590894696,
|
|
"learning_rate": 2.1293900387003742e-06,
|
|
"loss": 0.0488,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 288141382.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 0.4134063720703125,
|
|
"epoch": 4.282608695652174,
|
|
"grad_norm": 3.677257799412749,
|
|
"learning_rate": 2.104897492183135e-06,
|
|
"loss": 0.0437,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 288852719.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 0.414154052734375,
|
|
"epoch": 4.293478260869565,
|
|
"grad_norm": 3.708786737602445,
|
|
"learning_rate": 2.080509009498364e-06,
|
|
"loss": 0.0401,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 289601862.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 0.4218597412109375,
|
|
"epoch": 4.304347826086957,
|
|
"grad_norm": 3.3693657926976512,
|
|
"learning_rate": 2.056225467284786e-06,
|
|
"loss": 0.051,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 290346395.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 0.409637451171875,
|
|
"epoch": 4.315217391304348,
|
|
"grad_norm": 1.530704321616427,
|
|
"learning_rate": 2.0320477384090665e-06,
|
|
"loss": 0.0285,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 291058610.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 0.4040985107421875,
|
|
"epoch": 4.326086956521739,
|
|
"grad_norm": 2.7941269253382055,
|
|
"learning_rate": 2.007976691934432e-06,
|
|
"loss": 0.0411,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 291832775.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 0.4112396240234375,
|
|
"epoch": 4.336956521739131,
|
|
"grad_norm": 1.9103029289853792,
|
|
"learning_rate": 1.9840131930894334e-06,
|
|
"loss": 0.023,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 292572255.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 0.4288177490234375,
|
|
"epoch": 4.3478260869565215,
|
|
"grad_norm": 2.525503550268621,
|
|
"learning_rate": 1.9601581032368457e-06,
|
|
"loss": 0.0347,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 293264390.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.408050537109375,
|
|
"epoch": 4.358695652173913,
|
|
"grad_norm": 2.1628290862416257,
|
|
"learning_rate": 1.936412279842705e-06,
|
|
"loss": 0.0349,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 293967991.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 0.3993682861328125,
|
|
"epoch": 4.369565217391305,
|
|
"grad_norm": 2.1193099985559107,
|
|
"learning_rate": 1.912776576445488e-06,
|
|
"loss": 0.0296,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 294712662.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 0.39886474609375,
|
|
"epoch": 4.380434782608695,
|
|
"grad_norm": 1.676721724134583,
|
|
"learning_rate": 1.8892518426254363e-06,
|
|
"loss": 0.0322,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 295461426.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 0.4094085693359375,
|
|
"epoch": 4.391304347826087,
|
|
"grad_norm": 3.191554758599242,
|
|
"learning_rate": 1.8658389239740094e-06,
|
|
"loss": 0.046,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 296184778.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 0.4062957763671875,
|
|
"epoch": 4.4021739130434785,
|
|
"grad_norm": 3.4471174235230926,
|
|
"learning_rate": 1.8425386620634961e-06,
|
|
"loss": 0.0385,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 296916529.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 0.3897247314453125,
|
|
"epoch": 4.413043478260869,
|
|
"grad_norm": 2.1921533231147774,
|
|
"learning_rate": 1.8193518944167625e-06,
|
|
"loss": 0.0301,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 297652822.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 0.4058990478515625,
|
|
"epoch": 4.423913043478261,
|
|
"grad_norm": 1.9177948058349774,
|
|
"learning_rate": 1.7962794544771477e-06,
|
|
"loss": 0.0295,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 298365151.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 0.3916015625,
|
|
"epoch": 4.434782608695652,
|
|
"grad_norm": 4.951209147319255,
|
|
"learning_rate": 1.773322171578512e-06,
|
|
"loss": 0.0483,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 299125388.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 0.3885498046875,
|
|
"epoch": 4.445652173913043,
|
|
"grad_norm": 2.199860580833,
|
|
"learning_rate": 1.7504808709154104e-06,
|
|
"loss": 0.0224,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 299850619.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 0.39691162109375,
|
|
"epoch": 4.456521739130435,
|
|
"grad_norm": 2.963294110112377,
|
|
"learning_rate": 1.727756373513449e-06,
|
|
"loss": 0.0324,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 300561565.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.3807220458984375,
|
|
"epoch": 4.467391304347826,
|
|
"grad_norm": 2.374326356214355,
|
|
"learning_rate": 1.7051494961997623e-06,
|
|
"loss": 0.0255,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 301300285.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 0.3920135498046875,
|
|
"epoch": 4.478260869565218,
|
|
"grad_norm": 1.8339803572373474,
|
|
"learning_rate": 1.6826610515736618e-06,
|
|
"loss": 0.0165,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 302039974.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 0.3795318603515625,
|
|
"epoch": 4.489130434782608,
|
|
"grad_norm": 4.028792577927132,
|
|
"learning_rate": 1.660291847977415e-06,
|
|
"loss": 0.0334,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 302775160.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.389373779296875,
|
|
"epoch": 4.5,
|
|
"grad_norm": 4.692642140279215,
|
|
"learning_rate": 1.6380426894672003e-06,
|
|
"loss": 0.0383,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 303513768.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 0.404937744140625,
|
|
"epoch": 4.510869565217392,
|
|
"grad_norm": 3.0128933831398697,
|
|
"learning_rate": 1.6159143757842005e-06,
|
|
"loss": 0.0205,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 304207786.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 0.399322509765625,
|
|
"epoch": 4.521739130434782,
|
|
"grad_norm": 4.6334582678307115,
|
|
"learning_rate": 1.5939077023258547e-06,
|
|
"loss": 0.0298,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 304901897.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 0.374053955078125,
|
|
"epoch": 4.532608695652174,
|
|
"grad_norm": 7.177966548437757,
|
|
"learning_rate": 1.5720234601172767e-06,
|
|
"loss": 0.0289,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 305636413.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 0.3793487548828125,
|
|
"epoch": 4.543478260869565,
|
|
"grad_norm": 4.234249422410969,
|
|
"learning_rate": 1.5502624357828118e-06,
|
|
"loss": 0.0341,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 306369969.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 0.37646484375,
|
|
"epoch": 4.554347826086957,
|
|
"grad_norm": 2.5841092696139687,
|
|
"learning_rate": 1.5286254115177623e-06,
|
|
"loss": 0.0124,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 307136605.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 0.3839111328125,
|
|
"epoch": 4.565217391304348,
|
|
"grad_norm": 3.884209886203836,
|
|
"learning_rate": 1.5071131650602782e-06,
|
|
"loss": 0.0205,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 307859637.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.3831329345703125,
|
|
"epoch": 4.576086956521739,
|
|
"grad_norm": 3.48550962291133,
|
|
"learning_rate": 1.485726469663401e-06,
|
|
"loss": 0.0134,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 308574520.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 0.379058837890625,
|
|
"epoch": 4.586956521739131,
|
|
"grad_norm": 4.525051705172931,
|
|
"learning_rate": 1.4644660940672628e-06,
|
|
"loss": 0.0522,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 309304130.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 0.391448974609375,
|
|
"epoch": 4.5978260869565215,
|
|
"grad_norm": 2.8569303119996508,
|
|
"learning_rate": 1.4433328024714583e-06,
|
|
"loss": 0.0133,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 310018700.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 0.368743896484375,
|
|
"epoch": 4.608695652173913,
|
|
"grad_norm": 4.064577958118665,
|
|
"learning_rate": 1.422327354507575e-06,
|
|
"loss": 0.0352,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 310754020.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 0.39080810546875,
|
|
"epoch": 4.619565217391305,
|
|
"grad_norm": 3.2209483395419123,
|
|
"learning_rate": 1.4014505052118893e-06,
|
|
"loss": 0.0235,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 311472836.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 0.3695526123046875,
|
|
"epoch": 4.630434782608695,
|
|
"grad_norm": 4.787519250335814,
|
|
"learning_rate": 1.3807030049982284e-06,
|
|
"loss": 0.0433,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 312215436.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 0.3708648681640625,
|
|
"epoch": 4.641304347826087,
|
|
"grad_norm": 4.855797162262989,
|
|
"learning_rate": 1.3600855996309937e-06,
|
|
"loss": 0.034,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 312934072.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 0.3697357177734375,
|
|
"epoch": 4.6521739130434785,
|
|
"grad_norm": 1.9548634609538587,
|
|
"learning_rate": 1.339599030198351e-06,
|
|
"loss": 0.0297,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 313678041.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 0.36126708984375,
|
|
"epoch": 4.663043478260869,
|
|
"grad_norm": 4.928776509619712,
|
|
"learning_rate": 1.3192440330856005e-06,
|
|
"loss": 0.0498,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 314446477.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 0.3810882568359375,
|
|
"epoch": 4.673913043478261,
|
|
"grad_norm": 4.6854147381266085,
|
|
"learning_rate": 1.2990213399487078e-06,
|
|
"loss": 0.0398,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 315183168.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.382904052734375,
|
|
"epoch": 4.684782608695652,
|
|
"grad_norm": 2.4651005556668077,
|
|
"learning_rate": 1.278931677687994e-06,
|
|
"loss": 0.0145,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 315925362.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 0.3859710693359375,
|
|
"epoch": 4.695652173913043,
|
|
"grad_norm": 6.194863365884358,
|
|
"learning_rate": 1.2589757684220182e-06,
|
|
"loss": 0.0605,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 316649033.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 0.3759765625,
|
|
"epoch": 4.706521739130435,
|
|
"grad_norm": 2.0634682817077654,
|
|
"learning_rate": 1.239154329461615e-06,
|
|
"loss": 0.0334,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 317388112.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 0.3852996826171875,
|
|
"epoch": 4.717391304347826,
|
|
"grad_norm": 3.7986961184925736,
|
|
"learning_rate": 1.2194680732841125e-06,
|
|
"loss": 0.028,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 318122299.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 0.3957672119140625,
|
|
"epoch": 4.728260869565218,
|
|
"grad_norm": 3.7136181113233957,
|
|
"learning_rate": 1.1999177075077278e-06,
|
|
"loss": 0.0226,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 318827965.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 0.386199951171875,
|
|
"epoch": 4.739130434782608,
|
|
"grad_norm": 3.456311451956069,
|
|
"learning_rate": 1.1805039348661213e-06,
|
|
"loss": 0.0174,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 319516633.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 0.3855743408203125,
|
|
"epoch": 4.75,
|
|
"grad_norm": 6.237180446498904,
|
|
"learning_rate": 1.1612274531831463e-06,
|
|
"loss": 0.0153,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 320247609.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 0.37713623046875,
|
|
"epoch": 4.760869565217392,
|
|
"grad_norm": 3.7989906045122517,
|
|
"learning_rate": 1.1420889553477577e-06,
|
|
"loss": 0.0227,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 320959744.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 0.388458251953125,
|
|
"epoch": 4.771739130434782,
|
|
"grad_norm": 1.7665295188429861,
|
|
"learning_rate": 1.1230891292891173e-06,
|
|
"loss": 0.0231,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 321696157.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 0.3807830810546875,
|
|
"epoch": 4.782608695652174,
|
|
"grad_norm": 2.12118855432572,
|
|
"learning_rate": 1.1042286579518556e-06,
|
|
"loss": 0.0158,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 322437815.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.3824615478515625,
|
|
"epoch": 4.793478260869565,
|
|
"grad_norm": 2.720827860677872,
|
|
"learning_rate": 1.0855082192715294e-06,
|
|
"loss": 0.019,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 323168930.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 0.3710784912109375,
|
|
"epoch": 4.804347826086957,
|
|
"grad_norm": 3.5294444778986565,
|
|
"learning_rate": 1.0669284861502517e-06,
|
|
"loss": 0.0266,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 323932957.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 0.38372802734375,
|
|
"epoch": 4.815217391304348,
|
|
"grad_norm": 3.9462285815837195,
|
|
"learning_rate": 1.0484901264325026e-06,
|
|
"loss": 0.017,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 324663394.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 0.3775482177734375,
|
|
"epoch": 4.826086956521739,
|
|
"grad_norm": 2.6526007119525543,
|
|
"learning_rate": 1.0301938028811303e-06,
|
|
"loss": 0.0366,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 325399440.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 0.37030029296875,
|
|
"epoch": 4.836956521739131,
|
|
"grad_norm": 1.9282014105914622,
|
|
"learning_rate": 1.0120401731535213e-06,
|
|
"loss": 0.0185,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 326153969.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 0.384490966796875,
|
|
"epoch": 4.8478260869565215,
|
|
"grad_norm": 2.339716590869493,
|
|
"learning_rate": 9.940298897779615e-07,
|
|
"loss": 0.034,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 326870475.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 0.37078857421875,
|
|
"epoch": 4.858695652173913,
|
|
"grad_norm": 2.2546192067061166,
|
|
"learning_rate": 9.761636001301872e-07,
|
|
"loss": 0.022,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 327610111.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 0.389862060546875,
|
|
"epoch": 4.869565217391305,
|
|
"grad_norm": 3.286126857507311,
|
|
"learning_rate": 9.58441946410108e-07,
|
|
"loss": 0.0126,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 328339243.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 0.367828369140625,
|
|
"epoch": 4.880434782608695,
|
|
"grad_norm": 1.293229716893453,
|
|
"learning_rate": 9.408655656187282e-07,
|
|
"loss": 0.016,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 329092009.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 0.3848114013671875,
|
|
"epoch": 4.891304347826087,
|
|
"grad_norm": 5.434786257257631,
|
|
"learning_rate": 9.234350895352479e-07,
|
|
"loss": 0.0482,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 329818250.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.3868408203125,
|
|
"epoch": 4.9021739130434785,
|
|
"grad_norm": 1.181485451837999,
|
|
"learning_rate": 9.061511446943533e-07,
|
|
"loss": 0.0064,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 330538022.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 0.383880615234375,
|
|
"epoch": 4.913043478260869,
|
|
"grad_norm": 3.0698903999136857,
|
|
"learning_rate": 8.890143523636968e-07,
|
|
"loss": 0.0214,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 331262621.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 0.36474609375,
|
|
"epoch": 4.923913043478261,
|
|
"grad_norm": 1.3995011760176632,
|
|
"learning_rate": 8.720253285215685e-07,
|
|
"loss": 0.018,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 332014085.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 0.377227783203125,
|
|
"epoch": 4.934782608695652,
|
|
"grad_norm": 2.100280032290679,
|
|
"learning_rate": 8.551846838347489e-07,
|
|
"loss": 0.0292,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 332765484.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 0.36614990234375,
|
|
"epoch": 4.945652173913043,
|
|
"grad_norm": 2.7861329612144092,
|
|
"learning_rate": 8.384930236365629e-07,
|
|
"loss": 0.0201,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 333519790.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 0.364776611328125,
|
|
"epoch": 4.956521739130435,
|
|
"grad_norm": 5.341895283326283,
|
|
"learning_rate": 8.219509479051202e-07,
|
|
"loss": 0.0376,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 334259523.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 0.3604736328125,
|
|
"epoch": 4.967391304347826,
|
|
"grad_norm": 3.5651128802864305,
|
|
"learning_rate": 8.055590512417499e-07,
|
|
"loss": 0.0192,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 335022125.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 0.37713623046875,
|
|
"epoch": 4.978260869565218,
|
|
"grad_norm": 3.112512399794995,
|
|
"learning_rate": 7.893179228496261e-07,
|
|
"loss": 0.015,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 335734028.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 0.3864593505859375,
|
|
"epoch": 4.989130434782608,
|
|
"grad_norm": 2.78572082976228,
|
|
"learning_rate": 7.732281465125907e-07,
|
|
"loss": 0.0274,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 336440046.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 0.3579559326171875,
|
|
"epoch": 5.0,
|
|
"grad_norm": 1.7339058571647206,
|
|
"learning_rate": 7.572903005741689e-07,
|
|
"loss": 0.0231,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 337176700.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.3771820068359375,
|
|
"epoch": 5.010869565217392,
|
|
"grad_norm": 1.4317687703631763,
|
|
"learning_rate": 7.415049579167783e-07,
|
|
"loss": 0.0073,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 337893321.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 0.372344970703125,
|
|
"epoch": 5.021739130434782,
|
|
"grad_norm": 5.601063371459146,
|
|
"learning_rate": 7.258726859411435e-07,
|
|
"loss": 0.0142,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 338629322.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 0.3688201904296875,
|
|
"epoch": 5.032608695652174,
|
|
"grad_norm": 0.7441813173341757,
|
|
"learning_rate": 7.103940465458936e-07,
|
|
"loss": 0.0059,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 339357552.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 0.37188720703125,
|
|
"epoch": 5.043478260869565,
|
|
"grad_norm": 4.471379494752968,
|
|
"learning_rate": 6.950695961073684e-07,
|
|
"loss": 0.022,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 340087463.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 0.376220703125,
|
|
"epoch": 5.054347826086956,
|
|
"grad_norm": 1.2045356708427954,
|
|
"learning_rate": 6.79899885459619e-07,
|
|
"loss": 0.0055,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 340804846.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 0.3759307861328125,
|
|
"epoch": 5.065217391304348,
|
|
"grad_norm": 0.4866161810197756,
|
|
"learning_rate": 6.64885459874608e-07,
|
|
"loss": 0.0033,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 341512883.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 0.35706329345703125,
|
|
"epoch": 5.076086956521739,
|
|
"grad_norm": 1.0245386454327172,
|
|
"learning_rate": 6.500268590426107e-07,
|
|
"loss": 0.0205,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 342256625.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 0.3574371337890625,
|
|
"epoch": 5.086956521739131,
|
|
"grad_norm": 3.4242498375550223,
|
|
"learning_rate": 6.353246170528149e-07,
|
|
"loss": 0.004,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 342976950.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 0.375244140625,
|
|
"epoch": 5.0978260869565215,
|
|
"grad_norm": 1.8715490799774765,
|
|
"learning_rate": 6.207792623741249e-07,
|
|
"loss": 0.0104,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 343667206.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 0.3539276123046875,
|
|
"epoch": 5.108695652173913,
|
|
"grad_norm": 1.184220255204378,
|
|
"learning_rate": 6.063913178361614e-07,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 344406620.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.3880157470703125,
|
|
"epoch": 5.119565217391305,
|
|
"grad_norm": 2.278466667985271,
|
|
"learning_rate": 5.921613006104765e-07,
|
|
"loss": 0.0062,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 345111449.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 0.3537139892578125,
|
|
"epoch": 5.130434782608695,
|
|
"grad_norm": 2.545678797703154,
|
|
"learning_rate": 5.780897221919551e-07,
|
|
"loss": 0.0133,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 345827537.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 0.3666229248046875,
|
|
"epoch": 5.141304347826087,
|
|
"grad_norm": 3.1812216874231556,
|
|
"learning_rate": 5.641770883804365e-07,
|
|
"loss": 0.0046,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 346554810.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 0.3426513671875,
|
|
"epoch": 5.1521739130434785,
|
|
"grad_norm": 0.31179635110112475,
|
|
"learning_rate": 5.504238992625277e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 347301119.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 0.3370513916015625,
|
|
"epoch": 5.163043478260869,
|
|
"grad_norm": 2.385624568875711,
|
|
"learning_rate": 5.368306491936326e-07,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 348053634.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 0.350250244140625,
|
|
"epoch": 5.173913043478261,
|
|
"grad_norm": 4.0588469803082035,
|
|
"learning_rate": 5.233978267801798e-07,
|
|
"loss": 0.0215,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 348783933.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 0.3390960693359375,
|
|
"epoch": 5.184782608695652,
|
|
"grad_norm": 8.745633741771352,
|
|
"learning_rate": 5.101259148620618e-07,
|
|
"loss": 0.0276,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 349515874.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 0.3403472900390625,
|
|
"epoch": 5.195652173913044,
|
|
"grad_norm": 0.5157221910546766,
|
|
"learning_rate": 4.970153904952768e-07,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 350254659.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 0.3349151611328125,
|
|
"epoch": 5.206521739130435,
|
|
"grad_norm": 1.6999109586517638,
|
|
"learning_rate": 4.840667249347824e-07,
|
|
"loss": 0.0131,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 350999891.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 0.3381500244140625,
|
|
"epoch": 5.217391304347826,
|
|
"grad_norm": 4.17910153509278,
|
|
"learning_rate": 4.7128038361755836e-07,
|
|
"loss": 0.0278,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 351735858.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.3470306396484375,
|
|
"epoch": 5.228260869565218,
|
|
"grad_norm": 4.661486038452666,
|
|
"learning_rate": 4.586568261458729e-07,
|
|
"loss": 0.0083,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 352445537.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 0.3560638427734375,
|
|
"epoch": 5.239130434782608,
|
|
"grad_norm": 3.698008462480336,
|
|
"learning_rate": 4.461965062707646e-07,
|
|
"loss": 0.0429,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 353165696.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.3307342529296875,
|
|
"epoch": 5.25,
|
|
"grad_norm": 2.3951775996184237,
|
|
"learning_rate": 4.338998718757315e-07,
|
|
"loss": 0.0037,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 353903304.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 0.35088348388671875,
|
|
"epoch": 5.260869565217392,
|
|
"grad_norm": 7.71581385366145,
|
|
"learning_rate": 4.2176736496063406e-07,
|
|
"loss": 0.0168,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 354605819.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 0.3419189453125,
|
|
"epoch": 5.271739130434782,
|
|
"grad_norm": 3.297181128794067,
|
|
"learning_rate": 4.0979942162580387e-07,
|
|
"loss": 0.0067,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 355351145.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 0.34766387939453125,
|
|
"epoch": 5.282608695652174,
|
|
"grad_norm": 1.3321666607896645,
|
|
"learning_rate": 3.979964720563728e-07,
|
|
"loss": 0.0027,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 356069553.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 0.32390594482421875,
|
|
"epoch": 5.293478260869565,
|
|
"grad_norm": 3.9326795558925114,
|
|
"learning_rate": 3.863589405068047e-07,
|
|
"loss": 0.0085,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 356833522.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 0.3388824462890625,
|
|
"epoch": 5.304347826086957,
|
|
"grad_norm": 3.858204769882527,
|
|
"learning_rate": 3.748872452856506e-07,
|
|
"loss": 0.0258,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 357542589.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 0.33782958984375,
|
|
"epoch": 5.315217391304348,
|
|
"grad_norm": 1.2867969492672284,
|
|
"learning_rate": 3.63581798740511e-07,
|
|
"loss": 0.0194,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 358275996.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 0.3350677490234375,
|
|
"epoch": 5.326086956521739,
|
|
"grad_norm": 3.2769728216030236,
|
|
"learning_rate": 3.524430072432117e-07,
|
|
"loss": 0.0151,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 359016682.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.3552398681640625,
|
|
"epoch": 5.336956521739131,
|
|
"grad_norm": 3.371299318524504,
|
|
"learning_rate": 3.414712711752011e-07,
|
|
"loss": 0.0188,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 359732345.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 0.34136962890625,
|
|
"epoch": 5.3478260869565215,
|
|
"grad_norm": 9.508372180363546,
|
|
"learning_rate": 3.306669849131544e-07,
|
|
"loss": 0.0275,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 360462966.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 0.3476104736328125,
|
|
"epoch": 5.358695652173913,
|
|
"grad_norm": 3.375679655545073,
|
|
"learning_rate": 3.20030536814801e-07,
|
|
"loss": 0.0054,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 361195863.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 0.3326568603515625,
|
|
"epoch": 5.369565217391305,
|
|
"grad_norm": 0.1903952257569681,
|
|
"learning_rate": 3.095623092049632e-07,
|
|
"loss": 0.0011,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 361932732.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 0.33864593505859375,
|
|
"epoch": 5.380434782608695,
|
|
"grad_norm": 4.176400622746979,
|
|
"learning_rate": 2.992626783618152e-07,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 362670502.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 0.3571929931640625,
|
|
"epoch": 5.391304347826087,
|
|
"grad_norm": 0.44105406137175723,
|
|
"learning_rate": 2.891320145033566e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 363388078.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 0.34087371826171875,
|
|
"epoch": 5.4021739130434785,
|
|
"grad_norm": 3.5861865506981867,
|
|
"learning_rate": 2.791706817741041e-07,
|
|
"loss": 0.0037,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 364120352.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 0.3389739990234375,
|
|
"epoch": 5.413043478260869,
|
|
"grad_norm": 0.24697831888375507,
|
|
"learning_rate": 2.693790382320055e-07,
|
|
"loss": 0.0012,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 364847468.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 0.3396148681640625,
|
|
"epoch": 5.423913043478261,
|
|
"grad_norm": 0.3577105924556279,
|
|
"learning_rate": 2.59757435835567e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 365579587.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 0.3470458984375,
|
|
"epoch": 5.434782608695652,
|
|
"grad_norm": 8.154047110779663,
|
|
"learning_rate": 2.5030622043120237e-07,
|
|
"loss": 0.0159,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 366303062.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.34600830078125,
|
|
"epoch": 5.445652173913043,
|
|
"grad_norm": 0.36629120267371895,
|
|
"learning_rate": 2.41025731740801e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 367027323.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 0.3304595947265625,
|
|
"epoch": 5.456521739130435,
|
|
"grad_norm": 1.7443492578612907,
|
|
"learning_rate": 2.319163033495192e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 367757359.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 0.338287353515625,
|
|
"epoch": 5.467391304347826,
|
|
"grad_norm": 2.451026888868035,
|
|
"learning_rate": 2.2297826269378653e-07,
|
|
"loss": 0.0113,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 368484157.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 0.3620452880859375,
|
|
"epoch": 5.478260869565218,
|
|
"grad_norm": 3.779401706962842,
|
|
"learning_rate": 2.142119310495383e-07,
|
|
"loss": 0.0106,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 369194116.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 0.33890533447265625,
|
|
"epoch": 5.489130434782608,
|
|
"grad_norm": 1.971026340670922,
|
|
"learning_rate": 2.0561762352066638e-07,
|
|
"loss": 0.0038,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 369908073.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 0.33306884765625,
|
|
"epoch": 5.5,
|
|
"grad_norm": 2.345747076854806,
|
|
"learning_rate": 1.9719564902769272e-07,
|
|
"loss": 0.0023,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 370629804.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 0.3396148681640625,
|
|
"epoch": 5.510869565217392,
|
|
"grad_norm": 1.1256644586704976,
|
|
"learning_rate": 1.889463102966671e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 371360478.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 0.31915283203125,
|
|
"epoch": 5.521739130434782,
|
|
"grad_norm": 4.143967697470826,
|
|
"learning_rate": 1.8086990384828195e-07,
|
|
"loss": 0.0176,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 372128452.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 0.328765869140625,
|
|
"epoch": 5.532608695652174,
|
|
"grad_norm": 3.4721160936249693,
|
|
"learning_rate": 1.729667199872187e-07,
|
|
"loss": 0.0175,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 372889206.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 0.339447021484375,
|
|
"epoch": 5.543478260869565,
|
|
"grad_norm": 4.229558804673854,
|
|
"learning_rate": 1.6523704279170773e-07,
|
|
"loss": 0.0259,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 373639508.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.34881591796875,
|
|
"epoch": 5.554347826086957,
|
|
"grad_norm": 6.263021156251456,
|
|
"learning_rate": 1.5768115010332207e-07,
|
|
"loss": 0.0168,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 374348495.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 0.325714111328125,
|
|
"epoch": 5.565217391304348,
|
|
"grad_norm": 4.611080868571382,
|
|
"learning_rate": 1.5029931351698723e-07,
|
|
"loss": 0.0256,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 375104378.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"entropy": 0.35272216796875,
|
|
"epoch": 5.576086956521739,
|
|
"grad_norm": 6.816170209930842,
|
|
"learning_rate": 1.4309179837122045e-07,
|
|
"loss": 0.0134,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 375818686.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"entropy": 0.33832550048828125,
|
|
"epoch": 5.586956521739131,
|
|
"grad_norm": 4.613487993643065,
|
|
"learning_rate": 1.3605886373859234e-07,
|
|
"loss": 0.0194,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 376543473.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"entropy": 0.3258819580078125,
|
|
"epoch": 5.5978260869565215,
|
|
"grad_norm": 1.0488387481178343,
|
|
"learning_rate": 1.2920076241641376e-07,
|
|
"loss": 0.0043,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 377301519.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 0.327392578125,
|
|
"epoch": 5.608695652173913,
|
|
"grad_norm": 3.174233485263601,
|
|
"learning_rate": 1.22517740917652e-07,
|
|
"loss": 0.0231,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 378028218.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"entropy": 0.33404541015625,
|
|
"epoch": 5.619565217391305,
|
|
"grad_norm": 1.0051742426375578,
|
|
"learning_rate": 1.1601003946206723e-07,
|
|
"loss": 0.01,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 378763946.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"entropy": 0.33514404296875,
|
|
"epoch": 5.630434782608695,
|
|
"grad_norm": 2.5411657071258116,
|
|
"learning_rate": 1.0967789196757839e-07,
|
|
"loss": 0.015,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 379473782.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"entropy": 0.33676910400390625,
|
|
"epoch": 5.641304347826087,
|
|
"grad_norm": 1.1015219149910704,
|
|
"learning_rate": 1.0352152604185429e-07,
|
|
"loss": 0.0028,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 380194885.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"entropy": 0.3310699462890625,
|
|
"epoch": 5.6521739130434785,
|
|
"grad_norm": 0.9608652025141723,
|
|
"learning_rate": 9.754116297413574e-08,
|
|
"loss": 0.0029,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 380928252.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.324371337890625,
|
|
"epoch": 5.663043478260869,
|
|
"grad_norm": 4.782987590602979,
|
|
"learning_rate": 9.17370177272775e-08,
|
|
"loss": 0.0351,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 381674240.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"entropy": 0.32247161865234375,
|
|
"epoch": 5.673913043478261,
|
|
"grad_norm": 3.5180723425321916,
|
|
"learning_rate": 8.610929893002274e-08,
|
|
"loss": 0.0167,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 382422446.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"entropy": 0.33092498779296875,
|
|
"epoch": 5.684782608695652,
|
|
"grad_norm": 1.3950988718809394,
|
|
"learning_rate": 8.065820886950404e-08,
|
|
"loss": 0.004,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 383168467.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"entropy": 0.3417205810546875,
|
|
"epoch": 5.695652173913043,
|
|
"grad_norm": 3.7361797627838427,
|
|
"learning_rate": 7.538394348397316e-08,
|
|
"loss": 0.0076,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 383859763.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"entropy": 0.35143280029296875,
|
|
"epoch": 5.706521739130435,
|
|
"grad_norm": 2.0397382555338406,
|
|
"learning_rate": 7.028669235575714e-08,
|
|
"loss": 0.0195,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 384580605.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 0.32248687744140625,
|
|
"epoch": 5.717391304347826,
|
|
"grad_norm": 3.18158963358261,
|
|
"learning_rate": 6.536663870444382e-08,
|
|
"loss": 0.0169,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 385319103.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"entropy": 0.326385498046875,
|
|
"epoch": 5.728260869565218,
|
|
"grad_norm": 0.4049125003862208,
|
|
"learning_rate": 6.062395938029485e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 386063689.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"entropy": 0.32640838623046875,
|
|
"epoch": 5.739130434782608,
|
|
"grad_norm": 1.5296490606026132,
|
|
"learning_rate": 5.605882485789138e-08,
|
|
"loss": 0.012,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 386792990.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"entropy": 0.3341522216796875,
|
|
"epoch": 5.75,
|
|
"grad_norm": 0.638940000111559,
|
|
"learning_rate": 5.167139923000553e-08,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 387540410.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"entropy": 0.32831573486328125,
|
|
"epoch": 5.760869565217392,
|
|
"grad_norm": 1.0758924654692672,
|
|
"learning_rate": 4.746184020170019e-08,
|
|
"loss": 0.0086,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 388260554.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 0.34026336669921875,
|
|
"epoch": 5.771739130434782,
|
|
"grad_norm": 0.536978083638096,
|
|
"learning_rate": 4.3430299084663006e-08,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 388989748.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"entropy": 0.34828948974609375,
|
|
"epoch": 5.782608695652174,
|
|
"grad_norm": 1.929004828335525,
|
|
"learning_rate": 3.957692079176623e-08,
|
|
"loss": 0.0039,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 389696692.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"entropy": 0.3245391845703125,
|
|
"epoch": 5.793478260869565,
|
|
"grad_norm": 0.5389777595397602,
|
|
"learning_rate": 3.590184383185758e-08,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 390451357.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"entropy": 0.32309722900390625,
|
|
"epoch": 5.804347826086957,
|
|
"grad_norm": 0.6420768785606711,
|
|
"learning_rate": 3.240520030478256e-08,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 391191935.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"entropy": 0.34619140625,
|
|
"epoch": 5.815217391304348,
|
|
"grad_norm": 0.9053141657206187,
|
|
"learning_rate": 2.9087115896635486e-08,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 391910794.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 0.3461151123046875,
|
|
"epoch": 5.826086956521739,
|
|
"grad_norm": 2.561085063695768,
|
|
"learning_rate": 2.5947709875240867e-08,
|
|
"loss": 0.0069,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 392636535.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"entropy": 0.337615966796875,
|
|
"epoch": 5.836956521739131,
|
|
"grad_norm": 1.3288329838673714,
|
|
"learning_rate": 2.298709508586794e-08,
|
|
"loss": 0.0028,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 393371482.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"entropy": 0.34088134765625,
|
|
"epoch": 5.8478260869565215,
|
|
"grad_norm": 4.644430203862523,
|
|
"learning_rate": 2.0205377947174475e-08,
|
|
"loss": 0.0243,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 394104491.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"entropy": 0.33001708984375,
|
|
"epoch": 5.858695652173913,
|
|
"grad_norm": 3.101124689870193,
|
|
"learning_rate": 1.760265844738096e-08,
|
|
"loss": 0.0166,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 394851417.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"entropy": 0.3329925537109375,
|
|
"epoch": 5.869565217391305,
|
|
"grad_norm": 7.351512238428031,
|
|
"learning_rate": 1.5179030140675122e-08,
|
|
"loss": 0.0106,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 395593545.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 0.3298187255859375,
|
|
"epoch": 5.880434782608695,
|
|
"grad_norm": 4.3208242588462396,
|
|
"learning_rate": 1.2934580143851294e-08,
|
|
"loss": 0.0225,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 396323896.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"entropy": 0.34270477294921875,
|
|
"epoch": 5.891304347826087,
|
|
"grad_norm": 2.120230083958054,
|
|
"learning_rate": 1.0869389133178477e-08,
|
|
"loss": 0.0113,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 397067596.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"entropy": 0.3221435546875,
|
|
"epoch": 5.9021739130434785,
|
|
"grad_norm": 5.923579134521112,
|
|
"learning_rate": 8.983531341500984e-09,
|
|
"loss": 0.0139,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 397827629.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"entropy": 0.3325042724609375,
|
|
"epoch": 5.913043478260869,
|
|
"grad_norm": 4.6785425155892275,
|
|
"learning_rate": 7.277074555567809e-09,
|
|
"loss": 0.0311,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 398566673.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"entropy": 0.3405609130859375,
|
|
"epoch": 5.923913043478261,
|
|
"grad_norm": 1.8622199788219247,
|
|
"learning_rate": 5.750080113598455e-09,
|
|
"loss": 0.0302,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 399281215.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 0.32552337646484375,
|
|
"epoch": 5.934782608695652,
|
|
"grad_norm": 4.5117049691897,
|
|
"learning_rate": 4.40260290307748e-09,
|
|
"loss": 0.0238,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 400015843.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"entropy": 0.343475341796875,
|
|
"epoch": 5.945652173913043,
|
|
"grad_norm": 1.724914269369089,
|
|
"learning_rate": 3.2346913587816275e-09,
|
|
"loss": 0.0219,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 400738262.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"entropy": 0.32178497314453125,
|
|
"epoch": 5.956521739130435,
|
|
"grad_norm": 0.36686516730699115,
|
|
"learning_rate": 2.2463874610378912e-09,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 401480012.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"entropy": 0.32819366455078125,
|
|
"epoch": 5.967391304347826,
|
|
"grad_norm": 4.270932638406214,
|
|
"learning_rate": 1.4377267342158274e-09,
|
|
"loss": 0.0079,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 402216292.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"entropy": 0.3307037353515625,
|
|
"epoch": 5.978260869565218,
|
|
"grad_norm": 6.2860168018954665,
|
|
"learning_rate": 8.087382454502468e-10,
|
|
"loss": 0.0106,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 402958946.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 0.339447021484375,
|
|
"epoch": 5.989130434782608,
|
|
"grad_norm": 4.766809613888067,
|
|
"learning_rate": 3.594446035964927e-10,
|
|
"loss": 0.012,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 403702809.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"entropy": 0.3319854736328125,
|
|
"epoch": 6.0,
|
|
"grad_norm": 0.5426965823730799,
|
|
"learning_rate": 8.986195841609313e-11,
|
|
"loss": 0.0023,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 404444666.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"step": 552,
|
|
"total_flos": 475851760140288.0,
|
|
"train_loss": 0.4284659847488502,
|
|
"train_runtime": 50380.648,
|
|
"train_samples_per_second": 2.218,
|
|
"train_steps_per_second": 0.011
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 552,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 6,
|
|
"save_steps": 46,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 475851760140288.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|