Model: Hyeongwon/P9-split1_only_answer_Qwen3-4B-Base_0402-01-5e-6 Source: Original Platform
5324 lines
148 KiB
JSON
5324 lines
148 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 6.0,
|
|
"eval_steps": 500,
|
|
"global_step": 528,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.5595855712890625,
|
|
"epoch": 0.011363636363636364,
|
|
"grad_norm": 381.7633221456865,
|
|
"learning_rate": 0.0,
|
|
"loss": 8.3191,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 852123.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.5646438598632812,
|
|
"epoch": 0.022727272727272728,
|
|
"grad_norm": 384.6180271880605,
|
|
"learning_rate": 1.8518518518518518e-07,
|
|
"loss": 8.2985,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 1667244.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.5536346435546875,
|
|
"epoch": 0.03409090909090909,
|
|
"grad_norm": 383.29626165109,
|
|
"learning_rate": 3.7037037037037036e-07,
|
|
"loss": 8.3131,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 2503572.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.5472640991210938,
|
|
"epoch": 0.045454545454545456,
|
|
"grad_norm": 384.1270985770701,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": 8.2624,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 3345459.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.5576705932617188,
|
|
"epoch": 0.056818181818181816,
|
|
"grad_norm": 395.68326610795435,
|
|
"learning_rate": 7.407407407407407e-07,
|
|
"loss": 8.0807,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 4166938.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.5459136962890625,
|
|
"epoch": 0.06818181818181818,
|
|
"grad_norm": 392.47836187332365,
|
|
"learning_rate": 9.259259259259259e-07,
|
|
"loss": 8.0151,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 5016940.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.5500946044921875,
|
|
"epoch": 0.07954545454545454,
|
|
"grad_norm": 402.3592451505352,
|
|
"learning_rate": 1.111111111111111e-06,
|
|
"loss": 7.4355,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 5848503.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.5400238037109375,
|
|
"epoch": 0.09090909090909091,
|
|
"grad_norm": 270.5312649845278,
|
|
"learning_rate": 1.2962962962962962e-06,
|
|
"loss": 5.8653,
|
|
"mean_token_accuracy": 0.003906250116415322,
|
|
"num_tokens": 6709898.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.5549163818359375,
|
|
"epoch": 0.10227272727272728,
|
|
"grad_norm": 228.946154453409,
|
|
"learning_rate": 1.4814814814814815e-06,
|
|
"loss": 5.5906,
|
|
"mean_token_accuracy": 0.006510416860692203,
|
|
"num_tokens": 7560854.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.5582351684570312,
|
|
"epoch": 0.11363636363636363,
|
|
"grad_norm": 187.58670277138384,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 5.2685,
|
|
"mean_token_accuracy": 0.014322917093522847,
|
|
"num_tokens": 8391135.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.5673904418945312,
|
|
"epoch": 0.125,
|
|
"grad_norm": 102.9653781365581,
|
|
"learning_rate": 1.8518518518518519e-06,
|
|
"loss": 4.115,
|
|
"mean_token_accuracy": 0.5117187652504072,
|
|
"num_tokens": 9185279.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.5558929443359375,
|
|
"epoch": 0.13636363636363635,
|
|
"grad_norm": 96.60373813990032,
|
|
"learning_rate": 2.037037037037037e-06,
|
|
"loss": 4.0292,
|
|
"mean_token_accuracy": 0.49218751466833055,
|
|
"num_tokens": 10024891.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.5634613037109375,
|
|
"epoch": 0.14772727272727273,
|
|
"grad_norm": 82.81078074953965,
|
|
"learning_rate": 2.222222222222222e-06,
|
|
"loss": 3.8265,
|
|
"mean_token_accuracy": 0.5312500158324838,
|
|
"num_tokens": 10842191.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.5619354248046875,
|
|
"epoch": 0.1590909090909091,
|
|
"grad_norm": 74.59071680304716,
|
|
"learning_rate": 2.4074074074074075e-06,
|
|
"loss": 3.7086,
|
|
"mean_token_accuracy": 0.5039062650175765,
|
|
"num_tokens": 11650475.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.5547027587890625,
|
|
"epoch": 0.17045454545454544,
|
|
"grad_norm": 59.45145903761326,
|
|
"learning_rate": 2.5925925925925925e-06,
|
|
"loss": 3.2698,
|
|
"mean_token_accuracy": 0.5039062650175765,
|
|
"num_tokens": 12464155.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.5290374755859375,
|
|
"epoch": 0.18181818181818182,
|
|
"grad_norm": 58.53812027781114,
|
|
"learning_rate": 2.7777777777777783e-06,
|
|
"loss": 3.204,
|
|
"mean_token_accuracy": 0.5299479324603453,
|
|
"num_tokens": 13346836.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.5463485717773438,
|
|
"epoch": 0.19318181818181818,
|
|
"grad_norm": 57.542412544507386,
|
|
"learning_rate": 2.962962962962963e-06,
|
|
"loss": 3.1529,
|
|
"mean_token_accuracy": 0.5247395989717916,
|
|
"num_tokens": 14174968.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.5584182739257812,
|
|
"epoch": 0.20454545454545456,
|
|
"grad_norm": 57.52665347282901,
|
|
"learning_rate": 3.1481481481481483e-06,
|
|
"loss": 3.0902,
|
|
"mean_token_accuracy": 0.5468750162981451,
|
|
"num_tokens": 14975189.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.5614852905273438,
|
|
"epoch": 0.2159090909090909,
|
|
"grad_norm": 57.53281016106306,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 3.0511,
|
|
"mean_token_accuracy": 0.5286458490882069,
|
|
"num_tokens": 15764524.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.5534286499023438,
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 58.14048343492545,
|
|
"learning_rate": 3.5185185185185187e-06,
|
|
"loss": 2.9643,
|
|
"mean_token_accuracy": 0.5442708495538682,
|
|
"num_tokens": 16594212.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.5393600463867188,
|
|
"epoch": 0.23863636363636365,
|
|
"grad_norm": 57.29516812284279,
|
|
"learning_rate": 3.7037037037037037e-06,
|
|
"loss": 2.9211,
|
|
"mean_token_accuracy": 0.5468750162981451,
|
|
"num_tokens": 17431524.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.554290771484375,
|
|
"epoch": 0.25,
|
|
"grad_norm": 61.67178199646207,
|
|
"learning_rate": 3.88888888888889e-06,
|
|
"loss": 2.9303,
|
|
"mean_token_accuracy": 0.5195312654832378,
|
|
"num_tokens": 18240206.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.543975830078125,
|
|
"epoch": 0.26136363636363635,
|
|
"grad_norm": 61.19499349619627,
|
|
"learning_rate": 4.074074074074074e-06,
|
|
"loss": 2.9146,
|
|
"mean_token_accuracy": 0.5325520992046222,
|
|
"num_tokens": 19067759.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.54351806640625,
|
|
"epoch": 0.2727272727272727,
|
|
"grad_norm": 58.05690393582671,
|
|
"learning_rate": 4.2592592592592596e-06,
|
|
"loss": 2.8641,
|
|
"mean_token_accuracy": 0.5494791830424219,
|
|
"num_tokens": 19896857.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.5491256713867188,
|
|
"epoch": 0.2840909090909091,
|
|
"grad_norm": 57.22079479568823,
|
|
"learning_rate": 4.444444444444444e-06,
|
|
"loss": 2.8347,
|
|
"mean_token_accuracy": 0.5638021001359448,
|
|
"num_tokens": 20712844.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.5425262451171875,
|
|
"epoch": 0.29545454545454547,
|
|
"grad_norm": 57.786592169293364,
|
|
"learning_rate": 4.62962962962963e-06,
|
|
"loss": 2.8192,
|
|
"mean_token_accuracy": 0.537760432693176,
|
|
"num_tokens": 21562110.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.5401382446289062,
|
|
"epoch": 0.3068181818181818,
|
|
"grad_norm": 57.99056805330064,
|
|
"learning_rate": 4.814814814814815e-06,
|
|
"loss": 2.7831,
|
|
"mean_token_accuracy": 0.5468750162981451,
|
|
"num_tokens": 22406352.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.5256195068359375,
|
|
"epoch": 0.3181818181818182,
|
|
"grad_norm": 57.22234236181939,
|
|
"learning_rate": 5e-06,
|
|
"loss": 2.7438,
|
|
"mean_token_accuracy": 0.5520833497866988,
|
|
"num_tokens": 23252892.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.5403366088867188,
|
|
"epoch": 0.32954545454545453,
|
|
"grad_norm": 57.097280204957976,
|
|
"learning_rate": 4.999950848940538e-06,
|
|
"loss": 2.7117,
|
|
"mean_token_accuracy": 0.5520833497866988,
|
|
"num_tokens": 24068796.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.5393600463867188,
|
|
"epoch": 0.3409090909090909,
|
|
"grad_norm": 57.21667456892074,
|
|
"learning_rate": 4.999803397694811e-06,
|
|
"loss": 2.6725,
|
|
"mean_token_accuracy": 0.5638021001359448,
|
|
"num_tokens": 24888182.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.5381240844726562,
|
|
"epoch": 0.3522727272727273,
|
|
"grad_norm": 57.65592126762044,
|
|
"learning_rate": 4.999557652060729e-06,
|
|
"loss": 2.65,
|
|
"mean_token_accuracy": 0.5611979333916679,
|
|
"num_tokens": 25701818.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.5250473022460938,
|
|
"epoch": 0.36363636363636365,
|
|
"grad_norm": 57.94845921985987,
|
|
"learning_rate": 4.9992136217012184e-06,
|
|
"loss": 2.6265,
|
|
"mean_token_accuracy": 0.5559895999031141,
|
|
"num_tokens": 26550803.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.534454345703125,
|
|
"epoch": 0.375,
|
|
"grad_norm": 60.09221471903111,
|
|
"learning_rate": 4.998771320143843e-06,
|
|
"loss": 2.6194,
|
|
"mean_token_accuracy": 0.5455729329260066,
|
|
"num_tokens": 27362749.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.5318450927734375,
|
|
"epoch": 0.38636363636363635,
|
|
"grad_norm": 58.867253178774526,
|
|
"learning_rate": 4.998230764780277e-06,
|
|
"loss": 2.5514,
|
|
"mean_token_accuracy": 0.5781250172294676,
|
|
"num_tokens": 28197226.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.52191162109375,
|
|
"epoch": 0.3977272727272727,
|
|
"grad_norm": 59.28541087193654,
|
|
"learning_rate": 4.9975919768656125e-06,
|
|
"loss": 2.5631,
|
|
"mean_token_accuracy": 0.5664062668802217,
|
|
"num_tokens": 29070075.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.5337066650390625,
|
|
"epoch": 0.4090909090909091,
|
|
"grad_norm": 59.001328074934214,
|
|
"learning_rate": 4.996854981517535e-06,
|
|
"loss": 2.5256,
|
|
"mean_token_accuracy": 0.5716146003687754,
|
|
"num_tokens": 29909320.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.5183563232421875,
|
|
"epoch": 0.42045454545454547,
|
|
"grad_norm": 59.065376167680974,
|
|
"learning_rate": 4.996019807715324e-06,
|
|
"loss": 2.4876,
|
|
"mean_token_accuracy": 0.5677083502523601,
|
|
"num_tokens": 30778680.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.5281143188476562,
|
|
"epoch": 0.4318181818181818,
|
|
"grad_norm": 59.32377062126641,
|
|
"learning_rate": 4.995086488298723e-06,
|
|
"loss": 2.4747,
|
|
"mean_token_accuracy": 0.5598958500195295,
|
|
"num_tokens": 31596240.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.5444107055664062,
|
|
"epoch": 0.4431818181818182,
|
|
"grad_norm": 59.42893671370653,
|
|
"learning_rate": 4.994055059966641e-06,
|
|
"loss": 2.4461,
|
|
"mean_token_accuracy": 0.5690104336244985,
|
|
"num_tokens": 32396599.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.5415267944335938,
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 59.56204485899904,
|
|
"learning_rate": 4.992925563275714e-06,
|
|
"loss": 2.4156,
|
|
"mean_token_accuracy": 0.5755208504851907,
|
|
"num_tokens": 33192180.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.532440185546875,
|
|
"epoch": 0.4659090909090909,
|
|
"grad_norm": 59.54887107492075,
|
|
"learning_rate": 4.991698042638711e-06,
|
|
"loss": 2.3971,
|
|
"mean_token_accuracy": 0.5729166837409139,
|
|
"num_tokens": 34019780.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.5341949462890625,
|
|
"epoch": 0.4772727272727273,
|
|
"grad_norm": 59.74650962478605,
|
|
"learning_rate": 4.990372546322782e-06,
|
|
"loss": 2.3637,
|
|
"mean_token_accuracy": 0.5755208504851907,
|
|
"num_tokens": 34845898.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.5344314575195312,
|
|
"epoch": 0.48863636363636365,
|
|
"grad_norm": 59.949205398945104,
|
|
"learning_rate": 4.988949126447567e-06,
|
|
"loss": 2.3412,
|
|
"mean_token_accuracy": 0.5833333507180214,
|
|
"num_tokens": 35658003.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.519012451171875,
|
|
"epoch": 0.5,
|
|
"grad_norm": 61.0931007069813,
|
|
"learning_rate": 4.987427838983141e-06,
|
|
"loss": 2.3435,
|
|
"mean_token_accuracy": 0.5807291839737445,
|
|
"num_tokens": 36513090.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.5371322631835938,
|
|
"epoch": 0.5113636363636364,
|
|
"grad_norm": 60.86784941074918,
|
|
"learning_rate": 4.985808743747817e-06,
|
|
"loss": 2.3204,
|
|
"mean_token_accuracy": 0.6158854321110994,
|
|
"num_tokens": 37335753.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.537994384765625,
|
|
"epoch": 0.5227272727272727,
|
|
"grad_norm": 60.25561544990646,
|
|
"learning_rate": 4.984091904405793e-06,
|
|
"loss": 2.2697,
|
|
"mean_token_accuracy": 0.7734375107102096,
|
|
"num_tokens": 38171714.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.5361480712890625,
|
|
"epoch": 0.5340909090909091,
|
|
"grad_norm": 60.867084288518306,
|
|
"learning_rate": 4.9822773884646444e-06,
|
|
"loss": 2.2367,
|
|
"mean_token_accuracy": 0.8880208396585658,
|
|
"num_tokens": 39004676.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.54638671875,
|
|
"epoch": 0.5454545454545454,
|
|
"grad_norm": 60.66915590360354,
|
|
"learning_rate": 4.980365267272679e-06,
|
|
"loss": 2.2215,
|
|
"mean_token_accuracy": 0.9257812544237822,
|
|
"num_tokens": 39839454.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.55718994140625,
|
|
"epoch": 0.5568181818181818,
|
|
"grad_norm": 60.73328529237807,
|
|
"learning_rate": 4.97835561601612e-06,
|
|
"loss": 2.1965,
|
|
"mean_token_accuracy": 0.9075520888436586,
|
|
"num_tokens": 40636201.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.549163818359375,
|
|
"epoch": 0.5681818181818182,
|
|
"grad_norm": 60.75274840918011,
|
|
"learning_rate": 4.97624851371616e-06,
|
|
"loss": 2.1713,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 41456439.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.5414886474609375,
|
|
"epoch": 0.5795454545454546,
|
|
"grad_norm": 60.49880992250543,
|
|
"learning_rate": 4.974044043225846e-06,
|
|
"loss": 2.1378,
|
|
"mean_token_accuracy": 0.923177087912336,
|
|
"num_tokens": 42306168.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.5356521606445312,
|
|
"epoch": 0.5909090909090909,
|
|
"grad_norm": 60.520844933288195,
|
|
"learning_rate": 4.9717422912268265e-06,
|
|
"loss": 2.1084,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 43156058.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.5290069580078125,
|
|
"epoch": 0.6022727272727273,
|
|
"grad_norm": 60.45515973060584,
|
|
"learning_rate": 4.969343348225942e-06,
|
|
"loss": 2.0952,
|
|
"mean_token_accuracy": 0.9114583386108279,
|
|
"num_tokens": 44026197.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.5451889038085938,
|
|
"epoch": 0.6136363636363636,
|
|
"grad_norm": 60.15895348971788,
|
|
"learning_rate": 4.966847308551664e-06,
|
|
"loss": 2.0768,
|
|
"mean_token_accuracy": 0.8984375060535967,
|
|
"num_tokens": 44830346.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.543731689453125,
|
|
"epoch": 0.625,
|
|
"grad_norm": 60.141288719361974,
|
|
"learning_rate": 4.9642542703503874e-06,
|
|
"loss": 2.0532,
|
|
"mean_token_accuracy": 0.912760421866551,
|
|
"num_tokens": 45639894.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.5470046997070312,
|
|
"epoch": 0.6363636363636364,
|
|
"grad_norm": 59.994234499334425,
|
|
"learning_rate": 4.961564335582572e-06,
|
|
"loss": 2.0265,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 46453026.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.5457687377929688,
|
|
"epoch": 0.6477272727272727,
|
|
"grad_norm": 59.48482455264734,
|
|
"learning_rate": 4.958777610018734e-06,
|
|
"loss": 1.9859,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 47264316.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.5581283569335938,
|
|
"epoch": 0.6590909090909091,
|
|
"grad_norm": 60.20361031008577,
|
|
"learning_rate": 4.955894203235285e-06,
|
|
"loss": 1.9645,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 48069506.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.5583419799804688,
|
|
"epoch": 0.6704545454545454,
|
|
"grad_norm": 60.67814698892302,
|
|
"learning_rate": 4.952914228610221e-06,
|
|
"loss": 1.9421,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 48869835.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.5423202514648438,
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 59.547065391114195,
|
|
"learning_rate": 4.949837803318672e-06,
|
|
"loss": 1.9176,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 49727599.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.5429611206054688,
|
|
"epoch": 0.6931818181818182,
|
|
"grad_norm": 59.83074013238096,
|
|
"learning_rate": 4.946665048328288e-06,
|
|
"loss": 1.8815,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 50548928.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.5543136596679688,
|
|
"epoch": 0.7045454545454546,
|
|
"grad_norm": 60.55220373965871,
|
|
"learning_rate": 4.943396088394482e-06,
|
|
"loss": 1.8644,
|
|
"mean_token_accuracy": 0.9179687548894435,
|
|
"num_tokens": 51360270.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.5471343994140625,
|
|
"epoch": 0.7159090909090909,
|
|
"grad_norm": 61.796111299701316,
|
|
"learning_rate": 4.940031052055532e-06,
|
|
"loss": 1.8707,
|
|
"mean_token_accuracy": 0.9179687548894435,
|
|
"num_tokens": 52194089.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.5440444946289062,
|
|
"epoch": 0.7272727272727273,
|
|
"grad_norm": 60.40349534727321,
|
|
"learning_rate": 4.936570071627517e-06,
|
|
"loss": 1.8205,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 53031403.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.5430908203125,
|
|
"epoch": 0.7386363636363636,
|
|
"grad_norm": 58.510081093257874,
|
|
"learning_rate": 4.933013283199124e-06,
|
|
"loss": 1.7844,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 53844933.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.5445480346679688,
|
|
"epoch": 0.75,
|
|
"grad_norm": 59.65972342732521,
|
|
"learning_rate": 4.929360826626286e-06,
|
|
"loss": 1.776,
|
|
"mean_token_accuracy": 0.8997395893093199,
|
|
"num_tokens": 54693823.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.53070068359375,
|
|
"epoch": 0.7613636363636364,
|
|
"grad_norm": 58.265622775881965,
|
|
"learning_rate": 4.925612845526691e-06,
|
|
"loss": 1.7339,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 55549536.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.542205810546875,
|
|
"epoch": 0.7727272727272727,
|
|
"grad_norm": 58.30084190912645,
|
|
"learning_rate": 4.921769487274132e-06,
|
|
"loss": 1.702,
|
|
"mean_token_accuracy": 0.9283854209352285,
|
|
"num_tokens": 56378172.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.5559234619140625,
|
|
"epoch": 0.7840909090909091,
|
|
"grad_norm": 58.15114759775454,
|
|
"learning_rate": 4.917830902992716e-06,
|
|
"loss": 1.6686,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 57189636.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.5531768798828125,
|
|
"epoch": 0.7954545454545454,
|
|
"grad_norm": 57.91550671898149,
|
|
"learning_rate": 4.913797247550912e-06,
|
|
"loss": 1.6516,
|
|
"mean_token_accuracy": 0.9179687548894435,
|
|
"num_tokens": 58000089.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.5615463256835938,
|
|
"epoch": 0.8068181818181818,
|
|
"grad_norm": 58.08935001982205,
|
|
"learning_rate": 4.9096686795554725e-06,
|
|
"loss": 1.605,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 58802829.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.5490951538085938,
|
|
"epoch": 0.8181818181818182,
|
|
"grad_norm": 58.154246622201676,
|
|
"learning_rate": 4.90544536134519e-06,
|
|
"loss": 1.597,
|
|
"mean_token_accuracy": 0.9283854209352285,
|
|
"num_tokens": 59625095.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.5558624267578125,
|
|
"epoch": 0.8295454545454546,
|
|
"grad_norm": 58.58076155801676,
|
|
"learning_rate": 4.901127458984516e-06,
|
|
"loss": 1.5516,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 60419806.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.5548248291015625,
|
|
"epoch": 0.8409090909090909,
|
|
"grad_norm": 58.213836732046914,
|
|
"learning_rate": 4.8967151422570314e-06,
|
|
"loss": 1.5206,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 61242019.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.54022216796875,
|
|
"epoch": 0.8522727272727273,
|
|
"grad_norm": 58.4253733216111,
|
|
"learning_rate": 4.89220858465877e-06,
|
|
"loss": 1.4992,
|
|
"mean_token_accuracy": 0.9283854209352285,
|
|
"num_tokens": 62099804.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.5445098876953125,
|
|
"epoch": 0.8636363636363636,
|
|
"grad_norm": 58.47702346737799,
|
|
"learning_rate": 4.887607963391394e-06,
|
|
"loss": 1.4669,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 62934736.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.5456314086914062,
|
|
"epoch": 0.875,
|
|
"grad_norm": 58.489811661673144,
|
|
"learning_rate": 4.882913459355233e-06,
|
|
"loss": 1.4349,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 63755694.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.5397567749023438,
|
|
"epoch": 0.8863636363636364,
|
|
"grad_norm": 58.81980710508508,
|
|
"learning_rate": 4.878125257142165e-06,
|
|
"loss": 1.4201,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 64618678.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.5386886596679688,
|
|
"epoch": 0.8977272727272727,
|
|
"grad_norm": 58.85913765518202,
|
|
"learning_rate": 4.873243545028356e-06,
|
|
"loss": 1.3857,
|
|
"mean_token_accuracy": 0.9335937539581209,
|
|
"num_tokens": 65481725.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.5303268432617188,
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 58.90455120115232,
|
|
"learning_rate": 4.868268514966869e-06,
|
|
"loss": 1.3689,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 66326767.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.5485992431640625,
|
|
"epoch": 0.9204545454545454,
|
|
"grad_norm": 59.37737137216742,
|
|
"learning_rate": 4.8632003625800995e-06,
|
|
"loss": 1.3313,
|
|
"mean_token_accuracy": 0.9388020869810134,
|
|
"num_tokens": 67132451.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.544342041015625,
|
|
"epoch": 0.9318181818181818,
|
|
"grad_norm": 58.42746386237537,
|
|
"learning_rate": 4.858039287152095e-06,
|
|
"loss": 1.2899,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 67951468.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.5399322509765625,
|
|
"epoch": 0.9431818181818182,
|
|
"grad_norm": 59.5285283289149,
|
|
"learning_rate": 4.852785491620716e-06,
|
|
"loss": 1.277,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 68794669.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.5452194213867188,
|
|
"epoch": 0.9545454545454546,
|
|
"grad_norm": 58.86810235822698,
|
|
"learning_rate": 4.847439182569656e-06,
|
|
"loss": 1.2559,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 69610011.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.5454788208007812,
|
|
"epoch": 0.9659090909090909,
|
|
"grad_norm": 58.244417132360944,
|
|
"learning_rate": 4.84200057022032e-06,
|
|
"loss": 1.2393,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 70406656.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.5455780029296875,
|
|
"epoch": 0.9772727272727273,
|
|
"grad_norm": 58.110729897503354,
|
|
"learning_rate": 4.836469868423552e-06,
|
|
"loss": 1.1798,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 71214893.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.549102783203125,
|
|
"epoch": 0.9886363636363636,
|
|
"grad_norm": 57.96764990952516,
|
|
"learning_rate": 4.830847294651236e-06,
|
|
"loss": 1.1639,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 72038768.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.5456390380859375,
|
|
"epoch": 1.0,
|
|
"grad_norm": 58.09143335722103,
|
|
"learning_rate": 4.825133069987737e-06,
|
|
"loss": 1.1471,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 72847782.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.538909912109375,
|
|
"epoch": 1.0113636363636365,
|
|
"grad_norm": 57.38198228541311,
|
|
"learning_rate": 4.819327419121215e-06,
|
|
"loss": 1.1177,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 73701972.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.5499267578125,
|
|
"epoch": 1.0227272727272727,
|
|
"grad_norm": 57.003608180265964,
|
|
"learning_rate": 4.81343057033478e-06,
|
|
"loss": 1.0763,
|
|
"mean_token_accuracy": 0.9388020869810134,
|
|
"num_tokens": 74539661.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.5374908447265625,
|
|
"epoch": 1.0340909090909092,
|
|
"grad_norm": 57.80822364026077,
|
|
"learning_rate": 4.8074427554975235e-06,
|
|
"loss": 1.0644,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 75393325.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.5450515747070312,
|
|
"epoch": 1.0454545454545454,
|
|
"grad_norm": 56.68961214202094,
|
|
"learning_rate": 4.8013642100554034e-06,
|
|
"loss": 1.0258,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 76230771.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.5306625366210938,
|
|
"epoch": 1.0568181818181819,
|
|
"grad_norm": 56.74441180261517,
|
|
"learning_rate": 4.795195173021976e-06,
|
|
"loss": 1.0344,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 77097214.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.51495361328125,
|
|
"epoch": 1.0681818181818181,
|
|
"grad_norm": 56.46006981310896,
|
|
"learning_rate": 4.7889358869690065e-06,
|
|
"loss": 0.9933,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 78024772.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.5518417358398438,
|
|
"epoch": 1.0795454545454546,
|
|
"grad_norm": 57.22442413009565,
|
|
"learning_rate": 4.782586598016928e-06,
|
|
"loss": 0.9509,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 78828734.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.53485107421875,
|
|
"epoch": 1.0909090909090908,
|
|
"grad_norm": 55.811186941838926,
|
|
"learning_rate": 4.776147555825164e-06,
|
|
"loss": 0.9158,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 79685825.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.542236328125,
|
|
"epoch": 1.1022727272727273,
|
|
"grad_norm": 56.18995031350287,
|
|
"learning_rate": 4.769619013582309e-06,
|
|
"loss": 0.9235,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 80538841.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.5271377563476562,
|
|
"epoch": 1.1136363636363635,
|
|
"grad_norm": 55.596172230629556,
|
|
"learning_rate": 4.7630012279961805e-06,
|
|
"loss": 0.871,
|
|
"mean_token_accuracy": 0.9466145865153521,
|
|
"num_tokens": 81406964.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.5487060546875,
|
|
"epoch": 1.125,
|
|
"grad_norm": 55.22959755493109,
|
|
"learning_rate": 4.7562944592837145e-06,
|
|
"loss": 0.8578,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 82228992.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.55682373046875,
|
|
"epoch": 1.1363636363636362,
|
|
"grad_norm": 54.97844804028525,
|
|
"learning_rate": 4.749498971160742e-06,
|
|
"loss": 0.8207,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 83019198.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.5458526611328125,
|
|
"epoch": 1.1477272727272727,
|
|
"grad_norm": 54.971653184869254,
|
|
"learning_rate": 4.742615030831615e-06,
|
|
"loss": 0.8323,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 83850145.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.554840087890625,
|
|
"epoch": 1.1590909090909092,
|
|
"grad_norm": 54.786840015792265,
|
|
"learning_rate": 4.735642908978704e-06,
|
|
"loss": 0.804,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 84659245.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.5523223876953125,
|
|
"epoch": 1.1704545454545454,
|
|
"grad_norm": 53.614665578810396,
|
|
"learning_rate": 4.728582879751746e-06,
|
|
"loss": 0.7812,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 85506138.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.544036865234375,
|
|
"epoch": 1.1818181818181819,
|
|
"grad_norm": 54.85928977071388,
|
|
"learning_rate": 4.721435220757078e-06,
|
|
"loss": 0.7617,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 86335548.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.5494613647460938,
|
|
"epoch": 1.1931818181818181,
|
|
"grad_norm": 54.072025328393615,
|
|
"learning_rate": 4.714200213046707e-06,
|
|
"loss": 0.7409,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 87158155.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.5440139770507812,
|
|
"epoch": 1.2045454545454546,
|
|
"grad_norm": 52.66696077779264,
|
|
"learning_rate": 4.706878141107269e-06,
|
|
"loss": 0.7092,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 87999932.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.5469131469726562,
|
|
"epoch": 1.2159090909090908,
|
|
"grad_norm": 53.71747331748587,
|
|
"learning_rate": 4.699469292848839e-06,
|
|
"loss": 0.7042,
|
|
"mean_token_accuracy": 0.9283854209352285,
|
|
"num_tokens": 88819905.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.5530014038085938,
|
|
"epoch": 1.2272727272727273,
|
|
"grad_norm": 52.00345160403972,
|
|
"learning_rate": 4.691973959593609e-06,
|
|
"loss": 0.6665,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 89634909.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.568328857421875,
|
|
"epoch": 1.2386363636363638,
|
|
"grad_norm": 53.47004513594076,
|
|
"learning_rate": 4.6843924360644385e-06,
|
|
"loss": 0.6714,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 90402321.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.5533065795898438,
|
|
"epoch": 1.25,
|
|
"grad_norm": 48.99806983812518,
"learning_rate": 4.676725020373255e-06,
"loss": 0.6269,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 91203638.0,
"step": 110
},
{
"entropy": 0.543212890625,
"epoch": 1.2613636363636362,
"grad_norm": 49.403098540554055,
"learning_rate": 4.6689720140093445e-06,
"loss": 0.6146,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 92020252.0,
"step": 111
},
{
"entropy": 0.5560073852539062,
"epoch": 1.2727272727272727,
"grad_norm": 46.54252333124646,
"learning_rate": 4.661133721827487e-06,
"loss": 0.5562,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 92816807.0,
"step": 112
},
{
"entropy": 0.5302276611328125,
"epoch": 1.2840909090909092,
"grad_norm": 45.66376732399781,
"learning_rate": 4.653210452035974e-06,
"loss": 0.5397,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 93649991.0,
"step": 113
},
{
"entropy": 0.5438156127929688,
"epoch": 1.2954545454545454,
"grad_norm": 45.87602686823475,
"learning_rate": 4.645202516184492e-06,
"loss": 0.5526,
"mean_token_accuracy": 0.9322916707023978,
"num_tokens": 94459730.0,
"step": 114
},
{
"entropy": 0.5356369018554688,
"epoch": 1.3068181818181819,
"grad_norm": 44.56282799528236,
"learning_rate": 4.6371102291518635e-06,
"loss": 0.5097,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 95302339.0,
"step": 115
},
{
"entropy": 0.5306625366210938,
"epoch": 1.3181818181818181,
"grad_norm": 42.809418293734176,
"learning_rate": 4.628933909133674e-06,
"loss": 0.4839,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 96149112.0,
"step": 116
},
{
"entropy": 0.5399169921875,
"epoch": 1.3295454545454546,
"grad_norm": 43.262845013372875,
"learning_rate": 4.620673877629757e-06,
"loss": 0.4791,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 96974920.0,
"step": 117
},
{
"entropy": 0.5676727294921875,
"epoch": 1.3409090909090908,
"grad_norm": 41.54182990822529,
"learning_rate": 4.612330459431552e-06,
"loss": 0.453,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 97741872.0,
"step": 118
},
{
"entropy": 0.5345535278320312,
"epoch": 1.3522727272727273,
"grad_norm": 42.1177627835191,
"learning_rate": 4.603903982609334e-06,
"loss": 0.4717,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 98575899.0,
"step": 119
},
{
"entropy": 0.5326919555664062,
"epoch": 1.3636363636363638,
"grad_norm": 43.3669026761141,
"learning_rate": 4.595394778499314e-06,
"loss": 0.5153,
"mean_token_accuracy": 0.901041672565043,
"num_tokens": 99431810.0,
"step": 120
},
{
"entropy": 0.5554275512695312,
"epoch": 1.375,
"grad_norm": 36.50419461925312,
"learning_rate": 4.586803181690609e-06,
"loss": 0.435,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 100229559.0,
"step": 121
},
{
"entropy": 0.5372390747070312,
"epoch": 1.3863636363636362,
"grad_norm": 36.592474361723475,
"learning_rate": 4.5781295300120885e-06,
"loss": 0.4384,
"mean_token_accuracy": 0.912760421866551,
"num_tokens": 101066027.0,
"step": 122
},
{
"entropy": 0.5535049438476562,
"epoch": 1.3977272727272727,
"grad_norm": 36.245607196315184,
"learning_rate": 4.569374164519088e-06,
"loss": 0.4139,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 101861078.0,
"step": 123
},
{
"entropy": 0.5516357421875,
"epoch": 1.4090909090909092,
"grad_norm": 32.59705233863274,
"learning_rate": 4.560537429479998e-06,
"loss": 0.3721,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 102654586.0,
"step": 124
},
{
"entropy": 0.5359344482421875,
"epoch": 1.4204545454545454,
"grad_norm": 31.92040372605121,
"learning_rate": 4.5516196723627325e-06,
"loss": 0.3577,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 103497056.0,
"step": 125
},
{
"entropy": 0.5435409545898438,
"epoch": 1.4318181818181819,
"grad_norm": 30.46099601858736,
"learning_rate": 4.542621243821058e-06,
"loss": 0.3295,
"mean_token_accuracy": 0.9570312525611371,
"num_tokens": 104315903.0,
"step": 126
},
{
"entropy": 0.554229736328125,
"epoch": 1.4431818181818181,
"grad_norm": 32.04665107939428,
"learning_rate": 4.533542497680811e-06,
"loss": 0.3545,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 105097772.0,
"step": 127
},
{
"entropy": 0.5345306396484375,
"epoch": 1.4545454545454546,
"grad_norm": 32.593755043031194,
"learning_rate": 4.524383790925987e-06,
"loss": 0.3498,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 105938088.0,
"step": 128
},
{
"entropy": 0.560516357421875,
"epoch": 1.4659090909090908,
"grad_norm": 27.86040213744899,
"learning_rate": 4.515145483684696e-06,
"loss": 0.2999,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 106725926.0,
"step": 129
},
{
"entropy": 0.5250091552734375,
"epoch": 1.4772727272727273,
"grad_norm": 31.53561853505193,
"learning_rate": 4.505827939215009e-06,
"loss": 0.338,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 107586088.0,
"step": 130
},
{
"entropy": 0.545989990234375,
"epoch": 1.4886363636363638,
"grad_norm": 25.96701371896309,
"learning_rate": 4.496431523890673e-06,
"loss": 0.2851,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 108406686.0,
"step": 131
},
{
"entropy": 0.5378570556640625,
"epoch": 1.5,
"grad_norm": 26.273437673336062,
"learning_rate": 4.486956607186702e-06,
"loss": 0.291,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 109266458.0,
"step": 132
},
{
"entropy": 0.5393905639648438,
"epoch": 1.5113636363636362,
"grad_norm": 24.208179077445585,
"learning_rate": 4.477403561664852e-06,
"loss": 0.2684,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 110097799.0,
"step": 133
},
{
"entropy": 0.5587005615234375,
"epoch": 1.5227272727272727,
"grad_norm": 28.135497638609476,
"learning_rate": 4.467772762958968e-06,
"loss": 0.2883,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 110885530.0,
"step": 134
},
{
"entropy": 0.5532913208007812,
"epoch": 1.5340909090909092,
"grad_norm": 20.865989221665703,
"learning_rate": 4.458064589760221e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 111709692.0,
"step": 135
},
{
"entropy": 0.5501937866210938,
"epoch": 1.5454545454545454,
"grad_norm": 24.535283552721097,
"learning_rate": 4.448279423802207e-06,
"loss": 0.2446,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 112521769.0,
"step": 136
},
{
"entropy": 0.5506134033203125,
"epoch": 1.5568181818181817,
"grad_norm": 19.120665717121184,
"learning_rate": 4.438417649845946e-06,
"loss": 0.208,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 113374792.0,
"step": 137
},
{
"entropy": 0.5475006103515625,
"epoch": 1.5681818181818183,
"grad_norm": 21.153400031877272,
"learning_rate": 4.428479655664748e-06,
"loss": 0.217,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 114209235.0,
"step": 138
},
{
"entropy": 0.54583740234375,
"epoch": 1.5795454545454546,
"grad_norm": 18.33572562599262,
"learning_rate": 4.4184658320289675e-06,
"loss": 0.2144,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 115024770.0,
"step": 139
},
{
"entropy": 0.5428314208984375,
"epoch": 1.5909090909090908,
"grad_norm": 17.275245293829077,
"learning_rate": 4.408376572690638e-06,
"loss": 0.1946,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 115851204.0,
"step": 140
},
{
"entropy": 0.5377197265625,
"epoch": 1.6022727272727273,
"grad_norm": 18.38711220300695,
"learning_rate": 4.3982122743679875e-06,
"loss": 0.2152,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 116699283.0,
"step": 141
},
{
"entropy": 0.5587234497070312,
"epoch": 1.6136363636363638,
"grad_norm": 15.030526444653805,
"learning_rate": 4.387973336729841e-06,
"loss": 0.1849,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 117503423.0,
"step": 142
},
{
"entropy": 0.5555343627929688,
"epoch": 1.625,
"grad_norm": 49.09119160394531,
"learning_rate": 4.377660162379904e-06,
"loss": 0.1757,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 118291651.0,
"step": 143
},
{
"entropy": 0.5510406494140625,
"epoch": 1.6363636363636362,
"grad_norm": 17.06703017323922,
"learning_rate": 4.3672731568409344e-06,
"loss": 0.1835,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 119098075.0,
"step": 144
},
{
"entropy": 0.5520248413085938,
"epoch": 1.6477272727272727,
"grad_norm": 14.519167602295752,
"learning_rate": 4.3568127285387925e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 119888254.0,
"step": 145
},
{
"entropy": 0.5263671875,
"epoch": 1.6590909090909092,
"grad_norm": 15.842685933217664,
"learning_rate": 4.346279288786387e-06,
"loss": 0.1841,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 120730204.0,
"step": 146
},
{
"entropy": 0.546875,
"epoch": 1.6704545454545454,
"grad_norm": 14.102138793551976,
"learning_rate": 4.3356732517674935e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 121547571.0,
"step": 147
},
{
"entropy": 0.5335464477539062,
"epoch": 1.6818181818181817,
"grad_norm": 13.42344794507078,
"learning_rate": 4.32499503452048e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9570312525611371,
"num_tokens": 122398891.0,
"step": 148
},
{
"entropy": 0.5406112670898438,
"epoch": 1.6931818181818183,
"grad_norm": 16.777966557638916,
"learning_rate": 4.314245056921899e-06,
"loss": 0.2101,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 123217975.0,
"step": 149
},
{
"entropy": 0.5275344848632812,
"epoch": 1.7045454545454546,
"grad_norm": 12.73456653618887,
"learning_rate": 4.303423741669978e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 124067259.0,
"step": 150
},
{
"entropy": 0.5336456298828125,
"epoch": 1.7159090909090908,
"grad_norm": 14.693936825241831,
"learning_rate": 4.292531514268008e-06,
"loss": 0.1729,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 124928545.0,
"step": 151
},
{
"entropy": 0.5305709838867188,
"epoch": 1.7272727272727273,
"grad_norm": 10.728134350130471,
"learning_rate": 4.281568803007601e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 125802700.0,
"step": 152
},
{
"entropy": 0.5507659912109375,
"epoch": 1.7386363636363638,
"grad_norm": 11.357422242866724,
"learning_rate": 4.270536038951855e-06,
"loss": 0.1455,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 126615551.0,
"step": 153
},
{
"entropy": 0.528594970703125,
"epoch": 1.75,
"grad_norm": 13.659522995364625,
"learning_rate": 4.259433655918404e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 127457356.0,
"step": 154
},
{
"entropy": 0.52630615234375,
"epoch": 1.7613636363636362,
"grad_norm": 11.98866784293711,
"learning_rate": 4.24826209046236e-06,
"loss": 0.146,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 128340971.0,
"step": 155
},
{
"entropy": 0.5377883911132812,
"epoch": 1.7727272727272727,
"grad_norm": 10.302404549159972,
"learning_rate": 4.237021781859143e-06,
"loss": 0.1488,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 129194640.0,
"step": 156
},
{
"entropy": 0.54241943359375,
"epoch": 1.7840909090909092,
"grad_norm": 12.40519661964789,
"learning_rate": 4.225713172087216e-06,
"loss": 0.148,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 130035771.0,
"step": 157
},
{
"entropy": 0.5303497314453125,
"epoch": 1.7954545454545454,
"grad_norm": 8.481343011266665,
"learning_rate": 4.2143367058107e-06,
"loss": 0.1295,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 130925665.0,
"step": 158
},
{
"entropy": 0.5450363159179688,
"epoch": 1.8068181818181817,
"grad_norm": 7.512602992919226,
"learning_rate": 4.202892830361892e-06,
"loss": 0.1347,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 131762678.0,
"step": 159
},
{
"entropy": 0.53131103515625,
"epoch": 1.8181818181818183,
"grad_norm": 8.20500103400398,
"learning_rate": 4.191381995723672e-06,
"loss": 0.143,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 132615569.0,
"step": 160
},
{
"entropy": 0.5467453002929688,
"epoch": 1.8295454545454546,
"grad_norm": 12.101821695010367,
"learning_rate": 4.179804654511816e-06,
"loss": 0.155,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 133443981.0,
"step": 161
},
{
"entropy": 0.539459228515625,
"epoch": 1.8409090909090908,
"grad_norm": 7.20954852117008,
"learning_rate": 4.168161261957192e-06,
"loss": 0.1236,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 134283417.0,
"step": 162
},
{
"entropy": 0.5276107788085938,
"epoch": 1.8522727272727273,
"grad_norm": 12.45647994209618,
"learning_rate": 4.1564522758878656e-06,
"loss": 0.1541,
"mean_token_accuracy": 0.9414062534924597,
"num_tokens": 135129505.0,
"step": 163
},
{
"entropy": 0.525909423828125,
"epoch": 1.8636363636363638,
"grad_norm": 11.397437854508587,
"learning_rate": 4.144678156711091e-06,
"loss": 0.1648,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 135982474.0,
"step": 164
},
{
"entropy": 0.5355148315429688,
"epoch": 1.875,
"grad_norm": 9.052055602010471,
"learning_rate": 4.132839367395215e-06,
"loss": 0.1254,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 136810572.0,
"step": 165
},
{
"entropy": 0.544097900390625,
"epoch": 1.8863636363636362,
"grad_norm": 12.358520941926209,
"learning_rate": 4.120936373451467e-06,
"loss": 0.1435,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 137623231.0,
"step": 166
},
{
"entropy": 0.5360260009765625,
"epoch": 1.8977272727272727,
"grad_norm": 5.240176699801722,
"learning_rate": 4.108969642915658e-06,
"loss": 0.125,
"mean_token_accuracy": 0.9570312525611371,
"num_tokens": 138474048.0,
"step": 167
},
{
"entropy": 0.533233642578125,
"epoch": 1.9090909090909092,
"grad_norm": 10.779052698306451,
"learning_rate": 4.096939646329775e-06,
"loss": 0.135,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 139318039.0,
"step": 168
},
{
"entropy": 0.5390167236328125,
"epoch": 1.9204545454545454,
"grad_norm": 6.710289531825517,
"learning_rate": 4.08484685672348e-06,
"loss": 0.1308,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 140136159.0,
"step": 169
},
{
"entropy": 0.5381927490234375,
"epoch": 1.9318181818181817,
"grad_norm": 13.851350819778412,
"learning_rate": 4.07269174959551e-06,
"loss": 0.1462,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 140989018.0,
"step": 170
},
{
"entropy": 0.5294189453125,
"epoch": 1.9431818181818183,
"grad_norm": 5.416353381363566,
"learning_rate": 4.06047480289498e-06,
"loss": 0.1009,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 141834446.0,
"step": 171
},
{
"entropy": 0.5440444946289062,
"epoch": 1.9545454545454546,
"grad_norm": 12.01909827068296,
"learning_rate": 4.0481964970025885e-06,
"loss": 0.1342,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 142637643.0,
"step": 172
},
{
"entropy": 0.5352020263671875,
"epoch": 1.9659090909090908,
"grad_norm": 7.410850925326847,
"learning_rate": 4.035857314711729e-06,
"loss": 0.1064,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 143446972.0,
"step": 173
},
{
"entropy": 0.5480270385742188,
"epoch": 1.9772727272727273,
"grad_norm": 10.80598929383265,
"learning_rate": 4.023457741209509e-06,
"loss": 0.1294,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 144219843.0,
"step": 174
},
{
"entropy": 0.5273895263671875,
"epoch": 1.9886363636363638,
"grad_norm": 11.406603208620567,
"learning_rate": 4.0109982640576676e-06,
"loss": 0.1345,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 145065280.0,
"step": 175
},
{
"entropy": 0.533416748046875,
"epoch": 2.0,
"grad_norm": 5.403077497706771,
"learning_rate": 3.998479373173406e-06,
"loss": 0.1099,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 145884441.0,
"step": 176
},
{
"entropy": 0.5384902954101562,
"epoch": 2.0113636363636362,
"grad_norm": 9.064993783573478,
"learning_rate": 3.985901560810126e-06,
"loss": 0.1228,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 146692231.0,
"step": 177
},
{
"entropy": 0.527313232421875,
"epoch": 2.022727272727273,
"grad_norm": 5.7544500718889,
"learning_rate": 3.973265321538069e-06,
"loss": 0.106,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 147528006.0,
"step": 178
},
{
"entropy": 0.5183944702148438,
"epoch": 2.034090909090909,
"grad_norm": 4.613351890776185,
"learning_rate": 3.960571152224872e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 148417260.0,
"step": 179
},
{
"entropy": 0.541961669921875,
"epoch": 2.0454545454545454,
"grad_norm": 3.8462617773274728,
"learning_rate": 3.9478195520160355e-06,
"loss": 0.0756,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 149242794.0,
"step": 180
},
{
"entropy": 0.524169921875,
"epoch": 2.0568181818181817,
"grad_norm": 5.4596119310313185,
"learning_rate": 3.935011022315284e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 150098081.0,
"step": 181
},
{
"entropy": 0.5390472412109375,
"epoch": 2.0681818181818183,
"grad_norm": 10.39377561593915,
"learning_rate": 3.922146066764863e-06,
"loss": 0.1071,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 150917507.0,
"step": 182
},
{
"entropy": 0.525787353515625,
"epoch": 2.0795454545454546,
"grad_norm": 13.4198283536437,
"learning_rate": 3.9092251912257286e-06,
"loss": 0.1439,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 151774062.0,
"step": 183
},
{
"entropy": 0.5273284912109375,
"epoch": 2.090909090909091,
"grad_norm": 8.530055658216385,
"learning_rate": 3.896248903757658e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 152611097.0,
"step": 184
},
{
"entropy": 0.531890869140625,
"epoch": 2.102272727272727,
"grad_norm": 14.787176248320279,
"learning_rate": 3.883217714599273e-06,
"loss": 0.1179,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 153466669.0,
"step": 185
},
{
"entropy": 0.5436248779296875,
"epoch": 2.1136363636363638,
"grad_norm": 9.750570935700242,
"learning_rate": 3.870132136147977e-06,
"loss": 0.0984,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 154298660.0,
"step": 186
},
{
"entropy": 0.5355148315429688,
"epoch": 2.125,
"grad_norm": 17.447751788939698,
"learning_rate": 3.856992682939803e-06,
"loss": 0.1534,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 155130292.0,
"step": 187
},
{
"entropy": 0.538909912109375,
"epoch": 2.1363636363636362,
"grad_norm": 14.892648839395319,
"learning_rate": 3.84379987162919e-06,
"loss": 0.1339,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 155975863.0,
"step": 188
},
{
"entropy": 0.54864501953125,
"epoch": 2.147727272727273,
"grad_norm": 3.5306413830760657,
"learning_rate": 3.830554220968661e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.967447918606922,
"num_tokens": 156800322.0,
"step": 189
},
{
"entropy": 0.5349349975585938,
"epoch": 2.159090909090909,
"grad_norm": 13.850121100268511,
"learning_rate": 3.817256251788425e-06,
"loss": 0.1411,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 157668790.0,
"step": 190
},
{
"entropy": 0.543304443359375,
"epoch": 2.1704545454545454,
"grad_norm": 12.355866968351991,
"learning_rate": 3.803906486975901e-06,
"loss": 0.1229,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 158496829.0,
"step": 191
},
{
"entropy": 0.5365829467773438,
"epoch": 2.1818181818181817,
"grad_norm": 3.337519584172672,
"learning_rate": 3.790505451455158e-06,
"loss": 0.0812,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 159347676.0,
"step": 192
},
{
"entropy": 0.5381317138671875,
"epoch": 2.1931818181818183,
"grad_norm": 12.868212538236262,
"learning_rate": 3.77705367216627e-06,
"loss": 0.1274,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 160186343.0,
"step": 193
},
{
"entropy": 0.5421295166015625,
"epoch": 2.2045454545454546,
"grad_norm": 11.400363668337347,
"learning_rate": 3.7635516780446e-06,
"loss": 0.136,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 160994811.0,
"step": 194
},
{
"entropy": 0.5428924560546875,
"epoch": 2.215909090909091,
"grad_norm": 4.068567700010866,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0905,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 161804760.0,
"step": 195
},
{
"entropy": 0.525787353515625,
"epoch": 2.227272727272727,
"grad_norm": 7.662109925607153,
"learning_rate": 3.7363991708959386e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 162669635.0,
"step": 196
},
{
"entropy": 0.5388870239257812,
"epoch": 2.2386363636363638,
"grad_norm": 6.488639817122286,
"learning_rate": 3.7227497255285416e-06,
"loss": 0.0965,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 163509429.0,
"step": 197
},
{
"entropy": 0.5268783569335938,
"epoch": 2.25,
"grad_norm": 4.879390086259226,
"learning_rate": 3.709052200605572e-06,
"loss": 0.097,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 164350633.0,
"step": 198
},
{
"entropy": 0.5328140258789062,
"epoch": 2.2613636363636362,
"grad_norm": 8.016614290480945,
"learning_rate": 3.6953071347253167e-06,
"loss": 0.105,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 165187521.0,
"step": 199
},
{
"entropy": 0.5488815307617188,
"epoch": 2.2727272727272725,
"grad_norm": 5.848981338498339,
"learning_rate": 3.6815150683554187e-06,
"loss": 0.0809,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 166006751.0,
"step": 200
},
{
"entropy": 0.5378341674804688,
"epoch": 2.284090909090909,
"grad_norm": 5.298493599070519,
"learning_rate": 3.6676765438116157e-06,
"loss": 0.1057,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 166843238.0,
"step": 201
},
{
"entropy": 0.5397262573242188,
"epoch": 2.2954545454545454,
"grad_norm": 7.197323291540629,
"learning_rate": 3.6537921052364223e-06,
"loss": 0.1094,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 167691986.0,
"step": 202
},
{
"entropy": 0.5311660766601562,
"epoch": 2.3068181818181817,
"grad_norm": 4.266710229403864,
"learning_rate": 3.6398622985777314e-06,
"loss": 0.0743,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 168532989.0,
"step": 203
},
{
"entropy": 0.5288314819335938,
"epoch": 2.3181818181818183,
"grad_norm": 8.448421834056651,
"learning_rate": 3.6258876715673475e-06,
"loss": 0.0813,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 169363455.0,
"step": 204
},
{
"entropy": 0.5425033569335938,
"epoch": 2.3295454545454546,
"grad_norm": 5.07863510651651,
"learning_rate": 3.611868773699449e-06,
"loss": 0.0984,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 170177432.0,
"step": 205
},
{
"entropy": 0.5307388305664062,
"epoch": 2.340909090909091,
"grad_norm": 3.6790168522255255,
"learning_rate": 3.597806156208982e-06,
"loss": 0.0686,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 171014423.0,
"step": 206
},
{
"entropy": 0.5391464233398438,
"epoch": 2.3522727272727275,
"grad_norm": 3.7261987700436667,
"learning_rate": 3.5837003720499853e-06,
"loss": 0.0625,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 171832784.0,
"step": 207
},
{
"entropy": 0.52593994140625,
"epoch": 2.3636363636363638,
"grad_norm": 10.359990795629523,
"learning_rate": 3.569551975873847e-06,
"loss": 0.1121,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 172678098.0,
"step": 208
},
{
"entropy": 0.5261611938476562,
"epoch": 2.375,
"grad_norm": 7.621881324218433,
"learning_rate": 3.555361524007498e-06,
"loss": 0.073,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 173521043.0,
"step": 209
},
{
"entropy": 0.5422592163085938,
"epoch": 2.3863636363636362,
"grad_norm": 8.923292571279323,
"learning_rate": 3.541129574431532e-06,
"loss": 0.0778,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 174330129.0,
"step": 210
},
{
"entropy": 0.52294921875,
"epoch": 2.3977272727272725,
"grad_norm": 7.832050365863067,
"learning_rate": 3.526856686758269e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.967447918606922,
"num_tokens": 175179051.0,
"step": 211
},
{
"entropy": 0.5177383422851562,
"epoch": 2.409090909090909,
"grad_norm": 7.830008454039604,
"learning_rate": 3.51254342220975e-06,
"loss": 0.0794,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 176037909.0,
"step": 212
},
{
"entropy": 0.5296478271484375,
"epoch": 2.4204545454545454,
"grad_norm": 6.8243379606452095,
"learning_rate": 3.4981903435956675e-06,
"loss": 0.0672,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 176860131.0,
"step": 213
},
{
"entropy": 0.5161209106445312,
"epoch": 2.4318181818181817,
"grad_norm": 5.986660832754099,
"learning_rate": 3.4837980152912393e-06,
"loss": 0.0728,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 177705136.0,
"step": 214
},
{
"entropy": 0.5319290161132812,
"epoch": 2.4431818181818183,
"grad_norm": 7.570755754525308,
"learning_rate": 3.4693670032150117e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 178510671.0,
"step": 215
},
{
"entropy": 0.5307540893554688,
"epoch": 2.4545454545454546,
"grad_norm": 5.407915840203935,
"learning_rate": 3.4548978748066115e-06,
"loss": 0.0594,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 179329651.0,
"step": 216
},
{
"entropy": 0.52447509765625,
"epoch": 2.465909090909091,
"grad_norm": 6.447443460350779,
"learning_rate": 3.440391199004431e-06,
"loss": 0.061,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 180158843.0,
"step": 217
},
{
"entropy": 0.5199203491210938,
"epoch": 2.4772727272727275,
"grad_norm": 5.6196857220042835,
"learning_rate": 3.4258475462232586e-06,
"loss": 0.0709,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 181003645.0,
"step": 218
},
{
"entropy": 0.5209808349609375,
"epoch": 2.4886363636363638,
"grad_norm": 4.149800169775993,
"learning_rate": 3.4112674883318477e-06,
"loss": 0.0559,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 181838208.0,
"step": 219
},
{
"entropy": 0.519622802734375,
"epoch": 2.5,
"grad_norm": 7.847869214902608,
"learning_rate": 3.3966515986304322e-06,
"loss": 0.0647,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 182681925.0,
"step": 220
},
{
"entropy": 0.5227890014648438,
"epoch": 2.5113636363636362,
"grad_norm": 3.885151215528788,
"learning_rate": 3.3820004518281835e-06,
"loss": 0.0482,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 183519031.0,
"step": 221
},
{
"entropy": 0.5285797119140625,
"epoch": 2.5227272727272725,
"grad_norm": 9.287577782435898,
"learning_rate": 3.367314624020613e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 184358917.0,
"step": 222
},
{
"entropy": 0.513397216796875,
"epoch": 2.534090909090909,
"grad_norm": 10.74097807636218,
"learning_rate": 3.352594692666915e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 185224412.0,
"step": 223
},
{
"entropy": 0.5338287353515625,
"epoch": 2.5454545454545454,
"grad_norm": 3.4282665798632515,
"learning_rate": 3.337841236567268e-06,
"loss": 0.043,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 186048899.0,
"step": 224
},
{
"entropy": 0.5269088745117188,
"epoch": 2.5568181818181817,
"grad_norm": 12.447034376257264,
"learning_rate": 3.32305483584007e-06,
"loss": 0.111,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 186880043.0,
"step": 225
},
{
"entropy": 0.5433578491210938,
"epoch": 2.5681818181818183,
"grad_norm": 10.265085259781317,
"learning_rate": 3.30823607189913e-06,
"loss": 0.0983,
"mean_token_accuracy": 0.9622395855840296,
"num_tokens": 187654506.0,
"step": 226
},
{
"entropy": 0.5324630737304688,
"epoch": 2.5795454545454546,
"grad_norm": 6.24491633738438,
"learning_rate": 3.2933855274308067e-06,
"loss": 0.0744,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 188464920.0,
"step": 227
},
{
"entropy": 0.5325698852539062,
"epoch": 2.590909090909091,
"grad_norm": 9.798838479871074,
"learning_rate": 3.278503786371095e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 189293301.0,
"step": 228
},
{
"entropy": 0.5347824096679688,
"epoch": 2.6022727272727275,
"grad_norm": 12.7274044235036,
"learning_rate": 3.2635914338826665e-06,
"loss": 0.1058,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 190121007.0,
"step": 229
},
{
"entropy": 0.5603790283203125,
"epoch": 2.6136363636363638,
"grad_norm": 7.03460715005398,
"learning_rate": 3.2486490563318605e-06,
"loss": 0.0768,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 190881003.0,
"step": 230
},
{
"entropy": 0.5350341796875,
"epoch": 2.625,
"grad_norm": 3.155224324649305,
"learning_rate": 3.233677241265627e-06,
"loss": 0.0588,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 191699288.0,
"step": 231
},
{
"entropy": 0.5343246459960938,
"epoch": 2.6363636363636362,
"grad_norm": 6.092967080136461,
"learning_rate": 3.218676577388424e-06,
"loss": 0.0673,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 192526668.0,
"step": 232
},
{
"entropy": 0.5334014892578125,
"epoch": 2.6477272727272725,
"grad_norm": 5.321016163063104,
"learning_rate": 3.2036476545390695e-06,
"loss": 0.0702,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 193349142.0,
"step": 233
},
{
"entropy": 0.5334320068359375,
"epoch": 2.659090909090909,
"grad_norm": 3.466858920718226,
"learning_rate": 3.188591063667548e-06,
"loss": 0.0469,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 194176787.0,
"step": 234
},
{
"entropy": 0.53265380859375,
"epoch": 2.6704545454545454,
"grad_norm": 8.83147751793894,
"learning_rate": 3.1735073968117743e-06,
"loss": 0.0596,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 195004474.0,
"step": 235
},
{
"entropy": 0.540924072265625,
"epoch": 2.6818181818181817,
"grad_norm": 12.22958519345781,
"learning_rate": 3.1583972470743123e-06,
"loss": 0.088,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 195807358.0,
"step": 236
},
{
"entropy": 0.5318756103515625,
"epoch": 2.6931818181818183,
"grad_norm": 5.996711651679238,
"learning_rate": 3.1432612085990576e-06,
"loss": 0.0677,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 196617818.0,
"step": 237
},
{
"entropy": 0.5335693359375,
"epoch": 2.7045454545454546,
"grad_norm": 5.0941973108424365,
"learning_rate": 3.1280998765478725e-06,
"loss": 0.0645,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 197430311.0,
"step": 238
},
{
"entropy": 0.528717041015625,
"epoch": 2.715909090909091,
"grad_norm": 5.622809967264127,
"learning_rate": 3.1129138470771823e-06,
"loss": 0.0579,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 198277321.0,
"step": 239
},
{
"entropy": 0.5289077758789062,
"epoch": 2.7272727272727275,
"grad_norm": 5.096620942507479,
"learning_rate": 3.0977037173145387e-06,
"loss": 0.0442,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 199108815.0,
"step": 240
},
{
"entropy": 0.5392990112304688,
"epoch": 2.7386363636363638,
"grad_norm": 4.634448923382875,
"learning_rate": 3.082470085335133e-06,
"loss": 0.0561,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 199899561.0,
"step": 241
},
{
"entropy": 0.5410385131835938,
|
|
"epoch": 2.75,
|
|
"grad_norm": 4.486339153887012,
|
|
"learning_rate": 3.0672135501382894e-06,
|
|
"loss": 0.0724,
|
|
"mean_token_accuracy": 0.9726562516298145,
|
|
"num_tokens": 200725431.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 0.5286865234375,
|
|
"epoch": 2.7613636363636362,
|
|
"grad_norm": 2.9315078295067893,
|
|
"learning_rate": 3.0519347116239e-06,
|
|
"loss": 0.0423,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 201560765.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 0.538360595703125,
|
|
"epoch": 2.7727272727272725,
|
|
"grad_norm": 3.2320025767052982,
|
|
"learning_rate": 3.036634170568847e-06,
|
|
"loss": 0.0401,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 202381503.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 0.5329360961914062,
|
|
"epoch": 2.784090909090909,
|
|
"grad_norm": 5.331727343345033,
|
|
"learning_rate": 3.021312528603371e-06,
|
|
"loss": 0.0533,
|
|
"mean_token_accuracy": 0.9830729176755995,
|
|
"num_tokens": 203203187.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 0.5489501953125,
|
|
"epoch": 2.7954545454545454,
|
|
"grad_norm": 3.3498628859906723,
|
|
"learning_rate": 3.0059703881874232e-06,
|
|
"loss": 0.0357,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 203986073.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 0.5378494262695312,
|
|
"epoch": 2.8068181818181817,
|
|
"grad_norm": 3.8988231949793417,
|
|
"learning_rate": 2.990608352586965e-06,
|
|
"loss": 0.0498,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 204805437.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 0.5403289794921875,
|
|
"epoch": 2.8181818181818183,
|
|
"grad_norm": 5.259116353604022,
|
|
"learning_rate": 2.9752270258502593e-06,
|
|
"loss": 0.056,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 205608028.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 0.5286102294921875,
|
|
"epoch": 2.8295454545454546,
|
|
"grad_norm": 6.195966706572306,
|
|
"learning_rate": 2.959827012784108e-06,
|
|
"loss": 0.048,
|
|
"mean_token_accuracy": 0.9856770841870457,
|
|
"num_tokens": 206426839.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 0.5177841186523438,
|
|
"epoch": 2.840909090909091,
|
|
"grad_norm": 7.2897806046347196,
|
|
"learning_rate": 2.9444089189300783e-06,
|
|
"loss": 0.0588,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 207279265.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.5423431396484375,
|
|
"epoch": 2.8522727272727275,
|
|
"grad_norm": 6.301702261877335,
|
|
"learning_rate": 2.92897335054069e-06,
|
|
"loss": 0.059,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 208074284.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 0.5258865356445312,
|
|
"epoch": 2.8636363636363638,
|
|
"grad_norm": 6.437422190596041,
|
|
"learning_rate": 2.913520914555572e-06,
|
|
"loss": 0.057,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 208920165.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 0.5269393920898438,
|
|
"epoch": 2.875,
|
|
"grad_norm": 4.473468381789689,
|
|
"learning_rate": 2.8980522185776065e-06,
|
|
"loss": 0.0429,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 209763076.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 0.523956298828125,
|
|
"epoch": 2.8863636363636362,
|
|
"grad_norm": 3.685173302105583,
|
|
"learning_rate": 2.882567870849029e-06,
|
|
"loss": 0.0398,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 210578485.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 0.5370407104492188,
|
|
"epoch": 2.8977272727272725,
|
|
"grad_norm": 4.774066475110165,
|
|
"learning_rate": 2.8670684802275173e-06,
|
|
"loss": 0.0368,
|
|
"mean_token_accuracy": 0.9856770841870457,
|
|
"num_tokens": 211388341.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 0.5209121704101562,
|
|
"epoch": 2.909090909090909,
|
|
"grad_norm": 4.220527200517054,
|
|
"learning_rate": 2.8515546561622464e-06,
|
|
"loss": 0.0325,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 212229038.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 0.5267486572265625,
|
|
"epoch": 2.9204545454545454,
|
|
"grad_norm": 4.5755301854337045,
|
|
"learning_rate": 2.8360270086699274e-06,
|
|
"loss": 0.0372,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 213072715.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 0.5250320434570312,
|
|
"epoch": 2.9318181818181817,
|
|
"grad_norm": 4.428753877599414,
|
|
"learning_rate": 2.820486148310822e-06,
|
|
"loss": 0.0376,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 213912498.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 0.534210205078125,
|
|
"epoch": 2.9431818181818183,
|
|
"grad_norm": 4.791248923408223,
|
|
"learning_rate": 2.8049326861647303e-06,
|
|
"loss": 0.0454,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 214726839.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 0.5367584228515625,
|
|
"epoch": 2.9545454545454546,
|
|
"grad_norm": 4.490486917260571,
|
|
"learning_rate": 2.7893672338069666e-06,
|
|
"loss": 0.0418,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 215527901.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.5217819213867188,
|
|
"epoch": 2.965909090909091,
|
|
"grad_norm": 4.610870917217071,
|
|
"learning_rate": 2.7737904032843105e-06,
|
|
"loss": 0.0462,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 216370628.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 0.528656005859375,
|
|
"epoch": 2.9772727272727275,
|
|
"grad_norm": 4.762926208072535,
|
|
"learning_rate": 2.7582028070909415e-06,
|
|
"loss": 0.0343,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 217191647.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 0.5208206176757812,
|
|
"epoch": 2.9886363636363638,
|
|
"grad_norm": 2.6855558389598406,
|
|
"learning_rate": 2.742605058144352e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 218045769.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 0.5156478881835938,
|
|
"epoch": 3.0,
|
|
"grad_norm": 3.8864194579259568,
|
|
"learning_rate": 2.7269977697612515e-06,
|
|
"loss": 0.0274,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 218903155.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 0.5219039916992188,
|
|
"epoch": 3.0113636363636362,
|
|
"grad_norm": 2.7681144892880765,
|
|
"learning_rate": 2.7113815556334478e-06,
|
|
"loss": 0.0143,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 219733127.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 0.5251312255859375,
|
|
"epoch": 3.022727272727273,
|
|
"grad_norm": 4.368355859336438,
|
|
"learning_rate": 2.6957570298037156e-06,
|
|
"loss": 0.0188,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 220560193.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 0.5105361938476562,
|
|
"epoch": 3.034090909090909,
|
|
"grad_norm": 9.088977324112165,
|
|
"learning_rate": 2.680124806641654e-06,
|
|
"loss": 0.036,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 221419000.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 0.5338973999023438,
|
|
"epoch": 3.0454545454545454,
|
|
"grad_norm": 6.018166776800449,
|
|
"learning_rate": 2.664485500819527e-06,
|
|
"loss": 0.0333,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 222209715.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 0.5257644653320312,
|
|
"epoch": 3.0568181818181817,
|
|
"grad_norm": 5.468563141530249,
|
|
"learning_rate": 2.6488397272880943e-06,
|
|
"loss": 0.0287,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 223024902.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 0.5193328857421875,
|
|
"epoch": 3.0681818181818183,
|
|
"grad_norm": 7.295106536435693,
|
|
"learning_rate": 2.633188101252433e-06,
|
|
"loss": 0.0337,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 223854643.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.5175323486328125,
|
|
"epoch": 3.0795454545454546,
|
|
"grad_norm": 6.883191149944938,
|
|
"learning_rate": 2.617531238147744e-06,
|
|
"loss": 0.0501,
|
|
"mean_token_accuracy": 0.9830729176755995,
|
|
"num_tokens": 224719358.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 0.5205154418945312,
|
|
"epoch": 3.090909090909091,
|
|
"grad_norm": 5.88154920217092,
|
|
"learning_rate": 2.6018697536151554e-06,
|
|
"loss": 0.0381,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 225549159.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 0.5368423461914062,
|
|
"epoch": 3.102272727272727,
|
|
"grad_norm": 9.2073262872856,
|
|
"learning_rate": 2.5862042634775125e-06,
|
|
"loss": 0.0618,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 226368587.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 0.53558349609375,
|
|
"epoch": 3.1136363636363638,
|
|
"grad_norm": 8.618806420796583,
|
|
"learning_rate": 2.5705353837151655e-06,
|
|
"loss": 0.0316,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 227156622.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 0.53497314453125,
|
|
"epoch": 3.125,
|
|
"grad_norm": 5.171416458961046,
|
|
"learning_rate": 2.554863730441748e-06,
|
|
"loss": 0.0376,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 227980137.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 0.5346145629882812,
|
|
"epoch": 3.1363636363636362,
|
|
"grad_norm": 3.2259069311735753,
|
|
"learning_rate": 2.5391899198799475e-06,
|
|
"loss": 0.0217,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 228789801.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 0.5227127075195312,
|
|
"epoch": 3.147727272727273,
|
|
"grad_norm": 5.680603759953942,
|
|
"learning_rate": 2.5235145683372813e-06,
|
|
"loss": 0.0445,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 229631483.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 0.5258560180664062,
|
|
"epoch": 3.159090909090909,
|
|
"grad_norm": 3.7672880245068754,
|
|
"learning_rate": 2.507838292181858e-06,
|
|
"loss": 0.0274,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 230473829.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 0.525543212890625,
|
|
"epoch": 3.1704545454545454,
|
|
"grad_norm": 4.446400889769841,
|
|
"learning_rate": 2.4921617078181425e-06,
|
|
"loss": 0.0295,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 231297503.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 0.5142059326171875,
|
|
"epoch": 3.1818181818181817,
|
|
"grad_norm": 8.655584591637458,
|
|
"learning_rate": 2.47648543166272e-06,
|
|
"loss": 0.053,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 232167203.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.53271484375,
|
|
"epoch": 3.1931818181818183,
|
|
"grad_norm": 8.687831510758203,
|
|
"learning_rate": 2.4608100801200533e-06,
|
|
"loss": 0.0455,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 232982992.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 0.5253372192382812,
|
|
"epoch": 3.2045454545454546,
|
|
"grad_norm": 4.055331476139395,
|
|
"learning_rate": 2.445136269558254e-06,
|
|
"loss": 0.0185,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 233794951.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 0.5300216674804688,
|
|
"epoch": 3.215909090909091,
|
|
"grad_norm": 5.006857935808983,
|
|
"learning_rate": 2.4294646162848353e-06,
|
|
"loss": 0.0418,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 234627063.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 0.530303955078125,
|
|
"epoch": 3.227272727272727,
|
|
"grad_norm": 4.2488937867763585,
|
|
"learning_rate": 2.413795736522489e-06,
|
|
"loss": 0.0286,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 235422472.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 0.5381851196289062,
|
|
"epoch": 3.2386363636363638,
|
|
"grad_norm": 3.8677868707482665,
|
|
"learning_rate": 2.3981302463848454e-06,
|
|
"loss": 0.0239,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 236250854.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 0.5477142333984375,
|
|
"epoch": 3.25,
|
|
"grad_norm": 2.4908003048655005,
|
|
"learning_rate": 2.3824687618522567e-06,
|
|
"loss": 0.019,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 237025724.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 0.5311508178710938,
|
|
"epoch": 3.2613636363636362,
|
|
"grad_norm": 2.8300800369105423,
|
|
"learning_rate": 2.366811898747568e-06,
|
|
"loss": 0.0324,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 237847006.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 0.5348968505859375,
|
|
"epoch": 3.2727272727272725,
|
|
"grad_norm": 3.74649110100955,
|
|
"learning_rate": 2.351160272711907e-06,
|
|
"loss": 0.0265,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 238659400.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 0.5213699340820312,
|
|
"epoch": 3.284090909090909,
|
|
"grad_norm": 2.2370472232591556,
|
|
"learning_rate": 2.3355144991804736e-06,
|
|
"loss": 0.0146,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 239520566.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 0.5295486450195312,
|
|
"epoch": 3.2954545454545454,
|
|
"grad_norm": 4.645357526431567,
|
|
"learning_rate": 2.3198751933583463e-06,
|
|
"loss": 0.0251,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 240338169.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.5078277587890625,
|
|
"epoch": 3.3068181818181817,
|
|
"grad_norm": 3.977909281667933,
|
|
"learning_rate": 2.304242970196285e-06,
|
|
"loss": 0.0157,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 241205913.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 0.518096923828125,
|
|
"epoch": 3.3181818181818183,
|
|
"grad_norm": 3.705571268224213,
|
|
"learning_rate": 2.2886184443665522e-06,
|
|
"loss": 0.017,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 242031860.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 0.51983642578125,
|
|
"epoch": 3.3295454545454546,
|
|
"grad_norm": 3.153417353690183,
|
|
"learning_rate": 2.2730022302387493e-06,
|
|
"loss": 0.0209,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 242863430.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 0.5169601440429688,
|
|
"epoch": 3.340909090909091,
|
|
"grad_norm": 5.244316414765351,
|
|
"learning_rate": 2.257394941855648e-06,
|
|
"loss": 0.0194,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 243706209.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 0.5330963134765625,
|
|
"epoch": 3.3522727272727275,
|
|
"grad_norm": 5.974620564211638,
|
|
"learning_rate": 2.2417971929090593e-06,
|
|
"loss": 0.0176,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 244495154.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 0.5119476318359375,
|
|
"epoch": 3.3636363636363638,
|
|
"grad_norm": 6.28569760192753,
|
|
"learning_rate": 2.2262095967156895e-06,
|
|
"loss": 0.0232,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 245361556.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 0.5241546630859375,
|
|
"epoch": 3.375,
|
|
"grad_norm": 2.976749613690859,
|
|
"learning_rate": 2.2106327661930343e-06,
|
|
"loss": 0.0129,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 246179032.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 0.5186386108398438,
|
|
"epoch": 3.3863636363636362,
|
|
"grad_norm": 5.205476897606577,
|
|
"learning_rate": 2.19506731383527e-06,
|
|
"loss": 0.0174,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 247044591.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 0.5159530639648438,
|
|
"epoch": 3.3977272727272725,
|
|
"grad_norm": 4.885033948373779,
|
|
"learning_rate": 2.1795138516891786e-06,
|
|
"loss": 0.0263,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 247849642.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 0.5101547241210938,
|
|
"epoch": 3.409090909090909,
|
|
"grad_norm": 4.276142732793436,
|
|
"learning_rate": 2.163972991330073e-06,
|
|
"loss": 0.0117,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 248687645.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.512420654296875,
|
|
"epoch": 3.4204545454545454,
|
|
"grad_norm": 5.6616807718007065,
|
|
"learning_rate": 2.148445343837755e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 249523174.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 0.5093612670898438,
|
|
"epoch": 3.4318181818181817,
|
|
"grad_norm": 7.495530375469061,
|
|
"learning_rate": 2.1329315197724835e-06,
|
|
"loss": 0.0137,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 250381087.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 0.5103759765625,
|
|
"epoch": 3.4431818181818183,
|
|
"grad_norm": 1.4405232803470114,
|
|
"learning_rate": 2.1174321291509716e-06,
|
|
"loss": 0.0053,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 251231901.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 0.5128860473632812,
|
|
"epoch": 3.4545454545454546,
|
|
"grad_norm": 9.845854165742995,
|
|
"learning_rate": 2.1019477814223943e-06,
|
|
"loss": 0.0327,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 252066427.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 0.5172500610351562,
|
|
"epoch": 3.465909090909091,
|
|
"grad_norm": 6.8859140835256545,
|
|
"learning_rate": 2.086479085444429e-06,
|
|
"loss": 0.0267,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 252898682.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 0.4980316162109375,
|
|
"epoch": 3.4772727272727275,
|
|
"grad_norm": 4.267657568764952,
|
|
"learning_rate": 2.071026649459311e-06,
|
|
"loss": 0.0092,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 253766783.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 0.5067977905273438,
|
|
"epoch": 3.4886363636363638,
|
|
"grad_norm": 7.713129362260799,
|
|
"learning_rate": 2.055591081069922e-06,
|
|
"loss": 0.0193,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 254619003.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 0.5159912109375,
|
|
"epoch": 3.5,
|
|
"grad_norm": 5.82481587389578,
|
|
"learning_rate": 2.040172987215893e-06,
|
|
"loss": 0.0164,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 255449310.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 0.5168304443359375,
|
|
"epoch": 3.5113636363636362,
|
|
"grad_norm": 9.77767804352304,
|
|
"learning_rate": 2.024772974149741e-06,
|
|
"loss": 0.0319,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 256269971.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 0.508331298828125,
|
|
"epoch": 3.5227272727272725,
|
|
"grad_norm": 3.110919919007407,
|
|
"learning_rate": 2.0093916474130354e-06,
|
|
"loss": 0.0083,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 257110990.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.49806976318359375,
|
|
"epoch": 3.534090909090909,
|
|
"grad_norm": 4.028515631346399,
|
|
"learning_rate": 1.9940296118125776e-06,
|
|
"loss": 0.0178,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 257969152.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 0.5093307495117188,
|
|
"epoch": 3.5454545454545454,
|
|
"grad_norm": 4.838995412482845,
|
|
"learning_rate": 1.9786874713966293e-06,
|
|
"loss": 0.0413,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 258787610.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 0.5196762084960938,
|
|
"epoch": 3.5568181818181817,
|
|
"grad_norm": 6.530173083527098,
|
|
"learning_rate": 1.9633658294311535e-06,
|
|
"loss": 0.0311,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 259591761.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 0.5156402587890625,
|
|
"epoch": 3.5681818181818183,
|
|
"grad_norm": 6.929601010585583,
|
|
"learning_rate": 1.9480652883761007e-06,
|
|
"loss": 0.0292,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 260397894.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 0.5229644775390625,
|
|
"epoch": 3.5795454545454546,
|
|
"grad_norm": 7.831867296735421,
|
|
"learning_rate": 1.9327864498617114e-06,
|
|
"loss": 0.0298,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 261200683.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 0.5133056640625,
|
|
"epoch": 3.590909090909091,
|
|
"grad_norm": 3.5816335430908777,
|
|
"learning_rate": 1.9175299146648672e-06,
|
|
"loss": 0.0201,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 262040136.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 0.5169219970703125,
|
|
"epoch": 3.6022727272727275,
|
|
"grad_norm": 3.2198926229429823,
|
|
"learning_rate": 1.9022962826854619e-06,
|
|
"loss": 0.0212,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 262885485.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 0.5167617797851562,
|
|
"epoch": 3.6136363636363638,
|
|
"grad_norm": 2.5731365142380587,
|
|
"learning_rate": 1.887086152922818e-06,
|
|
"loss": 0.024,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 263732921.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 0.5399856567382812,
|
|
"epoch": 3.625,
|
|
"grad_norm": 5.5206910993901355,
|
|
"learning_rate": 1.8719001234521283e-06,
|
|
"loss": 0.0325,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 264525028.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 0.5193099975585938,
|
|
"epoch": 3.6363636363636362,
|
|
"grad_norm": 2.9209940154455545,
|
|
"learning_rate": 1.8567387914009432e-06,
|
|
"loss": 0.0138,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 265358646.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.508575439453125,
|
|
"epoch": 3.6477272727272725,
|
|
"grad_norm": 2.5817093209649444,
|
|
"learning_rate": 1.8416027529256885e-06,
|
|
"loss": 0.0191,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 266215215.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 0.5082931518554688,
|
|
"epoch": 3.659090909090909,
|
|
"grad_norm": 2.503376524092804,
|
|
"learning_rate": 1.8264926031882274e-06,
|
|
"loss": 0.0151,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 267080581.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 0.5336380004882812,
|
|
"epoch": 3.6704545454545454,
|
|
"grad_norm": 2.601198540739444,
|
|
"learning_rate": 1.8114089363324525e-06,
|
|
"loss": 0.0181,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 267854230.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 0.5327835083007812,
|
|
"epoch": 3.6818181818181817,
|
|
"grad_norm": 3.063054591464437,
|
|
"learning_rate": 1.7963523454609317e-06,
|
|
"loss": 0.0094,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 268647574.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 0.5183258056640625,
|
|
"epoch": 3.6931818181818183,
|
|
"grad_norm": 2.758926546908894,
|
|
"learning_rate": 1.7813234226115767e-06,
|
|
"loss": 0.0077,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 269479857.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 0.5243988037109375,
|
|
"epoch": 3.7045454545454546,
|
|
"grad_norm": 4.055775400118274,
|
|
"learning_rate": 1.766322758734374e-06,
|
|
"loss": 0.0242,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 270278460.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 0.5056533813476562,
|
|
"epoch": 3.715909090909091,
|
|
"grad_norm": 3.6447943171210877,
|
|
"learning_rate": 1.75135094366814e-06,
|
|
"loss": 0.0181,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 271125235.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 0.518310546875,
|
|
"epoch": 3.7272727272727275,
|
|
"grad_norm": 6.528608409582633,
|
|
"learning_rate": 1.7364085661173346e-06,
|
|
"loss": 0.0219,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 271947014.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 0.5148544311523438,
|
|
"epoch": 3.7386363636363638,
|
|
"grad_norm": 4.846437727927079,
|
|
"learning_rate": 1.721496213628906e-06,
|
|
"loss": 0.0179,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 272778931.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 0.5037765502929688,
|
|
"epoch": 3.75,
|
|
"grad_norm": 5.614029654133794,
|
|
"learning_rate": 1.7066144725691933e-06,
|
|
"loss": 0.0209,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 273609373.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.5075302124023438,
|
|
"epoch": 3.7613636363636362,
|
|
"grad_norm": 6.557376782306085,
|
|
"learning_rate": 1.6917639281008703e-06,
|
|
"loss": 0.0171,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 274452240.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 0.512725830078125,
|
|
"epoch": 3.7727272727272725,
|
|
"grad_norm": 2.752525740999078,
|
|
"learning_rate": 1.6769451641599305e-06,
|
|
"loss": 0.0112,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 275279923.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 0.5047988891601562,
|
|
"epoch": 3.784090909090909,
|
|
"grad_norm": 3.2366011635785368,
|
|
"learning_rate": 1.6621587634327328e-06,
|
|
"loss": 0.01,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 276099706.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 0.5012435913085938,
|
|
"epoch": 3.7954545454545454,
|
|
"grad_norm": 6.289968133807858,
|
|
"learning_rate": 1.647405307333085e-06,
|
|
"loss": 0.028,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 276948323.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 0.5199050903320312,
|
|
"epoch": 3.8068181818181817,
|
|
"grad_norm": 5.07228106259114,
|
|
"learning_rate": 1.6326853759793878e-06,
|
|
"loss": 0.0227,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 277752317.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 0.5204315185546875,
|
|
"epoch": 3.8181818181818183,
|
|
"grad_norm": 1.8589073814632076,
|
|
"learning_rate": 1.6179995481718165e-06,
|
|
"loss": 0.017,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 278553380.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 0.5152053833007812,
|
|
"epoch": 3.8295454545454546,
|
|
"grad_norm": 5.580405395858779,
|
|
"learning_rate": 1.6033484013695688e-06,
|
|
"loss": 0.0216,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 279380180.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 0.51837158203125,
|
|
"epoch": 3.840909090909091,
|
|
"grad_norm": 1.8203808497415166,
|
|
"learning_rate": 1.588732511668153e-06,
|
|
"loss": 0.0107,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 280190463.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 0.5064926147460938,
|
|
"epoch": 3.8522727272727275,
|
|
"grad_norm": 0.6183424685913169,
|
|
"learning_rate": 1.5741524537767427e-06,
|
|
"loss": 0.0034,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 281038954.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 0.5031051635742188,
|
|
"epoch": 3.8636363636363638,
|
|
"grad_norm": 5.5233826679734195,
|
|
"learning_rate": 1.5596088009955695e-06,
|
|
"loss": 0.0152,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 281892856.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.5198516845703125,
|
|
"epoch": 3.875,
|
|
"grad_norm": 5.607369212208562,
|
|
"learning_rate": 1.5451021251933895e-06,
|
|
"loss": 0.025,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 282714631.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 0.4959564208984375,
|
|
"epoch": 3.8863636363636362,
|
|
"grad_norm": 4.099811537737254,
|
|
"learning_rate": 1.5306329967849887e-06,
|
|
"loss": 0.0242,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 283568255.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 0.507598876953125,
|
|
"epoch": 3.8977272727272725,
|
|
"grad_norm": 2.3164448440263428,
|
|
"learning_rate": 1.5162019847087616e-06,
|
|
"loss": 0.0054,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 284396996.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 0.507171630859375,
|
|
"epoch": 3.909090909090909,
|
|
"grad_norm": 2.8517936981521945,
|
|
"learning_rate": 1.5018096564043333e-06,
|
|
"loss": 0.0097,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 285208621.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 0.50006103515625,
|
|
"epoch": 3.9204545454545454,
|
|
"grad_norm": 4.123129780590149,
|
|
"learning_rate": 1.4874565777902518e-06,
|
|
"loss": 0.0107,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 286068274.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 0.513397216796875,
|
|
"epoch": 3.9318181818181817,
|
|
"grad_norm": 3.52925414394255,
|
|
"learning_rate": 1.4731433132417316e-06,
|
|
"loss": 0.0119,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 286884909.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 0.5006332397460938,
|
|
"epoch": 3.9431818181818183,
|
|
"grad_norm": 4.776748695721814,
|
|
"learning_rate": 1.4588704255684697e-06,
|
|
"loss": 0.0165,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 287730733.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 0.504791259765625,
|
|
"epoch": 3.9545454545454546,
|
|
"grad_norm": 2.537558079581804,
|
|
"learning_rate": 1.4446384759925024e-06,
|
|
"loss": 0.0087,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 288550016.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 0.5134658813476562,
|
|
"epoch": 3.965909090909091,
|
|
"grad_norm": 1.085201575355781,
|
|
"learning_rate": 1.4304480241261529e-06,
|
|
"loss": 0.0044,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 289378888.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 0.47869110107421875,
|
|
"epoch": 3.9772727272727275,
|
|
"grad_norm": 1.4589087196629151,
|
|
"learning_rate": 1.4162996279500158e-06,
|
|
"loss": 0.0051,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 290281736.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.5169143676757812,
|
|
"epoch": 3.9886363636363638,
|
|
"grad_norm": 3.4093651803696177,
|
|
"learning_rate": 1.4021938437910181e-06,
|
|
"loss": 0.0221,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 291064628.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 0.5065155029296875,
|
|
"epoch": 4.0,
|
|
"grad_norm": 3.137419095072893,
|
|
"learning_rate": 1.388131226300552e-06,
|
|
"loss": 0.0179,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 291890322.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 0.5136337280273438,
|
|
"epoch": 4.011363636363637,
|
|
"grad_norm": 1.4892912968011076,
|
|
"learning_rate": 1.374112328432652e-06,
|
|
"loss": 0.0047,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 292701363.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 0.5075454711914062,
|
|
"epoch": 4.0227272727272725,
|
|
"grad_norm": 2.654487334818569,
|
|
"learning_rate": 1.3601377014222688e-06,
|
|
"loss": 0.0149,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 293532137.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 0.49842071533203125,
|
|
"epoch": 4.034090909090909,
|
|
"grad_norm": 3.028217204521398,
|
|
"learning_rate": 1.3462078947635781e-06,
|
|
"loss": 0.0063,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 294375049.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 0.5129852294921875,
|
|
"epoch": 4.045454545454546,
|
|
"grad_norm": 1.9243706005279761,
|
|
"learning_rate": 1.3323234561883847e-06,
|
|
"loss": 0.0041,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 295187394.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 0.5176849365234375,
|
|
"epoch": 4.056818181818182,
|
|
"grad_norm": 2.696144952101861,
|
|
"learning_rate": 1.318484931644582e-06,
|
|
"loss": 0.0131,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 295972208.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 0.48602294921875,
|
|
"epoch": 4.068181818181818,
|
|
"grad_norm": 6.427614860445277,
|
|
"learning_rate": 1.3046928652746833e-06,
|
|
"loss": 0.0062,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 296841049.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 0.48738861083984375,
|
|
"epoch": 4.079545454545454,
|
|
"grad_norm": 4.960528563198844,
|
|
"learning_rate": 1.2909477993944286e-06,
|
|
"loss": 0.0264,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 297687674.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 0.4840240478515625,
|
|
"epoch": 4.090909090909091,
|
|
"grad_norm": 6.171675343056278,
|
|
"learning_rate": 1.2772502744714592e-06,
|
|
"loss": 0.0211,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 298539579.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.49788665771484375,
|
|
"epoch": 4.1022727272727275,
|
|
"grad_norm": 7.239641501067476,
|
|
"learning_rate": 1.2636008291040618e-06,
|
|
"loss": 0.0096,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 299370858.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 0.5033340454101562,
|
|
"epoch": 4.113636363636363,
|
|
"grad_norm": 5.124511979744234,
|
|
"learning_rate": 1.2500000000000007e-06,
|
|
"loss": 0.0206,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 300193508.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 0.494537353515625,
|
|
"epoch": 4.125,
|
|
"grad_norm": 4.156621336632535,
|
|
"learning_rate": 1.236448321955401e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 301045850.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 0.5119476318359375,
|
|
"epoch": 4.136363636363637,
|
|
"grad_norm": 2.0665220666683353,
|
|
"learning_rate": 1.222946327833731e-06,
|
|
"loss": 0.0037,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 301841078.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 0.4991302490234375,
|
|
"epoch": 4.1477272727272725,
|
|
"grad_norm": 7.318250997291169,
|
|
"learning_rate": 1.2094945485448424e-06,
|
|
"loss": 0.0076,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 302689150.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 0.528778076171875,
|
|
"epoch": 4.159090909090909,
|
|
"grad_norm": 2.876075984900272,
|
|
"learning_rate": 1.196093513024099e-06,
|
|
"loss": 0.0131,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 303488060.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 0.501953125,
|
|
"epoch": 4.170454545454546,
|
|
"grad_norm": 7.049810570291405,
|
|
"learning_rate": 1.182743748211576e-06,
|
|
"loss": 0.0165,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 304341157.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 0.496307373046875,
|
|
"epoch": 4.181818181818182,
|
|
"grad_norm": 8.371870138705338,
|
|
"learning_rate": 1.1694457790313403e-06,
|
|
"loss": 0.013,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 305186459.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 0.5236282348632812,
|
|
"epoch": 4.193181818181818,
|
|
"grad_norm": 2.7232406189892493,
|
|
"learning_rate": 1.15620012837081e-06,
|
|
"loss": 0.0105,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 305978157.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 0.5146942138671875,
|
|
"epoch": 4.204545454545454,
|
|
"grad_norm": 1.032258263050008,
|
|
"learning_rate": 1.1430073170601968e-06,
|
|
"loss": 0.01,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 306769045.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.5118789672851562,
|
|
"epoch": 4.215909090909091,
|
|
"grad_norm": 3.0442908381820417,
|
|
"learning_rate": 1.1298678638520247e-06,
|
|
"loss": 0.0062,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 307562505.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 0.496978759765625,
|
|
"epoch": 4.2272727272727275,
|
|
"grad_norm": 2.0148794276847513,
|
|
"learning_rate": 1.1167822854007265e-06,
|
|
"loss": 0.0069,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 308441553.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 0.5107421875,
|
|
"epoch": 4.238636363636363,
|
|
"grad_norm": 0.9950819388070626,
|
|
"learning_rate": 1.1037510962423425e-06,
|
|
"loss": 0.0074,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 309253697.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 0.5004348754882812,
|
|
"epoch": 4.25,
|
|
"grad_norm": 0.4801227777140967,
|
|
"learning_rate": 1.0907748087742716e-06,
|
|
"loss": 0.0027,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 310096440.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 0.4990081787109375,
|
|
"epoch": 4.261363636363637,
|
|
"grad_norm": 1.5559270159153842,
|
|
"learning_rate": 1.0778539332351374e-06,
|
|
"loss": 0.005,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 310946699.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 0.5043411254882812,
|
|
"epoch": 4.2727272727272725,
|
|
"grad_norm": 1.6535039299735041,
|
|
"learning_rate": 1.0649889776847161e-06,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 311780327.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 0.5130233764648438,
|
|
"epoch": 4.284090909090909,
|
|
"grad_norm": 0.9666561955269652,
|
|
"learning_rate": 1.0521804479839651e-06,
|
|
"loss": 0.0107,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 312599293.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 0.506927490234375,
|
|
"epoch": 4.295454545454546,
|
|
"grad_norm": 1.0467474251432374,
|
|
"learning_rate": 1.0394288477751274e-06,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 313433046.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 0.52001953125,
|
|
"epoch": 4.306818181818182,
|
|
"grad_norm": 1.9504543167360786,
|
|
"learning_rate": 1.0267346784619324e-06,
|
|
"loss": 0.0082,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 314231321.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 0.4952545166015625,
|
|
"epoch": 4.318181818181818,
|
|
"grad_norm": 1.1273016731224978,
|
|
"learning_rate": 1.0140984391898744e-06,
|
|
"loss": 0.0104,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 315090907.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.49346160888671875,
|
|
"epoch": 4.329545454545454,
|
|
"grad_norm": 1.1570735678574222,
|
|
"learning_rate": 1.0015206268265948e-06,
|
|
"loss": 0.0034,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 315934937.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 0.5069198608398438,
|
|
"epoch": 4.340909090909091,
|
|
"grad_norm": 1.645370918123862,
|
|
"learning_rate": 9.890017359423326e-07,
|
|
"loss": 0.0152,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 316744484.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 0.5043182373046875,
|
|
"epoch": 4.3522727272727275,
|
|
"grad_norm": 4.167752578085236,
|
|
"learning_rate": 9.765422587904919e-07,
|
|
"loss": 0.0084,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 317569584.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 0.511566162109375,
|
|
"epoch": 4.363636363636363,
|
|
"grad_norm": 4.169768059903615,
|
|
"learning_rate": 9.641426852882717e-07,
|
|
"loss": 0.0078,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 318380963.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 0.5172653198242188,
|
|
"epoch": 4.375,
|
|
"grad_norm": 1.1444633617257265,
|
|
"learning_rate": 9.518035029974127e-07,
|
|
"loss": 0.0029,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 319177808.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 0.5063858032226562,
|
|
"epoch": 4.386363636363637,
|
|
"grad_norm": 3.231652165172302,
|
|
"learning_rate": 9.395251971050206e-07,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 319991752.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 0.5062026977539062,
|
|
"epoch": 4.3977272727272725,
|
|
"grad_norm": 0.4562096042156392,
|
|
"learning_rate": 9.273082504044903e-07,
|
|
"loss": 0.0024,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 320823781.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 0.4913787841796875,
|
|
"epoch": 4.409090909090909,
|
|
"grad_norm": 4.6988922150451975,
|
|
"learning_rate": 9.151531432765204e-07,
|
|
"loss": 0.015,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 321672380.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 0.508453369140625,
|
|
"epoch": 4.420454545454546,
|
|
"grad_norm": 1.933648265502234,
|
|
"learning_rate": 9.030603536702254e-07,
|
|
"loss": 0.0109,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 322475535.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 0.49637603759765625,
|
|
"epoch": 4.431818181818182,
|
|
"grad_norm": 0.4749255618924114,
|
|
"learning_rate": 8.910303570843423e-07,
|
|
"loss": 0.0023,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 323285742.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.5011215209960938,
|
|
"epoch": 4.443181818181818,
|
|
"grad_norm": 0.4458695094810891,
|
|
"learning_rate": 8.790636265485333e-07,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 324121368.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 0.49810791015625,
|
|
"epoch": 4.454545454545454,
|
|
"grad_norm": 1.5274219135461808,
|
|
"learning_rate": 8.67160632604786e-07,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 324973947.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 0.49318695068359375,
|
|
"epoch": 4.465909090909091,
|
|
"grad_norm": 1.1132782159428174,
|
|
"learning_rate": 8.553218432889091e-07,
|
|
"loss": 0.0026,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 325804782.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 0.510528564453125,
|
|
"epoch": 4.4772727272727275,
|
|
"grad_norm": 3.6263901612660328,
|
|
"learning_rate": 8.435477241121354e-07,
|
|
"loss": 0.0049,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 326603097.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 0.48946380615234375,
|
|
"epoch": 4.488636363636363,
|
|
"grad_norm": 0.47538702415452083,
|
|
"learning_rate": 8.31838738042808e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 327455117.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 0.49471282958984375,
|
|
"epoch": 4.5,
|
|
"grad_norm": 5.432484108713521,
|
|
"learning_rate": 8.201953454881844e-07,
|
|
"loss": 0.0191,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 328271844.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 0.49318695068359375,
|
|
"epoch": 4.511363636363637,
|
|
"grad_norm": 2.7195053826099134,
|
|
"learning_rate": 8.086180042763284e-07,
|
|
"loss": 0.0057,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 329105170.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 0.5019912719726562,
|
|
"epoch": 4.5227272727272725,
|
|
"grad_norm": 1.6339953775013658,
|
|
"learning_rate": 7.971071696381089e-07,
|
|
"loss": 0.0067,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 329922811.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 0.5027236938476562,
|
|
"epoch": 4.534090909090909,
|
|
"grad_norm": 1.9153786326147693,
|
|
"learning_rate": 7.856632941893e-07,
|
|
"loss": 0.0028,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 330722929.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 0.4850311279296875,
|
|
"epoch": 4.545454545454545,
|
|
"grad_norm": 1.6886270091316586,
|
|
"learning_rate": 7.74286827912785e-07,
|
|
"loss": 0.0025,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 331559428.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.48662567138671875,
|
|
"epoch": 4.556818181818182,
|
|
"grad_norm": 1.5068421366606104,
|
|
"learning_rate": 7.629782181408574e-07,
|
|
"loss": 0.0065,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 332413515.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 0.482879638671875,
|
|
"epoch": 4.568181818181818,
|
|
"grad_norm": 0.47925368613938213,
|
|
"learning_rate": 7.517379095376418e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 333265076.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 0.49607086181640625,
|
|
"epoch": 4.579545454545455,
|
|
"grad_norm": 1.440219771154895,
|
|
"learning_rate": 7.405663440815968e-07,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 334094385.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 0.490692138671875,
|
|
"epoch": 4.590909090909091,
|
|
"grad_norm": 3.7567750667717665,
|
|
"learning_rate": 7.294639610481461e-07,
|
|
"loss": 0.0037,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 334947835.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 0.49617767333984375,
|
|
"epoch": 4.6022727272727275,
|
|
"grad_norm": 5.25000234097866,
|
|
"learning_rate": 7.184311969924002e-07,
|
|
"loss": 0.0087,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 335753110.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 0.4870147705078125,
|
|
"epoch": 4.613636363636363,
|
|
"grad_norm": 0.2953367174000916,
|
|
"learning_rate": 7.074684857319928e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 336600931.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 0.47736358642578125,
|
|
"epoch": 4.625,
|
|
"grad_norm": 2.92622111202191,
|
|
"learning_rate": 6.965762583300223e-07,
|
|
"loss": 0.004,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 337462613.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 0.494781494140625,
|
|
"epoch": 4.636363636363637,
|
|
"grad_norm": 0.28460107939061985,
|
|
"learning_rate": 6.85754943078103e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 338298847.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 0.49102783203125,
|
|
"epoch": 4.6477272727272725,
|
|
"grad_norm": 0.6146968002181714,
|
|
"learning_rate": 6.750049654795199e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 339113689.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 0.492462158203125,
|
|
"epoch": 4.659090909090909,
|
|
"grad_norm": 3.809716843433178,
|
|
"learning_rate": 6.643267482325061e-07,
|
|
"loss": 0.003,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 339940988.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.5035018920898438,
|
|
"epoch": 4.670454545454545,
|
|
"grad_norm": 1.467212122986146,
|
|
"learning_rate": 6.537207112136143e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 340752578.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 0.5028610229492188,
|
|
"epoch": 4.681818181818182,
|
|
"grad_norm": 5.889457746996348,
|
|
"learning_rate": 6.431872714612072e-07,
|
|
"loss": 0.0105,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 341554983.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 0.5011444091796875,
|
|
"epoch": 4.693181818181818,
|
|
"grad_norm": 5.019388657039859,
|
|
"learning_rate": 6.327268431590664e-07,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 342358183.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.48370361328125,
|
|
"epoch": 4.704545454545455,
|
|
"grad_norm": 3.72166366036877,
|
|
"learning_rate": 6.223398376200956e-07,
|
|
"loss": 0.0034,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 343205856.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 0.48567962646484375,
|
|
"epoch": 4.715909090909091,
|
|
"grad_norm": 1.9949701003957454,
|
|
"learning_rate": 6.1202666327016e-07,
|
|
"loss": 0.0106,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 344033478.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 0.49942779541015625,
|
|
"epoch": 4.7272727272727275,
|
|
"grad_norm": 1.9263163276444206,
|
|
"learning_rate": 6.017877256320132e-07,
|
|
"loss": 0.0102,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 344827009.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 0.4829559326171875,
|
|
"epoch": 4.738636363636363,
|
|
"grad_norm": 0.34749654268128166,
|
|
"learning_rate": 5.916234273093624e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 345670205.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 0.49527740478515625,
|
|
"epoch": 4.75,
|
|
"grad_norm": 1.7904763213822166,
|
|
"learning_rate": 5.815341679710327e-07,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 346471682.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 0.485931396484375,
|
|
"epoch": 4.761363636363637,
|
|
"grad_norm": 0.45454733529461383,
|
|
"learning_rate": 5.715203443352526e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 347308458.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 0.4832916259765625,
|
|
"epoch": 4.7727272727272725,
|
|
"grad_norm": 0.4231547605381226,
|
|
"learning_rate": 5.615823501540546e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 348141445.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.4787139892578125,
|
|
"epoch": 4.784090909090909,
|
|
"grad_norm": 3.404461235555557,
|
|
"learning_rate": 5.51720576197794e-07,
|
|
"loss": 0.0031,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 349005773.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 0.4886322021484375,
|
|
"epoch": 4.795454545454545,
|
|
"grad_norm": 2.694747189659317,
|
|
"learning_rate": 5.419354102397792e-07,
|
|
"loss": 0.0171,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 349829599.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 0.48751068115234375,
|
|
"epoch": 4.806818181818182,
|
|
"grad_norm": 1.4354262979624335,
|
|
"learning_rate": 5.32227237041032e-07,
|
|
"loss": 0.0098,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 350647234.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 0.46784210205078125,
|
|
"epoch": 4.818181818181818,
|
|
"grad_norm": 0.7211976403917981,
|
|
"learning_rate": 5.22596438335149e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 351529720.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 0.48293304443359375,
|
|
"epoch": 4.829545454545455,
|
|
"grad_norm": 0.7921740483961287,
|
|
"learning_rate": 5.130433928132983e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 352366018.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 0.4916839599609375,
|
|
"epoch": 4.840909090909091,
|
|
"grad_norm": 3.255008493711048,
|
|
"learning_rate": 5.035684761093273e-07,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 353187956.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 0.493743896484375,
|
|
"epoch": 4.8522727272727275,
|
|
"grad_norm": 0.4639379515372835,
|
|
"learning_rate": 4.941720607849912e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 354005342.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 0.48528289794921875,
|
|
"epoch": 4.863636363636363,
|
|
"grad_norm": 2.7318226665423335,
|
|
"learning_rate": 4.848545163153048e-07,
|
|
"loss": 0.0106,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 354847121.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 0.47846221923828125,
|
|
"epoch": 4.875,
|
|
"grad_norm": 3.3364962212306226,
|
|
"learning_rate": 4.756162090740135e-07,
|
|
"loss": 0.004,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 355685163.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 0.49137115478515625,
|
|
"epoch": 4.886363636363637,
|
|
"grad_norm": 0.6805978064704994,
|
|
"learning_rate": 4.6645750231918864e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 356523319.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.4839019775390625,
|
|
"epoch": 4.8977272727272725,
|
|
"grad_norm": 0.33583284419497017,
|
|
"learning_rate": 4.5737875617894225e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 357363691.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 0.49709320068359375,
|
|
"epoch": 4.909090909090909,
|
|
"grad_norm": 0.3853403035900075,
|
|
"learning_rate": 4.4838032763726806e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 358180072.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 0.47998809814453125,
|
|
"epoch": 4.920454545454545,
|
|
"grad_norm": 1.590063239763804,
|
|
"learning_rate": 4.394625705200012e-07,
|
|
"loss": 0.0111,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 359024240.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 0.49835205078125,
|
|
"epoch": 4.931818181818182,
|
|
"grad_norm": 5.266170835548258,
|
|
"learning_rate": 4.3062583548091256e-07,
|
|
"loss": 0.0119,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 359847230.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 0.4920501708984375,
|
|
"epoch": 4.943181818181818,
|
|
"grad_norm": 0.27517320814335366,
|
|
"learning_rate": 4.218704699879117e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 360685374.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 0.47795867919921875,
|
|
"epoch": 4.954545454545455,
|
|
"grad_norm": 4.084902711490606,
|
|
"learning_rate": 4.1319681830939124e-07,
|
|
"loss": 0.0131,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 361526820.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 0.48442840576171875,
|
|
"epoch": 4.965909090909091,
|
|
"grad_norm": 0.6198177710303161,
|
|
"learning_rate": 4.0460522150068684e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 362340123.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 0.48004913330078125,
|
|
"epoch": 4.9772727272727275,
|
|
"grad_norm": 0.5029069058309816,
|
|
"learning_rate": 3.9609601739066664e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 363177369.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 0.48892974853515625,
|
|
"epoch": 4.988636363636363,
|
|
"grad_norm": 0.29134437168907484,
|
|
"learning_rate": 3.876695405684486e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 364019483.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 0.5021286010742188,
|
|
"epoch": 5.0,
|
|
"grad_norm": 1.9225403101164527,
|
|
"learning_rate": 3.793261223702441e-07,
|
|
"loss": 0.008,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 364810814.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.4940185546875,
|
|
"epoch": 5.011363636363637,
|
|
"grad_norm": 2.938178788349801,
|
|
"learning_rate": 3.7106609086632635e-07,
|
|
"loss": 0.0058,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 365630038.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 0.489227294921875,
|
|
"epoch": 5.0227272727272725,
|
|
"grad_norm": 0.291480299250409,
|
|
"learning_rate": 3.628897708481377e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 366463667.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 0.4832611083984375,
|
|
"epoch": 5.034090909090909,
|
|
"grad_norm": 3.0078477238914507,
|
|
"learning_rate": 3.5479748381550855e-07,
|
|
"loss": 0.0094,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 367311117.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 0.490234375,
|
|
"epoch": 5.045454545454546,
|
|
"grad_norm": 3.510570730894089,
|
|
"learning_rate": 3.4678954796402624e-07,
|
|
"loss": 0.0035,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 368129489.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 0.4967803955078125,
|
|
"epoch": 5.056818181818182,
|
|
"grad_norm": 0.7674575202655312,
|
|
"learning_rate": 3.388662781725141e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 368926469.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 0.49871826171875,
|
|
"epoch": 5.068181818181818,
|
|
"grad_norm": 0.3305389935908899,
|
|
"learning_rate": 3.310279859906565e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 369725096.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 0.48119354248046875,
|
|
"epoch": 5.079545454545454,
|
|
"grad_norm": 0.6603801299623112,
|
|
"learning_rate": 3.232749796267451e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 370550137.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 0.48421478271484375,
|
|
"epoch": 5.090909090909091,
|
|
"grad_norm": 5.247118592826336,
|
|
"learning_rate": 3.1560756393556187e-07,
|
|
"loss": 0.0057,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 371400483.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 0.49166107177734375,
|
|
"epoch": 5.1022727272727275,
|
|
"grad_norm": 0.27569457066734937,
|
|
"learning_rate": 3.0802604040639034e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 372240572.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 0.47963714599609375,
|
|
"epoch": 5.113636363636363,
|
|
"grad_norm": 0.311204056940979,
|
|
"learning_rate": 3.0053070715116153e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 373094230.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.488861083984375,
|
|
"epoch": 5.125,
|
|
"grad_norm": 0.24417698499128285,
|
|
"learning_rate": 2.9312185889273147e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 373920609.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 0.4911956787109375,
|
|
"epoch": 5.136363636363637,
|
|
"grad_norm": 0.2718455632587909,
|
|
"learning_rate": 2.8579978695329386e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 374735909.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 0.4797515869140625,
|
|
"epoch": 5.1477272727272725,
|
|
"grad_norm": 0.23825012117768773,
|
|
"learning_rate": 2.785647792429233e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 375593425.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 0.48360443115234375,
|
|
"epoch": 5.159090909090909,
|
|
"grad_norm": 0.23818462066480345,
|
|
"learning_rate": 2.714171202482538e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 376407379.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 0.48807525634765625,
|
|
"epoch": 5.170454545454546,
|
|
"grad_norm": 3.2507164294292457,
|
|
"learning_rate": 2.6435709102129727e-07,
|
|
"loss": 0.0077,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 377243881.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 0.49269866943359375,
|
|
"epoch": 5.181818181818182,
|
|
"grad_norm": 1.259251187795761,
|
|
"learning_rate": 2.5738496916838524e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 378061945.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 0.489898681640625,
|
|
"epoch": 5.193181818181818,
|
|
"grad_norm": 0.24830082315404475,
|
|
"learning_rate": 2.505010288392587e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 378873096.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 0.48850250244140625,
|
|
"epoch": 5.204545454545454,
|
|
"grad_norm": 5.651716128783114,
|
|
"learning_rate": 2.4370554071628613e-07,
|
|
"loss": 0.0112,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 379691075.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 0.494598388671875,
|
|
"epoch": 5.215909090909091,
|
|
"grad_norm": 0.2395804306738229,
|
|
"learning_rate": 2.3699877200382026e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 380488242.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 0.4767303466796875,
|
|
"epoch": 5.2272727272727275,
|
|
"grad_norm": 0.2389717565975432,
|
|
"learning_rate": 2.303809864176909e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 381344689.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.47606658935546875,
|
|
"epoch": 5.238636363636363,
|
|
"grad_norm": 0.23243670447716458,
|
|
"learning_rate": 2.2385244417483743e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 382191810.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 0.490386962890625,
|
|
"epoch": 5.25,
|
|
"grad_norm": 2.071131604510824,
|
|
"learning_rate": 2.174134019830726e-07,
|
|
"loss": 0.0048,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 383013514.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 0.4799957275390625,
|
|
"epoch": 5.261363636363637,
|
|
"grad_norm": 0.23311333167810783,
|
|
"learning_rate": 2.1106411303099455e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 383851533.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 0.47631072998046875,
|
|
"epoch": 5.2727272727272725,
|
|
"grad_norm": 0.3585355393010696,
|
|
"learning_rate": 2.0480482697802507e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 384690195.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 0.4852447509765625,
|
|
"epoch": 5.284090909090909,
|
|
"grad_norm": 3.2075162933183266,
|
|
"learning_rate": 1.986357899445976e-07,
|
|
"loss": 0.0075,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 385510439.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 0.4665679931640625,
|
|
"epoch": 5.295454545454546,
|
|
"grad_norm": 0.24275030501776346,
|
|
"learning_rate": 1.9255724450247676e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 386394511.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 0.48211669921875,
|
|
"epoch": 5.306818181818182,
|
|
"grad_norm": 2.8992951132080074,
|
|
"learning_rate": 1.8656942966522124e-07,
|
|
"loss": 0.0189,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 387221719.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 0.49072265625,
|
|
"epoch": 5.318181818181818,
|
|
"grad_norm": 5.332513171073772,
|
|
"learning_rate": 1.8067258087878597e-07,
|
|
"loss": 0.005,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 388021747.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 0.495758056640625,
|
|
"epoch": 5.329545454545454,
|
|
"grad_norm": 2.471809529485113,
|
|
"learning_rate": 1.748669300122627e-07,
|
|
"loss": 0.0087,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 388858436.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 0.47252655029296875,
|
|
"epoch": 5.340909090909091,
|
|
"grad_norm": 0.25195781137704687,
|
|
"learning_rate": 1.691527053487646e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 389713106.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.47857666015625,
|
|
"epoch": 5.3522727272727275,
|
|
"grad_norm": 0.35830977915651785,
|
|
"learning_rate": 1.635301315764484e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 390560663.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 0.4821929931640625,
|
|
"epoch": 5.363636363636363,
|
|
"grad_norm": 1.1731732549746987,
|
|
"learning_rate": 1.579994297796808e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 391378014.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 0.47939300537109375,
|
|
"epoch": 5.375,
|
|
"grad_norm": 0.2964542659586891,
|
|
"learning_rate": 1.5256081743034336e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 392217186.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 0.490966796875,
|
|
"epoch": 5.386363636363637,
|
|
"grad_norm": 0.36855686939198057,
|
|
"learning_rate": 1.472145083792842e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 393043338.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 0.49106597900390625,
|
|
"epoch": 5.3977272727272725,
|
|
"grad_norm": 0.24516913749219083,
|
|
"learning_rate": 1.419607128479053e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 393861984.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 0.4851837158203125,
|
|
"epoch": 5.409090909090909,
|
|
"grad_norm": 0.30692899589107897,
|
|
"learning_rate": 1.3679963741990127e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 394694902.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 0.48297882080078125,
|
|
"epoch": 5.420454545454546,
|
|
"grad_norm": 0.2871018170154113,
|
|
"learning_rate": 1.317314850331314e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 395512370.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 0.481048583984375,
|
|
"epoch": 5.431818181818182,
|
|
"grad_norm": 0.23400972272255227,
|
|
"learning_rate": 1.2675645497164352e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 396351709.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 0.4695892333984375,
|
|
"epoch": 5.443181818181818,
|
|
"grad_norm": 0.23441130155017773,
|
|
"learning_rate": 1.2187474285783623e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 397205354.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 0.4716339111328125,
|
|
"epoch": 5.454545454545454,
|
|
"grad_norm": 0.2363995447234875,
|
|
"learning_rate": 1.1708654064476743e-07,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 398068964.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.47133636474609375,
|
|
"epoch": 5.465909090909091,
|
|
"grad_norm": 0.24166295350052908,
|
|
"learning_rate": 1.1239203660860648e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 398931333.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 0.48919677734375,
|
|
"epoch": 5.4772727272727275,
|
|
"grad_norm": 0.23376095292532034,
|
|
"learning_rate": 1.0779141534123127e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 399743561.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.4713287353515625,
|
|
"epoch": 5.488636363636363,
|
|
"grad_norm": 3.7980897041869994,
|
|
"learning_rate": 1.0328485774296875e-07,
|
|
"loss": 0.0068,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 400597274.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 0.47472381591796875,
|
|
"epoch": 5.5,
|
|
"grad_norm": 0.22904406267035865,
|
|
"learning_rate": 9.887254101548422e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 401443536.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 0.49383544921875,
|
|
"epoch": 5.511363636363637,
|
|
"grad_norm": 1.9032152568586889,
|
|
"learning_rate": 9.455463865481019e-08,
|
|
"loss": 0.0118,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 402253531.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 0.47495269775390625,
|
|
"epoch": 5.5227272727272725,
|
|
"grad_norm": 2.058118970687396,
|
|
"learning_rate": 9.033132044452775e-08,
|
|
"loss": 0.0022,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 403096077.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 0.4779815673828125,
|
|
"epoch": 5.534090909090909,
|
|
"grad_norm": 0.2320773396883086,
|
|
"learning_rate": 8.620275244908826e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 403931464.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 0.481781005859375,
|
|
"epoch": 5.545454545454545,
|
|
"grad_norm": 0.906876727315481,
|
|
"learning_rate": 8.216909700728498e-08,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 404753168.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 0.49048614501953125,
|
|
"epoch": 5.556818181818182,
|
|
"grad_norm": 0.23101403155939168,
|
|
"learning_rate": 7.823051272586812e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 405574379.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 0.47937774658203125,
|
|
"epoch": 5.568181818181818,
|
|
"grad_norm": 0.22825579969208615,
|
|
"learning_rate": 7.438715447331018e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 406389523.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.48981475830078125,
|
|
"epoch": 5.579545454545455,
|
|
"grad_norm": 0.2503992325015645,
|
|
"learning_rate": 7.063917337371495e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 407214371.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 0.48102569580078125,
|
|
"epoch": 5.590909090909091,
|
|
"grad_norm": 0.2513608944740575,
|
|
"learning_rate": 6.698671680087643e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 408046275.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 0.47139739990234375,
|
|
"epoch": 5.6022727272727275,
|
|
"grad_norm": 0.23256492242420046,
|
|
"learning_rate": 6.342992837248235e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 408911193.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 0.48833465576171875,
|
|
"epoch": 5.613636363636363,
|
|
"grad_norm": 0.23168022955887826,
|
|
"learning_rate": 5.996894794446817e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 409719072.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 0.49280548095703125,
|
|
"epoch": 5.625,
|
|
"grad_norm": 0.23122760281603366,
|
|
"learning_rate": 5.660391160551837e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 410525184.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 0.4806060791015625,
|
|
"epoch": 5.636363636363637,
|
|
"grad_norm": 0.22643998190015072,
|
|
"learning_rate": 5.333495167171354e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 411347407.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 0.48348236083984375,
|
|
"epoch": 5.6477272727272725,
|
|
"grad_norm": 0.2258812067160099,
|
|
"learning_rate": 5.016219668132871e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 412178040.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 0.48406982421875,
|
|
"epoch": 5.659090909090909,
|
|
"grad_norm": 0.22725534443329942,
|
|
"learning_rate": 4.708577138977932e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 413017601.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 0.4803466796875,
|
|
"epoch": 5.670454545454545,
|
|
"grad_norm": 0.22801318528268766,
|
|
"learning_rate": 4.410579676471571e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 413853566.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 0.47000885009765625,
|
|
"epoch": 5.681818181818182,
|
|
"grad_norm": 0.2325930970029929,
|
|
"learning_rate": 4.1222389981265546e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 414704010.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.47524261474609375,
|
|
"epoch": 5.693181818181818,
|
|
"grad_norm": 0.22453266382199571,
|
|
"learning_rate": 3.843566441742774e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 415551512.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 0.479248046875,
|
|
"epoch": 5.704545454545455,
|
|
"grad_norm": 0.23083370781694745,
|
|
"learning_rate": 3.574572964961304e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 416393119.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 0.47689056396484375,
|
|
"epoch": 5.715909090909091,
|
|
"grad_norm": 2.1901642978456235,
|
|
"learning_rate": 3.3152691448336825e-08,
|
|
"loss": 0.003,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 417211364.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 0.4746856689453125,
|
|
"epoch": 5.7272727272727275,
|
|
"grad_norm": 0.22865275732966547,
|
|
"learning_rate": 3.065665177405808e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 418047976.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 0.487152099609375,
|
|
"epoch": 5.738636363636363,
|
|
"grad_norm": 0.22775595614971952,
|
|
"learning_rate": 2.825770877317363e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 418866422.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 0.49505615234375,
|
|
"epoch": 5.75,
|
|
"grad_norm": 0.23101305104816183,
|
|
"learning_rate": 2.5955956774154633e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 419667732.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 0.48188018798828125,
|
|
"epoch": 5.761363636363637,
|
|
"grad_norm": 0.22916418909007583,
|
|
"learning_rate": 2.3751486283840884e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 420500583.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 0.4888153076171875,
|
|
"epoch": 5.7727272727272725,
|
|
"grad_norm": 0.2258574174809483,
|
|
"learning_rate": 2.1644383983880356e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 421327571.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 0.47490692138671875,
|
|
"epoch": 5.784090909090909,
|
|
"grad_norm": 0.22690266496506178,
|
|
"learning_rate": 1.9634732727321636e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 422199122.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 0.4826202392578125,
|
|
"epoch": 5.795454545454545,
|
|
"grad_norm": 0.2321274055889175,
|
|
"learning_rate": 1.7722611535355426e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 423032124.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.4724884033203125,
|
|
"epoch": 5.806818181818182,
|
|
"grad_norm": 1.107169238484095,
|
|
"learning_rate": 1.5908095594207585e-08,
|
|
"loss": 0.0086,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 423876920.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 0.48011016845703125,
|
|
"epoch": 5.818181818181818,
|
|
"grad_norm": 0.22872107811700226,
|
|
"learning_rate": 1.4191256252182595e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 424725606.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"entropy": 0.479339599609375,
|
|
"epoch": 5.829545454545455,
|
|
"grad_norm": 0.222813351912687,
|
|
"learning_rate": 1.2572161016858874e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 425566413.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"entropy": 0.48807525634765625,
|
|
"epoch": 5.840909090909091,
|
|
"grad_norm": 0.23276414117276925,
|
|
"learning_rate": 1.1050873552433394e-08,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 426383910.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"entropy": 0.47275543212890625,
|
|
"epoch": 5.8522727272727275,
|
|
"grad_norm": 0.23492927422242632,
|
|
"learning_rate": 9.627453677218402e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 427233159.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 0.4876251220703125,
|
|
"epoch": 5.863636363636363,
|
|
"grad_norm": 0.2293692620353427,
|
|
"learning_rate": 8.301957361289969e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 428045659.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"entropy": 0.4870147705078125,
|
|
"epoch": 5.875,
|
|
"grad_norm": 2.4244775640221263,
|
|
"learning_rate": 7.074436724286704e-09,
|
|
"loss": 0.0064,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 428849891.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"entropy": 0.4868316650390625,
|
|
"epoch": 5.886363636363637,
|
|
"grad_norm": 0.22674009504309858,
|
|
"learning_rate": 5.944940033360269e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 429665132.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"entropy": 0.48014068603515625,
|
|
"epoch": 5.8977272727272725,
|
|
"grad_norm": 0.2269960396547218,
|
|
"learning_rate": 4.913511701278017e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 430492556.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"entropy": 0.4708099365234375,
|
|
"epoch": 5.909090909090909,
|
|
"grad_norm": 0.229788356926301,
|
|
"learning_rate": 3.98019228467661e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 431344266.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.4788055419921875,
|
|
"epoch": 5.920454545454545,
|
|
"grad_norm": 2.692434219230262,
|
|
"learning_rate": 3.1450184824657892e-09,
|
|
"loss": 0.0072,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 432174654.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"entropy": 0.48198699951171875,
|
|
"epoch": 5.931818181818182,
|
|
"grad_norm": 5.577294262893831,
|
|
"learning_rate": 2.408023134387871e-09,
|
|
"loss": 0.0035,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 433016456.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"entropy": 0.47763824462890625,
|
|
"epoch": 5.943181818181818,
|
|
"grad_norm": 0.22567980456899045,
|
|
"learning_rate": 1.7692352197240525e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 433844975.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"entropy": 0.47437286376953125,
|
|
"epoch": 5.954545454545455,
|
|
"grad_norm": 0.22722227788364677,
|
|
"learning_rate": 1.2286798561572666e-09,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 434715113.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"entropy": 0.487579345703125,
|
|
"epoch": 5.965909090909091,
|
|
"grad_norm": 0.22818571422197098,
|
|
"learning_rate": 7.863782987821422e-10,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 435516656.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 0.4957275390625,
|
|
"epoch": 5.9772727272727275,
|
|
"grad_norm": 0.2305305154875918,
|
|
"learning_rate": 4.4234793927094845e-10,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 436285686.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"entropy": 0.49588775634765625,
|
|
"epoch": 5.988636363636363,
|
|
"grad_norm": 2.880139289310222,
|
|
"learning_rate": 1.9660230518886436e-10,
|
|
"loss": 0.0078,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 437058962.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"entropy": 0.4855499267578125,
|
|
"epoch": 6.0,
|
|
"grad_norm": 0.22660971052663118,
|
|
"learning_rate": 4.915105946246002e-11,
|
|
"loss": 0.0013,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 437879517.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"step": 528,
|
|
"total_flos": 515196244262912.0,
|
|
"train_loss": 0.5535088468757088,
|
|
"train_runtime": 94446.5755,
|
|
"train_samples_per_second": 2.607,
|
|
"train_steps_per_second": 0.006
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 528,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 6,
|
|
"save_steps": 44,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 515196244262912.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|