{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5595855712890625, "epoch": 0.011363636363636364, "grad_norm": 381.7633221456865, "learning_rate": 0.0, "loss": 8.3191, "mean_token_accuracy": 0.0, "num_tokens": 852123.0, "step": 1 }, { "entropy": 0.5646438598632812, "epoch": 0.022727272727272728, "grad_norm": 384.6180271880605, "learning_rate": 1.8518518518518518e-07, "loss": 8.2985, "mean_token_accuracy": 0.0, "num_tokens": 1667244.0, "step": 2 }, { "entropy": 0.5536346435546875, "epoch": 0.03409090909090909, "grad_norm": 383.29626165109, "learning_rate": 3.7037037037037036e-07, "loss": 8.3131, "mean_token_accuracy": 0.0, "num_tokens": 2503572.0, "step": 3 }, { "entropy": 0.5472640991210938, "epoch": 0.045454545454545456, "grad_norm": 384.1270985770701, "learning_rate": 5.555555555555555e-07, "loss": 8.2624, "mean_token_accuracy": 0.0, "num_tokens": 3345459.0, "step": 4 }, { "entropy": 0.5576705932617188, "epoch": 0.056818181818181816, "grad_norm": 395.68326610795435, "learning_rate": 7.407407407407407e-07, "loss": 8.0807, "mean_token_accuracy": 0.0, "num_tokens": 4166938.0, "step": 5 }, { "entropy": 0.5459136962890625, "epoch": 0.06818181818181818, "grad_norm": 392.47836187332365, "learning_rate": 9.259259259259259e-07, "loss": 8.0151, "mean_token_accuracy": 0.0, "num_tokens": 5016940.0, "step": 6 }, { "entropy": 0.5500946044921875, "epoch": 0.07954545454545454, "grad_norm": 402.3592451505352, "learning_rate": 1.111111111111111e-06, "loss": 7.4355, "mean_token_accuracy": 0.0, "num_tokens": 5848503.0, "step": 7 }, { "entropy": 0.5400238037109375, "epoch": 0.09090909090909091, "grad_norm": 270.5312649845278, "learning_rate": 1.2962962962962962e-06, "loss": 5.8653, "mean_token_accuracy": 0.003906250116415322, "num_tokens": 6709898.0, "step": 8 }, { "entropy": 0.5549163818359375, "epoch": 0.10227272727272728, "grad_norm": 228.946154453409, "learning_rate": 1.4814814814814815e-06, "loss": 5.5906, "mean_token_accuracy": 0.006510416860692203, "num_tokens": 7560854.0, "step": 9 }, { "entropy": 0.5582351684570312, "epoch": 0.11363636363636363, "grad_norm": 187.58670277138384, "learning_rate": 1.6666666666666667e-06, "loss": 5.2685, "mean_token_accuracy": 0.014322917093522847, "num_tokens": 8391135.0, "step": 10 }, { "entropy": 0.5673904418945312, "epoch": 0.125, "grad_norm": 102.9653781365581, "learning_rate": 1.8518518518518519e-06, "loss": 4.115, "mean_token_accuracy": 0.5117187652504072, "num_tokens": 9185279.0, "step": 11 }, { "entropy": 0.5558929443359375, "epoch": 0.13636363636363635, "grad_norm": 96.60373813990032, "learning_rate": 2.037037037037037e-06, "loss": 4.0292, "mean_token_accuracy": 0.49218751466833055, "num_tokens": 10024891.0, "step": 12 }, { "entropy": 0.5634613037109375, "epoch": 0.14772727272727273, "grad_norm": 82.81078074953965, "learning_rate": 2.222222222222222e-06, "loss": 3.8265, "mean_token_accuracy": 0.5312500158324838, "num_tokens": 10842191.0, "step": 13 }, { "entropy": 0.5619354248046875, "epoch": 0.1590909090909091, "grad_norm": 74.59071680304716, "learning_rate": 2.4074074074074075e-06, "loss": 3.7086, "mean_token_accuracy": 0.5039062650175765, "num_tokens": 11650475.0, "step": 14 }, { "entropy": 0.5547027587890625, "epoch": 0.17045454545454544, "grad_norm": 59.45145903761326, "learning_rate": 2.5925925925925925e-06, "loss": 3.2698, "mean_token_accuracy": 0.5039062650175765, "num_tokens": 12464155.0, "step": 15 }, { "entropy": 0.5290374755859375, "epoch": 0.18181818181818182, "grad_norm": 58.53812027781114, "learning_rate": 2.7777777777777783e-06, "loss": 3.204, "mean_token_accuracy": 0.5299479324603453, "num_tokens": 13346836.0, "step": 16 }, { "entropy": 0.5463485717773438, "epoch": 0.19318181818181818, "grad_norm": 57.542412544507386, "learning_rate": 2.962962962962963e-06, "loss": 3.1529, "mean_token_accuracy": 0.5247395989717916, "num_tokens": 14174968.0, "step": 17 }, { "entropy": 0.5584182739257812, "epoch": 0.20454545454545456, "grad_norm": 57.52665347282901, "learning_rate": 3.1481481481481483e-06, "loss": 3.0902, "mean_token_accuracy": 0.5468750162981451, "num_tokens": 14975189.0, "step": 18 }, { "entropy": 0.5614852905273438, "epoch": 0.2159090909090909, "grad_norm": 57.53281016106306, "learning_rate": 3.3333333333333333e-06, "loss": 3.0511, "mean_token_accuracy": 0.5286458490882069, "num_tokens": 15764524.0, "step": 19 }, { "entropy": 0.5534286499023438, "epoch": 0.22727272727272727, "grad_norm": 58.14048343492545, "learning_rate": 3.5185185185185187e-06, "loss": 2.9643, "mean_token_accuracy": 0.5442708495538682, "num_tokens": 16594212.0, "step": 20 }, { "entropy": 0.5393600463867188, "epoch": 0.23863636363636365, "grad_norm": 57.29516812284279, "learning_rate": 3.7037037037037037e-06, "loss": 2.9211, "mean_token_accuracy": 0.5468750162981451, "num_tokens": 17431524.0, "step": 21 }, { "entropy": 0.554290771484375, "epoch": 0.25, "grad_norm": 61.67178199646207, "learning_rate": 3.88888888888889e-06, "loss": 2.9303, "mean_token_accuracy": 0.5195312654832378, "num_tokens": 18240206.0, "step": 22 }, { "entropy": 0.543975830078125, "epoch": 0.26136363636363635, "grad_norm": 61.19499349619627, "learning_rate": 4.074074074074074e-06, "loss": 2.9146, "mean_token_accuracy": 0.5325520992046222, "num_tokens": 19067759.0, "step": 23 }, { "entropy": 0.54351806640625, "epoch": 0.2727272727272727, "grad_norm": 58.05690393582671, "learning_rate": 4.2592592592592596e-06, "loss": 2.8641, "mean_token_accuracy": 0.5494791830424219, "num_tokens": 19896857.0, "step": 24 }, { "entropy": 0.5491256713867188, "epoch": 0.2840909090909091, "grad_norm": 57.22079479568823, "learning_rate": 4.444444444444444e-06, "loss": 2.8347, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 20712844.0, "step": 25 }, { "entropy": 0.5425262451171875, "epoch": 0.29545454545454547, "grad_norm": 57.786592169293364, "learning_rate": 4.62962962962963e-06, "loss": 2.8192, "mean_token_accuracy": 0.537760432693176, "num_tokens": 21562110.0, "step": 26 }, { "entropy": 0.5401382446289062, "epoch": 0.3068181818181818, "grad_norm": 57.99056805330064, "learning_rate": 4.814814814814815e-06, "loss": 2.7831, "mean_token_accuracy": 0.5468750162981451, "num_tokens": 22406352.0, "step": 27 }, { "entropy": 0.5256195068359375, "epoch": 0.3181818181818182, "grad_norm": 57.22234236181939, "learning_rate": 5e-06, "loss": 2.7438, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 23252892.0, "step": 28 }, { "entropy": 0.5403366088867188, "epoch": 0.32954545454545453, "grad_norm": 57.097280204957976, "learning_rate": 4.999950848940538e-06, "loss": 2.7117, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 24068796.0, "step": 29 }, { "entropy": 0.5393600463867188, "epoch": 0.3409090909090909, "grad_norm": 57.21667456892074, "learning_rate": 4.999803397694811e-06, "loss": 2.6725, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 24888182.0, "step": 30 }, { "entropy": 0.5381240844726562, "epoch": 0.3522727272727273, "grad_norm": 57.65592126762044, "learning_rate": 4.999557652060729e-06, "loss": 2.65, "mean_token_accuracy": 0.5611979333916679, "num_tokens": 25701818.0, "step": 31 }, { "entropy": 0.5250473022460938, "epoch": 0.36363636363636365, "grad_norm": 57.94845921985987, "learning_rate": 4.9992136217012184e-06, "loss": 2.6265, "mean_token_accuracy": 0.5559895999031141, "num_tokens": 26550803.0, "step": 32 }, { "entropy": 0.534454345703125, "epoch": 0.375, "grad_norm": 60.09221471903111, "learning_rate": 4.998771320143843e-06, "loss": 2.6194, "mean_token_accuracy": 0.5455729329260066, "num_tokens": 27362749.0, "step": 33 }, { "entropy": 0.5318450927734375, "epoch": 0.38636363636363635, "grad_norm": 58.867253178774526, "learning_rate": 4.998230764780277e-06, "loss": 2.5514, "mean_token_accuracy": 0.5781250172294676, "num_tokens": 28197226.0, "step": 34 }, { "entropy": 0.52191162109375, "epoch": 0.3977272727272727, "grad_norm": 59.28541087193654, "learning_rate": 4.9975919768656125e-06, "loss": 2.5631, "mean_token_accuracy": 0.5664062668802217, "num_tokens": 29070075.0, "step": 35 }, { "entropy": 0.5337066650390625, "epoch": 0.4090909090909091, "grad_norm": 59.001328074934214, "learning_rate": 4.996854981517535e-06, "loss": 2.5256, "mean_token_accuracy": 0.5716146003687754, "num_tokens": 29909320.0, "step": 36 }, { "entropy": 0.5183563232421875, "epoch": 0.42045454545454547, "grad_norm": 59.065376167680974, "learning_rate": 4.996019807715324e-06, "loss": 2.4876, "mean_token_accuracy": 0.5677083502523601, "num_tokens": 30778680.0, "step": 37 }, { "entropy": 0.5281143188476562, "epoch": 0.4318181818181818, "grad_norm": 59.32377062126641, "learning_rate": 4.995086488298723e-06, "loss": 2.4747, "mean_token_accuracy": 0.5598958500195295, "num_tokens": 31596240.0, "step": 38 }, { "entropy": 0.5444107055664062, "epoch": 0.4431818181818182, "grad_norm": 59.42893671370653, "learning_rate": 4.994055059966641e-06, "loss": 2.4461, "mean_token_accuracy": 0.5690104336244985, "num_tokens": 32396599.0, "step": 39 }, { "entropy": 0.5415267944335938, "epoch": 0.45454545454545453, "grad_norm": 59.56204485899904, "learning_rate": 4.992925563275714e-06, "loss": 2.4156, "mean_token_accuracy": 0.5755208504851907, "num_tokens": 33192180.0, "step": 40 }, { "entropy": 0.532440185546875, "epoch": 0.4659090909090909, "grad_norm": 59.54887107492075, "learning_rate": 4.991698042638711e-06, "loss": 2.3971, "mean_token_accuracy": 0.5729166837409139, "num_tokens": 34019780.0, "step": 41 }, { "entropy": 0.5341949462890625, "epoch": 0.4772727272727273, "grad_norm": 59.74650962478605, "learning_rate": 4.990372546322782e-06, "loss": 2.3637, "mean_token_accuracy": 0.5755208504851907, "num_tokens": 34845898.0, "step": 42 }, { "entropy": 0.5344314575195312, "epoch": 0.48863636363636365, "grad_norm": 59.949205398945104, "learning_rate": 4.988949126447567e-06, "loss": 2.3412, "mean_token_accuracy": 0.5833333507180214, "num_tokens": 35658003.0, "step": 43 }, { "entropy": 0.519012451171875, "epoch": 0.5, "grad_norm": 61.0931007069813, "learning_rate": 4.987427838983141e-06, "loss": 2.3435, "mean_token_accuracy": 0.5807291839737445, "num_tokens": 36513090.0, "step": 44 }, { "entropy": 0.5371322631835938, "epoch": 0.5113636363636364, "grad_norm": 60.86784941074918, "learning_rate": 4.985808743747817e-06, "loss": 2.3204, "mean_token_accuracy": 0.6158854321110994, "num_tokens": 37335753.0, "step": 45 }, { "entropy": 0.537994384765625, "epoch": 0.5227272727272727, "grad_norm": 60.25561544990646, "learning_rate": 4.984091904405793e-06, "loss": 2.2697, "mean_token_accuracy": 0.7734375107102096, "num_tokens": 38171714.0, "step": 46 }, { "entropy": 0.5361480712890625, "epoch": 0.5340909090909091, "grad_norm": 60.867084288518306, "learning_rate": 4.9822773884646444e-06, "loss": 2.2367, "mean_token_accuracy": 0.8880208396585658, "num_tokens": 39004676.0, "step": 47 }, { "entropy": 0.54638671875, "epoch": 0.5454545454545454, "grad_norm": 60.66915590360354, "learning_rate": 4.980365267272679e-06, "loss": 2.2215, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 39839454.0, "step": 48 }, { "entropy": 0.55718994140625, "epoch": 0.5568181818181818, "grad_norm": 60.73328529237807, "learning_rate": 4.97835561601612e-06, "loss": 2.1965, "mean_token_accuracy": 0.9075520888436586, "num_tokens": 40636201.0, "step": 49 }, { "entropy": 0.549163818359375, "epoch": 0.5681818181818182, "grad_norm": 60.75274840918011, "learning_rate": 4.97624851371616e-06, "loss": 2.1713, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 41456439.0, "step": 50 }, { "entropy": 0.5414886474609375, "epoch": 0.5795454545454546, "grad_norm": 60.49880992250543, "learning_rate": 4.974044043225846e-06, "loss": 2.1378, "mean_token_accuracy": 0.923177087912336, "num_tokens": 42306168.0, "step": 51 }, { "entropy": 0.5356521606445312, "epoch": 0.5909090909090909, "grad_norm": 60.520844933288195, "learning_rate": 4.9717422912268265e-06, "loss": 2.1084, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 43156058.0, "step": 52 }, { "entropy": 0.5290069580078125, "epoch": 0.6022727272727273, "grad_norm": 60.45515973060584, "learning_rate": 4.969343348225942e-06, "loss": 2.0952, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 44026197.0, "step": 53 }, { "entropy": 0.5451889038085938, "epoch": 0.6136363636363636, "grad_norm": 60.15895348971788, "learning_rate": 4.966847308551664e-06, "loss": 2.0768, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 44830346.0, "step": 54 }, { "entropy": 0.543731689453125, "epoch": 0.625, "grad_norm": 60.141288719361974, "learning_rate": 4.9642542703503874e-06, "loss": 2.0532, "mean_token_accuracy": 0.912760421866551, "num_tokens": 45639894.0, "step": 55 }, { "entropy": 0.5470046997070312, "epoch": 0.6363636363636364, "grad_norm": 59.994234499334425, "learning_rate": 4.961564335582572e-06, "loss": 2.0265, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 46453026.0, "step": 56 }, { "entropy": 0.5457687377929688, "epoch": 0.6477272727272727, "grad_norm": 59.48482455264734, "learning_rate": 4.958777610018734e-06, "loss": 1.9859, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 47264316.0, "step": 57 }, { "entropy": 0.5581283569335938, "epoch": 0.6590909090909091, "grad_norm": 60.20361031008577, "learning_rate": 4.955894203235285e-06, "loss": 1.9645, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 48069506.0, "step": 58 }, { "entropy": 0.5583419799804688, "epoch": 0.6704545454545454, "grad_norm": 60.67814698892302, "learning_rate": 4.952914228610221e-06, "loss": 1.9421, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 48869835.0, "step": 59 }, { "entropy": 0.5423202514648438, "epoch": 0.6818181818181818, "grad_norm": 59.547065391114195, "learning_rate": 4.949837803318672e-06, "loss": 1.9176, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 49727599.0, "step": 60 }, { "entropy": 0.5429611206054688, "epoch": 0.6931818181818182, "grad_norm": 59.83074013238096, "learning_rate": 4.946665048328288e-06, "loss": 1.8815, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 50548928.0, "step": 61 }, { "entropy": 0.5543136596679688, "epoch": 0.7045454545454546, "grad_norm": 60.55220373965871, "learning_rate": 4.943396088394482e-06, "loss": 1.8644, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 51360270.0, "step": 62 }, { "entropy": 0.5471343994140625, "epoch": 0.7159090909090909, "grad_norm": 61.796111299701316, "learning_rate": 4.940031052055532e-06, "loss": 1.8707, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 52194089.0, "step": 63 }, { "entropy": 0.5440444946289062, "epoch": 0.7272727272727273, "grad_norm": 60.40349534727321, "learning_rate": 4.936570071627517e-06, "loss": 1.8205, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 53031403.0, "step": 64 }, { "entropy": 0.5430908203125, "epoch": 0.7386363636363636, "grad_norm": 58.510081093257874, "learning_rate": 4.933013283199124e-06, "loss": 1.7844, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 53844933.0, "step": 65 }, { "entropy": 0.5445480346679688, "epoch": 0.75, "grad_norm": 59.65972342732521, "learning_rate": 4.929360826626286e-06, "loss": 1.776, "mean_token_accuracy": 0.8997395893093199, "num_tokens": 54693823.0, "step": 66 }, { "entropy": 0.53070068359375, "epoch": 0.7613636363636364, "grad_norm": 58.265622775881965, "learning_rate": 4.925612845526691e-06, "loss": 1.7339, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 55549536.0, "step": 67 }, { "entropy": 0.542205810546875, "epoch": 0.7727272727272727, "grad_norm": 58.30084190912645, "learning_rate": 4.921769487274132e-06, "loss": 1.702, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 56378172.0, "step": 68 }, { "entropy": 0.5559234619140625, "epoch": 0.7840909090909091, "grad_norm": 58.15114759775454, "learning_rate": 4.917830902992716e-06, "loss": 1.6686, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 57189636.0, "step": 69 }, { "entropy": 0.5531768798828125, "epoch": 0.7954545454545454, "grad_norm": 57.91550671898149, "learning_rate": 4.913797247550912e-06, "loss": 1.6516, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 58000089.0, "step": 70 }, { "entropy": 0.5615463256835938, "epoch": 0.8068181818181818, "grad_norm": 58.08935001982205, "learning_rate": 4.9096686795554725e-06, "loss": 1.605, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 58802829.0, "step": 71 }, { "entropy": 0.5490951538085938, "epoch": 0.8181818181818182, "grad_norm": 58.154246622201676, "learning_rate": 4.90544536134519e-06, "loss": 1.597, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 59625095.0, "step": 72 }, { "entropy": 0.5558624267578125, "epoch": 0.8295454545454546, "grad_norm": 58.58076155801676, "learning_rate": 4.901127458984516e-06, "loss": 1.5516, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 60419806.0, "step": 73 }, { "entropy": 0.5548248291015625, "epoch": 0.8409090909090909, "grad_norm": 58.213836732046914, "learning_rate": 4.8967151422570314e-06, "loss": 1.5206, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 61242019.0, "step": 74 }, { "entropy": 0.54022216796875, "epoch": 0.8522727272727273, "grad_norm": 58.4253733216111, "learning_rate": 4.89220858465877e-06, "loss": 1.4992, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 62099804.0, "step": 75 }, { "entropy": 0.5445098876953125, "epoch": 0.8636363636363636, "grad_norm": 58.47702346737799, "learning_rate": 4.887607963391394e-06, "loss": 1.4669, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 62934736.0, "step": 76 }, { "entropy": 0.5456314086914062, "epoch": 0.875, "grad_norm": 58.489811661673144, "learning_rate": 4.882913459355233e-06, "loss": 1.4349, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 63755694.0, "step": 77 }, { "entropy": 0.5397567749023438, "epoch": 0.8863636363636364, "grad_norm": 58.81980710508508, "learning_rate": 4.878125257142165e-06, "loss": 1.4201, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 64618678.0, "step": 78 }, { "entropy": 0.5386886596679688, "epoch": 0.8977272727272727, "grad_norm": 58.85913765518202, "learning_rate": 4.873243545028356e-06, "loss": 1.3857, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 65481725.0, "step": 79 }, { "entropy": 0.5303268432617188, "epoch": 0.9090909090909091, "grad_norm": 58.90455120115232, "learning_rate": 4.868268514966869e-06, "loss": 1.3689, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 66326767.0, "step": 80 }, { "entropy": 0.5485992431640625, "epoch": 0.9204545454545454, "grad_norm": 59.37737137216742, "learning_rate": 4.8632003625800995e-06, "loss": 1.3313, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 67132451.0, "step": 81 }, { "entropy": 0.544342041015625, "epoch": 0.9318181818181818, "grad_norm": 58.42746386237537, "learning_rate": 4.858039287152095e-06, "loss": 1.2899, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 67951468.0, "step": 82 }, { "entropy": 0.5399322509765625, "epoch": 0.9431818181818182, "grad_norm": 59.5285283289149, "learning_rate": 4.852785491620716e-06, "loss": 1.277, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 68794669.0, "step": 83 }, { "entropy": 0.5452194213867188, "epoch": 0.9545454545454546, "grad_norm": 58.86810235822698, "learning_rate": 4.847439182569656e-06, "loss": 1.2559, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 69610011.0, "step": 84 }, { "entropy": 0.5454788208007812, "epoch": 0.9659090909090909, "grad_norm": 58.244417132360944, "learning_rate": 4.84200057022032e-06, "loss": 1.2393, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 70406656.0, "step": 85 }, { "entropy": 0.5455780029296875, "epoch": 0.9772727272727273, "grad_norm": 58.110729897503354, "learning_rate": 4.836469868423552e-06, "loss": 1.1798, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 71214893.0, "step": 86 }, { "entropy": 0.549102783203125, "epoch": 0.9886363636363636, "grad_norm": 57.96764990952516, "learning_rate": 4.830847294651236e-06, "loss": 1.1639, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 72038768.0, "step": 87 }, { "entropy": 0.5456390380859375, "epoch": 1.0, "grad_norm": 58.09143335722103, "learning_rate": 4.825133069987737e-06, "loss": 1.1471, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 72847782.0, "step": 88 }, { "entropy": 0.538909912109375, "epoch": 1.0113636363636365, "grad_norm": 57.38198228541311, "learning_rate": 4.819327419121215e-06, "loss": 1.1177, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 73701972.0, "step": 89 }, { "entropy": 0.5499267578125, "epoch": 1.0227272727272727, "grad_norm": 57.003608180265964, "learning_rate": 4.81343057033478e-06, "loss": 1.0763, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 74539661.0, "step": 90 }, { "entropy": 0.5374908447265625, "epoch": 1.0340909090909092, "grad_norm": 57.80822364026077, "learning_rate": 4.8074427554975235e-06, "loss": 1.0644, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 75393325.0, "step": 91 }, { "entropy": 0.5450515747070312, "epoch": 1.0454545454545454, "grad_norm": 56.68961214202094, "learning_rate": 4.8013642100554034e-06, "loss": 1.0258, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 76230771.0, "step": 92 }, { "entropy": 0.5306625366210938, "epoch": 1.0568181818181819, "grad_norm": 56.74441180261517, "learning_rate": 4.795195173021976e-06, "loss": 1.0344, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 77097214.0, "step": 93 }, { "entropy": 0.51495361328125, "epoch": 1.0681818181818181, "grad_norm": 56.46006981310896, "learning_rate": 4.7889358869690065e-06, "loss": 0.9933, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 78024772.0, "step": 94 }, { "entropy": 0.5518417358398438, "epoch": 1.0795454545454546, "grad_norm": 57.22442413009565, "learning_rate": 4.782586598016928e-06, "loss": 0.9509, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 78828734.0, "step": 95 }, { "entropy": 0.53485107421875, "epoch": 1.0909090909090908, "grad_norm": 55.811186941838926, "learning_rate": 4.776147555825164e-06, "loss": 0.9158, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 79685825.0, "step": 96 }, { "entropy": 0.542236328125, "epoch": 1.1022727272727273, "grad_norm": 56.18995031350287, "learning_rate": 4.769619013582309e-06, "loss": 0.9235, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 80538841.0, "step": 97 }, { "entropy": 0.5271377563476562, "epoch": 1.1136363636363635, "grad_norm": 55.596172230629556, "learning_rate": 4.7630012279961805e-06, "loss": 0.871, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 81406964.0, "step": 98 }, { "entropy": 0.5487060546875, "epoch": 1.125, "grad_norm": 55.22959755493109, "learning_rate": 4.7562944592837145e-06, "loss": 0.8578, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 82228992.0, "step": 99 }, { "entropy": 0.55682373046875, "epoch": 1.1363636363636362, "grad_norm": 54.97844804028525, "learning_rate": 4.749498971160742e-06, "loss": 0.8207, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 83019198.0, "step": 100 }, { "entropy": 0.5458526611328125, "epoch": 1.1477272727272727, "grad_norm": 54.971653184869254, "learning_rate": 4.742615030831615e-06, "loss": 0.8323, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 83850145.0, "step": 101 }, { "entropy": 0.554840087890625, "epoch": 1.1590909090909092, "grad_norm": 54.786840015792265, "learning_rate": 4.735642908978704e-06, "loss": 0.804, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 84659245.0, "step": 102 }, { "entropy": 0.5523223876953125, "epoch": 1.1704545454545454, "grad_norm": 53.614665578810396, "learning_rate": 4.728582879751746e-06, "loss": 0.7812, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 85506138.0, "step": 103 }, { "entropy": 0.544036865234375, "epoch": 1.1818181818181819, "grad_norm": 54.85928977071388, "learning_rate": 4.721435220757078e-06, "loss": 0.7617, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 86335548.0, "step": 104 }, { "entropy": 0.5494613647460938, "epoch": 1.1931818181818181, "grad_norm": 54.072025328393615, "learning_rate": 4.714200213046707e-06, "loss": 0.7409, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 87158155.0, "step": 105 }, { "entropy": 0.5440139770507812, "epoch": 1.2045454545454546, "grad_norm": 52.66696077779264, "learning_rate": 4.706878141107269e-06, "loss": 0.7092, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 87999932.0, "step": 106 }, { "entropy": 0.5469131469726562, "epoch": 1.2159090909090908, "grad_norm": 53.71747331748587, "learning_rate": 4.699469292848839e-06, "loss": 0.7042, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 88819905.0, "step": 107 }, { "entropy": 0.5530014038085938, "epoch": 1.2272727272727273, "grad_norm": 52.00345160403972, "learning_rate": 4.691973959593609e-06, "loss": 0.6665, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 89634909.0, "step": 108 }, { "entropy": 0.568328857421875, "epoch": 1.2386363636363638, "grad_norm": 53.47004513594076, "learning_rate": 4.6843924360644385e-06, "loss": 0.6714, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 90402321.0, "step": 109 }, { "entropy": 0.5533065795898438, "epoch": 1.25, "grad_norm": 48.99806983812518, "learning_rate": 4.676725020373255e-06, "loss": 0.6269, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 91203638.0, "step": 110 }, { "entropy": 0.543212890625, "epoch": 1.2613636363636362, "grad_norm": 49.403098540554055, "learning_rate": 4.6689720140093445e-06, "loss": 0.6146, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 92020252.0, "step": 111 }, { "entropy": 0.5560073852539062, "epoch": 1.2727272727272727, "grad_norm": 46.54252333124646, "learning_rate": 4.661133721827487e-06, "loss": 0.5562, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 92816807.0, "step": 112 }, { "entropy": 0.5302276611328125, "epoch": 1.2840909090909092, "grad_norm": 45.66376732399781, "learning_rate": 4.653210452035974e-06, "loss": 0.5397, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 93649991.0, "step": 113 }, { "entropy": 0.5438156127929688, "epoch": 1.2954545454545454, "grad_norm": 45.87602686823475, "learning_rate": 4.645202516184492e-06, "loss": 0.5526, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 94459730.0, "step": 114 }, { "entropy": 0.5356369018554688, "epoch": 1.3068181818181819, "grad_norm": 44.56282799528236, "learning_rate": 4.6371102291518635e-06, "loss": 0.5097, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 95302339.0, "step": 115 }, { "entropy": 0.5306625366210938, "epoch": 1.3181818181818181, "grad_norm": 42.809418293734176, "learning_rate": 4.628933909133674e-06, "loss": 0.4839, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 96149112.0, "step": 116 }, { "entropy": 0.5399169921875, "epoch": 1.3295454545454546, "grad_norm": 43.262845013372875, "learning_rate": 4.620673877629757e-06, "loss": 0.4791, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 96974920.0, "step": 117 }, { "entropy": 0.5676727294921875, "epoch": 1.3409090909090908, "grad_norm": 41.54182990822529, "learning_rate": 4.612330459431552e-06, "loss": 0.453, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 97741872.0, "step": 118 }, { "entropy": 0.5345535278320312, "epoch": 1.3522727272727273, "grad_norm": 42.1177627835191, "learning_rate": 4.603903982609334e-06, "loss": 0.4717, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 98575899.0, "step": 119 }, { "entropy": 0.5326919555664062, "epoch": 1.3636363636363638, "grad_norm": 43.3669026761141, "learning_rate": 4.595394778499314e-06, "loss": 0.5153, "mean_token_accuracy": 0.901041672565043, "num_tokens": 99431810.0, "step": 120 }, { "entropy": 0.5554275512695312, "epoch": 1.375, "grad_norm": 36.50419461925312, "learning_rate": 4.586803181690609e-06, "loss": 0.435, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 100229559.0, "step": 121 }, { "entropy": 0.5372390747070312, "epoch": 1.3863636363636362, "grad_norm": 36.592474361723475, "learning_rate": 4.5781295300120885e-06, "loss": 0.4384, "mean_token_accuracy": 0.912760421866551, "num_tokens": 101066027.0, "step": 122 }, { "entropy": 0.5535049438476562, "epoch": 1.3977272727272727, "grad_norm": 36.245607196315184, "learning_rate": 4.569374164519088e-06, "loss": 0.4139, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 101861078.0, "step": 123 }, { "entropy": 0.5516357421875, "epoch": 1.4090909090909092, "grad_norm": 32.59705233863274, "learning_rate": 4.560537429479998e-06, "loss": 0.3721, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 102654586.0, "step": 124 }, { "entropy": 0.5359344482421875, "epoch": 1.4204545454545454, "grad_norm": 31.92040372605121, "learning_rate": 4.5516196723627325e-06, "loss": 0.3577, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 103497056.0, "step": 125 }, { "entropy": 0.5435409545898438, "epoch": 1.4318181818181819, "grad_norm": 30.46099601858736, "learning_rate": 4.542621243821058e-06, "loss": 0.3295, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 104315903.0, "step": 126 }, { "entropy": 0.554229736328125, "epoch": 1.4431818181818181, "grad_norm": 32.04665107939428, "learning_rate": 4.533542497680811e-06, "loss": 0.3545, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 105097772.0, "step": 127 }, { "entropy": 0.5345306396484375, "epoch": 1.4545454545454546, "grad_norm": 32.593755043031194, "learning_rate": 4.524383790925987e-06, "loss": 0.3498, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 105938088.0, "step": 128 }, { "entropy": 0.560516357421875, "epoch": 1.4659090909090908, "grad_norm": 27.86040213744899, "learning_rate": 4.515145483684696e-06, "loss": 0.2999, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 106725926.0, "step": 129 }, { "entropy": 0.5250091552734375, "epoch": 1.4772727272727273, "grad_norm": 31.53561853505193, "learning_rate": 4.505827939215009e-06, "loss": 0.338, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 107586088.0, "step": 130 }, { "entropy": 0.545989990234375, "epoch": 1.4886363636363638, "grad_norm": 25.96701371896309, "learning_rate": 4.496431523890673e-06, "loss": 0.2851, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 108406686.0, "step": 131 }, { "entropy": 0.5378570556640625, "epoch": 1.5, "grad_norm": 26.273437673336062, "learning_rate": 4.486956607186702e-06, "loss": 0.291, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 109266458.0, "step": 132 }, { "entropy": 0.5393905639648438, "epoch": 1.5113636363636362, "grad_norm": 24.208179077445585, "learning_rate": 4.477403561664852e-06, "loss": 0.2684, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 110097799.0, "step": 133 }, { "entropy": 0.5587005615234375, "epoch": 1.5227272727272727, "grad_norm": 28.135497638609476, "learning_rate": 4.467772762958968e-06, "loss": 0.2883, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 110885530.0, "step": 134 }, { "entropy": 0.5532913208007812, "epoch": 1.5340909090909092, "grad_norm": 20.865989221665703, "learning_rate": 4.458064589760221e-06, "loss": 0.2387, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 111709692.0, "step": 135 }, { "entropy": 0.5501937866210938, "epoch": 1.5454545454545454, "grad_norm": 24.535283552721097, "learning_rate": 4.448279423802207e-06, "loss": 0.2446, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 112521769.0, "step": 136 }, { "entropy": 0.5506134033203125, "epoch": 1.5568181818181817, "grad_norm": 19.120665717121184, "learning_rate": 4.438417649845946e-06, "loss": 0.208, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 113374792.0, "step": 137 }, { "entropy": 0.5475006103515625, "epoch": 1.5681818181818183, "grad_norm": 21.153400031877272, "learning_rate": 4.428479655664748e-06, "loss": 0.217, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 114209235.0, "step": 138 }, { "entropy": 0.54583740234375, "epoch": 1.5795454545454546, "grad_norm": 18.33572562599262, "learning_rate": 4.4184658320289675e-06, "loss": 0.2144, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 115024770.0, "step": 139 }, { "entropy": 0.5428314208984375, "epoch": 1.5909090909090908, "grad_norm": 17.275245293829077, "learning_rate": 4.408376572690638e-06, "loss": 0.1946, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 115851204.0, "step": 140 }, { "entropy": 0.5377197265625, "epoch": 1.6022727272727273, "grad_norm": 18.38711220300695, "learning_rate": 4.3982122743679875e-06, "loss": 0.2152, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 116699283.0, "step": 141 }, { "entropy": 0.5587234497070312, "epoch": 1.6136363636363638, "grad_norm": 15.030526444653805, "learning_rate": 4.387973336729841e-06, "loss": 0.1849, "mean_token_accuracy": 0.955729169305414, "num_tokens": 117503423.0, "step": 142 }, { "entropy": 0.5555343627929688, "epoch": 1.625, "grad_norm": 49.09119160394531, "learning_rate": 4.377660162379904e-06, "loss": 0.1757, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 118291651.0, "step": 143 }, { "entropy": 0.5510406494140625, "epoch": 1.6363636363636362, "grad_norm": 17.06703017323922, "learning_rate": 4.3672731568409344e-06, "loss": 0.1835, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 119098075.0, "step": 144 }, { "entropy": 0.5520248413085938, "epoch": 1.6477272727272727, "grad_norm": 14.519167602295752, "learning_rate": 4.3568127285387925e-06, "loss": 0.1815, "mean_token_accuracy": 0.945312503259629, "num_tokens": 119888254.0, "step": 145 }, { "entropy": 0.5263671875, "epoch": 1.6590909090909092, "grad_norm": 15.842685933217664, "learning_rate": 4.346279288786387e-06, "loss": 0.1841, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 120730204.0, "step": 146 }, { "entropy": 0.546875, "epoch": 1.6704545454545454, "grad_norm": 14.102138793551976, "learning_rate": 4.3356732517674935e-06, "loss": 0.1665, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 121547571.0, "step": 147 }, { "entropy": 0.5335464477539062, "epoch": 1.6818181818181817, "grad_norm": 13.42344794507078, "learning_rate": 4.32499503452048e-06, "loss": 0.1651, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 122398891.0, "step": 148 }, { "entropy": 0.5406112670898438, "epoch": 1.6931818181818183, "grad_norm": 16.777966557638916, "learning_rate": 4.314245056921899e-06, "loss": 0.2101, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 123217975.0, "step": 149 }, { "entropy": 0.5275344848632812, "epoch": 1.7045454545454546, "grad_norm": 12.73456653618887, "learning_rate": 4.303423741669978e-06, "loss": 0.1711, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 124067259.0, "step": 150 }, { "entropy": 0.5336456298828125, "epoch": 1.7159090909090908, "grad_norm": 14.693936825241831, "learning_rate": 4.292531514268008e-06, "loss": 0.1729, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 124928545.0, "step": 151 }, { "entropy": 0.5305709838867188, "epoch": 1.7272727272727273, "grad_norm": 10.728134350130471, "learning_rate": 4.281568803007601e-06, "loss": 0.1743, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 125802700.0, "step": 152 }, { "entropy": 0.5507659912109375, "epoch": 1.7386363636363638, "grad_norm": 11.357422242866724, "learning_rate": 4.270536038951855e-06, "loss": 0.1455, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 126615551.0, "step": 153 }, { "entropy": 0.528594970703125, "epoch": 1.75, "grad_norm": 13.659522995364625, "learning_rate": 4.259433655918404e-06, "loss": 0.1593, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 127457356.0, "step": 154 }, { "entropy": 0.52630615234375, "epoch": 1.7613636363636362, "grad_norm": 11.98866784293711, "learning_rate": 4.24826209046236e-06, "loss": 0.146, "mean_token_accuracy": 0.955729169305414, "num_tokens": 128340971.0, "step": 155 }, { "entropy": 0.5377883911132812, "epoch": 1.7727272727272727, "grad_norm": 10.302404549159972, "learning_rate": 4.237021781859143e-06, "loss": 0.1488, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 129194640.0, "step": 156 }, { "entropy": 0.54241943359375, "epoch": 1.7840909090909092, "grad_norm": 12.40519661964789, "learning_rate": 4.225713172087216e-06, "loss": 0.148, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 130035771.0, "step": 157 }, { "entropy": 0.5303497314453125, "epoch": 1.7954545454545454, "grad_norm": 8.481343011266665, "learning_rate": 4.2143367058107e-06, "loss": 0.1295, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 130925665.0, "step": 158 }, { "entropy": 0.5450363159179688, "epoch": 1.8068181818181817, "grad_norm": 7.512602992919226, "learning_rate": 4.202892830361892e-06, "loss": 0.1347, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 131762678.0, "step": 159 }, { "entropy": 0.53131103515625, "epoch": 1.8181818181818183, "grad_norm": 8.20500103400398, "learning_rate": 4.191381995723672e-06, "loss": 0.143, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 132615569.0, "step": 160 }, { "entropy": 0.5467453002929688, "epoch": 1.8295454545454546, "grad_norm": 12.101821695010367, "learning_rate": 4.179804654511816e-06, "loss": 0.155, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 133443981.0, "step": 161 }, { "entropy": 0.539459228515625, "epoch": 1.8409090909090908, "grad_norm": 7.20954852117008, "learning_rate": 4.168161261957192e-06, "loss": 0.1236, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 134283417.0, "step": 162 }, { "entropy": 0.5276107788085938, "epoch": 1.8522727272727273, "grad_norm": 12.45647994209618, "learning_rate": 4.1564522758878656e-06, "loss": 0.1541, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 135129505.0, "step": 163 }, { "entropy": 0.525909423828125, "epoch": 1.8636363636363638, "grad_norm": 11.397437854508587, "learning_rate": 4.144678156711091e-06, "loss": 0.1648, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 135982474.0, "step": 164 }, { "entropy": 0.5355148315429688, "epoch": 1.875, "grad_norm": 9.052055602010471, "learning_rate": 4.132839367395215e-06, "loss": 0.1254, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 136810572.0, "step": 165 }, { "entropy": 0.544097900390625, "epoch": 1.8863636363636362, "grad_norm": 12.358520941926209, "learning_rate": 4.120936373451467e-06, "loss": 0.1435, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 137623231.0, "step": 166 }, { "entropy": 0.5360260009765625, "epoch": 1.8977272727272727, "grad_norm": 5.240176699801722, "learning_rate": 4.108969642915658e-06, "loss": 0.125, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 138474048.0, "step": 167 }, { "entropy": 0.533233642578125, "epoch": 1.9090909090909092, "grad_norm": 10.779052698306451, "learning_rate": 4.096939646329775e-06, "loss": 0.135, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 139318039.0, "step": 168 }, { "entropy": 0.5390167236328125, "epoch": 1.9204545454545454, "grad_norm": 6.710289531825517, "learning_rate": 4.08484685672348e-06, "loss": 0.1308, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 140136159.0, "step": 169 }, { "entropy": 0.5381927490234375, "epoch": 1.9318181818181817, "grad_norm": 13.851350819778412, "learning_rate": 4.07269174959551e-06, "loss": 0.1462, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 140989018.0, "step": 170 }, { "entropy": 0.5294189453125, "epoch": 1.9431818181818183, "grad_norm": 5.416353381363566, "learning_rate": 4.06047480289498e-06, "loss": 0.1009, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 141834446.0, "step": 171 }, { "entropy": 0.5440444946289062, "epoch": 1.9545454545454546, "grad_norm": 12.01909827068296, "learning_rate": 4.0481964970025885e-06, "loss": 0.1342, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 142637643.0, "step": 172 }, { "entropy": 0.5352020263671875, "epoch": 1.9659090909090908, "grad_norm": 7.410850925326847, "learning_rate": 4.035857314711729e-06, "loss": 0.1064, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 143446972.0, "step": 173 }, { "entropy": 0.5480270385742188, "epoch": 1.9772727272727273, "grad_norm": 10.80598929383265, "learning_rate": 4.023457741209509e-06, "loss": 0.1294, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 144219843.0, "step": 174 }, { "entropy": 0.5273895263671875, "epoch": 1.9886363636363638, "grad_norm": 11.406603208620567, "learning_rate": 4.0109982640576676e-06, "loss": 0.1345, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 145065280.0, "step": 175 }, { "entropy": 0.533416748046875, "epoch": 2.0, "grad_norm": 5.403077497706771, "learning_rate": 3.998479373173406e-06, "loss": 0.1099, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 145884441.0, "step": 176 }, { "entropy": 0.5384902954101562, "epoch": 2.0113636363636362, "grad_norm": 9.064993783573478, "learning_rate": 3.985901560810126e-06, "loss": 0.1228, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 146692231.0, "step": 177 }, { "entropy": 0.527313232421875, "epoch": 2.022727272727273, "grad_norm": 5.7544500718889, "learning_rate": 3.973265321538069e-06, "loss": 0.106, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 147528006.0, "step": 178 }, { "entropy": 0.5183944702148438, "epoch": 2.034090909090909, "grad_norm": 4.613351890776185, "learning_rate": 3.960571152224872e-06, "loss": 0.0908, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 148417260.0, "step": 179 }, { "entropy": 0.541961669921875, "epoch": 2.0454545454545454, "grad_norm": 3.8462617773274728, "learning_rate": 3.9478195520160355e-06, "loss": 0.0756, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 149242794.0, "step": 180 }, { "entropy": 0.524169921875, "epoch": 2.0568181818181817, "grad_norm": 5.4596119310313185, "learning_rate": 3.935011022315284e-06, "loss": 0.082, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 150098081.0, "step": 181 }, { "entropy": 0.5390472412109375, "epoch": 2.0681818181818183, "grad_norm": 10.39377561593915, "learning_rate": 3.922146066764863e-06, "loss": 0.1071, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 150917507.0, "step": 182 }, { "entropy": 0.525787353515625, "epoch": 2.0795454545454546, "grad_norm": 13.4198283536437, "learning_rate": 3.9092251912257286e-06, "loss": 0.1439, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 151774062.0, "step": 183 }, { "entropy": 0.5273284912109375, "epoch": 2.090909090909091, "grad_norm": 8.530055658216385, "learning_rate": 3.896248903757658e-06, "loss": 0.0821, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 152611097.0, "step": 184 }, { "entropy": 0.531890869140625, "epoch": 2.102272727272727, "grad_norm": 14.787176248320279, "learning_rate": 3.883217714599273e-06, "loss": 0.1179, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 153466669.0, "step": 185 }, { "entropy": 0.5436248779296875, "epoch": 2.1136363636363638, "grad_norm": 9.750570935700242, "learning_rate": 3.870132136147977e-06, "loss": 0.0984, "mean_token_accuracy": 0.955729169305414, "num_tokens": 154298660.0, "step": 186 }, { "entropy": 0.5355148315429688, "epoch": 2.125, "grad_norm": 17.447751788939698, "learning_rate": 3.856992682939803e-06, "loss": 0.1534, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 155130292.0, "step": 187 }, { "entropy": 0.538909912109375, "epoch": 2.1363636363636362, "grad_norm": 14.892648839395319, "learning_rate": 3.84379987162919e-06, "loss": 0.1339, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 155975863.0, "step": 188 }, { "entropy": 0.54864501953125, "epoch": 2.147727272727273, "grad_norm": 3.5306413830760657, "learning_rate": 3.830554220968661e-06, "loss": 0.0968, "mean_token_accuracy": 0.967447918606922, "num_tokens": 156800322.0, "step": 189 }, { "entropy": 0.5349349975585938, "epoch": 2.159090909090909, "grad_norm": 13.850121100268511, "learning_rate": 3.817256251788425e-06, "loss": 0.1411, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 157668790.0, "step": 190 }, { "entropy": 0.543304443359375, "epoch": 2.1704545454545454, "grad_norm": 12.355866968351991, "learning_rate": 3.803906486975901e-06, "loss": 0.1229, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 158496829.0, "step": 191 }, { "entropy": 0.5365829467773438, "epoch": 2.1818181818181817, "grad_norm": 3.337519584172672, "learning_rate": 3.790505451455158e-06, "loss": 0.0812, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 159347676.0, "step": 192 }, { "entropy": 0.5381317138671875, "epoch": 2.1931818181818183, "grad_norm": 12.868212538236262, "learning_rate": 3.77705367216627e-06, "loss": 0.1274, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 160186343.0, "step": 193 }, { "entropy": 0.5421295166015625, "epoch": 2.2045454545454546, "grad_norm": 11.400363668337347, "learning_rate": 3.7635516780446e-06, "loss": 0.136, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 160994811.0, "step": 194 }, { "entropy": 0.5428924560546875, "epoch": 2.215909090909091, "grad_norm": 4.068567700010866, "learning_rate": 3.7500000000000005e-06, "loss": 0.0905, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 161804760.0, "step": 195 }, { "entropy": 0.525787353515625, "epoch": 2.227272727272727, "grad_norm": 7.662109925607153, "learning_rate": 3.7363991708959386e-06, "loss": 0.1078, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 162669635.0, "step": 196 }, { "entropy": 0.5388870239257812, "epoch": 2.2386363636363638, "grad_norm": 6.488639817122286, "learning_rate": 3.7227497255285416e-06, "loss": 0.0965, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 163509429.0, "step": 197 }, { "entropy": 0.5268783569335938, "epoch": 2.25, "grad_norm": 4.879390086259226, "learning_rate": 3.709052200605572e-06, "loss": 0.097, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 164350633.0, "step": 198 }, { "entropy": 0.5328140258789062, "epoch": 2.2613636363636362, "grad_norm": 8.016614290480945, "learning_rate": 3.6953071347253167e-06, "loss": 0.105, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 165187521.0, "step": 199 }, { "entropy": 0.5488815307617188, "epoch": 2.2727272727272725, "grad_norm": 5.848981338498339, "learning_rate": 3.6815150683554187e-06, "loss": 0.0809, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 166006751.0, "step": 200 }, { "entropy": 0.5378341674804688, "epoch": 2.284090909090909, "grad_norm": 5.298493599070519, "learning_rate": 3.6676765438116157e-06, "loss": 0.1057, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 166843238.0, "step": 201 }, { "entropy": 0.5397262573242188, "epoch": 2.2954545454545454, "grad_norm": 7.197323291540629, "learning_rate": 3.6537921052364223e-06, "loss": 0.1094, "mean_token_accuracy": 0.955729169305414, "num_tokens": 167691986.0, "step": 202 }, { "entropy": 0.5311660766601562, "epoch": 2.3068181818181817, "grad_norm": 4.266710229403864, "learning_rate": 3.6398622985777314e-06, "loss": 0.0743, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 168532989.0, "step": 203 }, { "entropy": 0.5288314819335938, "epoch": 2.3181818181818183, "grad_norm": 8.448421834056651, "learning_rate": 3.6258876715673475e-06, "loss": 0.0813, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 169363455.0, "step": 204 }, { "entropy": 0.5425033569335938, "epoch": 2.3295454545454546, "grad_norm": 5.07863510651651, "learning_rate": 3.611868773699449e-06, "loss": 0.0984, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 170177432.0, "step": 205 }, { "entropy": 0.5307388305664062, "epoch": 2.340909090909091, "grad_norm": 3.6790168522255255, "learning_rate": 3.597806156208982e-06, "loss": 0.0686, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 171014423.0, "step": 206 }, { "entropy": 0.5391464233398438, "epoch": 2.3522727272727275, "grad_norm": 3.7261987700436667, "learning_rate": 3.5837003720499853e-06, "loss": 0.0625, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 171832784.0, "step": 207 }, { "entropy": 0.52593994140625, "epoch": 2.3636363636363638, "grad_norm": 10.359990795629523, "learning_rate": 3.569551975873847e-06, "loss": 0.1121, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 172678098.0, "step": 208 }, { "entropy": 0.5261611938476562, "epoch": 2.375, "grad_norm": 7.621881324218433, "learning_rate": 3.555361524007498e-06, "loss": 0.073, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 173521043.0, "step": 209 }, { "entropy": 0.5422592163085938, "epoch": 2.3863636363636362, "grad_norm": 8.923292571279323, "learning_rate": 3.541129574431532e-06, "loss": 0.0778, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 174330129.0, "step": 210 }, { "entropy": 0.52294921875, "epoch": 2.3977272727272725, "grad_norm": 7.832050365863067, "learning_rate": 3.526856686758269e-06, "loss": 0.0854, "mean_token_accuracy": 0.967447918606922, "num_tokens": 175179051.0, "step": 211 }, { "entropy": 0.5177383422851562, "epoch": 2.409090909090909, "grad_norm": 7.830008454039604, "learning_rate": 3.51254342220975e-06, "loss": 0.0794, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 176037909.0, "step": 212 }, { "entropy": 0.5296478271484375, "epoch": 2.4204545454545454, "grad_norm": 6.8243379606452095, "learning_rate": 3.4981903435956675e-06, "loss": 0.0672, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 176860131.0, "step": 213 }, { "entropy": 0.5161209106445312, "epoch": 2.4318181818181817, "grad_norm": 5.986660832754099, "learning_rate": 3.4837980152912393e-06, "loss": 0.0728, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 177705136.0, "step": 214 }, { "entropy": 0.5319290161132812, "epoch": 2.4431818181818183, "grad_norm": 7.570755754525308, "learning_rate": 3.4693670032150117e-06, "loss": 0.0786, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 178510671.0, "step": 215 }, { "entropy": 0.5307540893554688, "epoch": 2.4545454545454546, "grad_norm": 5.407915840203935, "learning_rate": 3.4548978748066115e-06, "loss": 0.0594, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 179329651.0, "step": 216 }, { "entropy": 0.52447509765625, "epoch": 2.465909090909091, "grad_norm": 6.447443460350779, "learning_rate": 3.440391199004431e-06, "loss": 0.061, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 180158843.0, "step": 217 }, { "entropy": 0.5199203491210938, "epoch": 2.4772727272727275, "grad_norm": 5.6196857220042835, "learning_rate": 3.4258475462232586e-06, "loss": 0.0709, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 181003645.0, "step": 218 }, { "entropy": 0.5209808349609375, "epoch": 2.4886363636363638, "grad_norm": 4.149800169775993, "learning_rate": 3.4112674883318477e-06, "loss": 0.0559, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 181838208.0, "step": 219 }, { "entropy": 0.519622802734375, "epoch": 2.5, "grad_norm": 7.847869214902608, "learning_rate": 3.3966515986304322e-06, "loss": 0.0647, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 182681925.0, "step": 220 }, { "entropy": 0.5227890014648438, "epoch": 2.5113636363636362, "grad_norm": 3.885151215528788, "learning_rate": 3.3820004518281835e-06, "loss": 0.0482, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 183519031.0, "step": 221 }, { "entropy": 0.5285797119140625, "epoch": 2.5227272727272725, "grad_norm": 9.287577782435898, "learning_rate": 3.367314624020613e-06, "loss": 0.084, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 184358917.0, "step": 222 }, { "entropy": 0.513397216796875, "epoch": 2.534090909090909, "grad_norm": 10.74097807636218, "learning_rate": 3.352594692666915e-06, "loss": 0.0786, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 185224412.0, "step": 223 }, { "entropy": 0.5338287353515625, "epoch": 2.5454545454545454, "grad_norm": 3.4282665798632515, "learning_rate": 3.337841236567268e-06, "loss": 0.043, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 186048899.0, "step": 224 }, { "entropy": 0.5269088745117188, "epoch": 2.5568181818181817, "grad_norm": 12.447034376257264, "learning_rate": 3.32305483584007e-06, "loss": 0.111, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 186880043.0, "step": 225 }, { "entropy": 0.5433578491210938, "epoch": 2.5681818181818183, "grad_norm": 10.265085259781317, "learning_rate": 3.30823607189913e-06, "loss": 0.0983, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 187654506.0, "step": 226 }, { "entropy": 0.5324630737304688, "epoch": 2.5795454545454546, "grad_norm": 6.24491633738438, "learning_rate": 3.2933855274308067e-06, "loss": 0.0744, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 188464920.0, "step": 227 }, { "entropy": 0.5325698852539062, "epoch": 2.590909090909091, "grad_norm": 9.798838479871074, "learning_rate": 3.278503786371095e-06, "loss": 0.0844, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 189293301.0, "step": 228 }, { "entropy": 0.5347824096679688, "epoch": 2.6022727272727275, "grad_norm": 12.7274044235036, "learning_rate": 3.2635914338826665e-06, "loss": 0.1058, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 190121007.0, "step": 229 }, { "entropy": 0.5603790283203125, "epoch": 2.6136363636363638, "grad_norm": 7.03460715005398, "learning_rate": 3.2486490563318605e-06, "loss": 0.0768, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 190881003.0, "step": 230 }, { "entropy": 0.5350341796875, "epoch": 2.625, "grad_norm": 3.155224324649305, "learning_rate": 3.233677241265627e-06, "loss": 0.0588, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 191699288.0, "step": 231 }, { "entropy": 0.5343246459960938, "epoch": 2.6363636363636362, "grad_norm": 6.092967080136461, "learning_rate": 3.218676577388424e-06, "loss": 0.0673, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 192526668.0, "step": 232 }, { "entropy": 0.5334014892578125, "epoch": 2.6477272727272725, "grad_norm": 5.321016163063104, "learning_rate": 3.2036476545390695e-06, "loss": 0.0702, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 193349142.0, "step": 233 }, { "entropy": 0.5334320068359375, "epoch": 2.659090909090909, "grad_norm": 3.466858920718226, "learning_rate": 3.188591063667548e-06, "loss": 0.0469, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 194176787.0, "step": 234 }, { "entropy": 0.53265380859375, "epoch": 2.6704545454545454, "grad_norm": 8.83147751793894, "learning_rate": 3.1735073968117743e-06, "loss": 0.0596, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 195004474.0, "step": 235 }, { "entropy": 0.540924072265625, "epoch": 2.6818181818181817, "grad_norm": 12.22958519345781, "learning_rate": 3.1583972470743123e-06, "loss": 0.088, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 195807358.0, "step": 236 }, { "entropy": 0.5318756103515625, "epoch": 2.6931818181818183, "grad_norm": 5.996711651679238, "learning_rate": 3.1432612085990576e-06, "loss": 0.0677, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 196617818.0, "step": 237 }, { "entropy": 0.5335693359375, "epoch": 2.7045454545454546, "grad_norm": 5.0941973108424365, "learning_rate": 3.1280998765478725e-06, "loss": 0.0645, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 197430311.0, "step": 238 }, { "entropy": 0.528717041015625, "epoch": 2.715909090909091, "grad_norm": 5.622809967264127, "learning_rate": 3.1129138470771823e-06, "loss": 0.0579, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 198277321.0, "step": 239 }, { "entropy": 0.5289077758789062, "epoch": 2.7272727272727275, "grad_norm": 5.096620942507479, "learning_rate": 3.0977037173145387e-06, "loss": 0.0442, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 199108815.0, "step": 240 }, { "entropy": 0.5392990112304688, "epoch": 2.7386363636363638, "grad_norm": 4.634448923382875, "learning_rate": 3.082470085335133e-06, "loss": 0.0561, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 199899561.0, "step": 241 }, { "entropy": 0.5410385131835938, "epoch": 2.75, "grad_norm": 4.486339153887012, "learning_rate": 3.0672135501382894e-06, "loss": 0.0724, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 200725431.0, "step": 242 }, { "entropy": 0.5286865234375, "epoch": 2.7613636363636362, "grad_norm": 2.9315078295067893, "learning_rate": 3.0519347116239e-06, "loss": 0.0423, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 201560765.0, "step": 243 }, { "entropy": 0.538360595703125, "epoch": 2.7727272727272725, "grad_norm": 3.2320025767052982, "learning_rate": 3.036634170568847e-06, "loss": 0.0401, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 202381503.0, "step": 244 }, { "entropy": 0.5329360961914062, "epoch": 2.784090909090909, "grad_norm": 5.331727343345033, "learning_rate": 3.021312528603371e-06, "loss": 0.0533, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 203203187.0, "step": 245 }, { "entropy": 0.5489501953125, "epoch": 2.7954545454545454, "grad_norm": 3.3498628859906723, "learning_rate": 3.0059703881874232e-06, "loss": 0.0357, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 203986073.0, "step": 246 }, { "entropy": 0.5378494262695312, "epoch": 2.8068181818181817, "grad_norm": 3.8988231949793417, "learning_rate": 2.990608352586965e-06, "loss": 0.0498, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 204805437.0, "step": 247 }, { "entropy": 0.5403289794921875, "epoch": 2.8181818181818183, "grad_norm": 5.259116353604022, "learning_rate": 2.9752270258502593e-06, "loss": 0.056, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 205608028.0, "step": 248 }, { "entropy": 0.5286102294921875, "epoch": 2.8295454545454546, "grad_norm": 6.195966706572306, "learning_rate": 2.959827012784108e-06, "loss": 0.048, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 206426839.0, "step": 249 }, { "entropy": 0.5177841186523438, "epoch": 2.840909090909091, "grad_norm": 7.2897806046347196, "learning_rate": 2.9444089189300783e-06, "loss": 0.0588, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 207279265.0, "step": 250 }, { "entropy": 0.5423431396484375, "epoch": 2.8522727272727275, "grad_norm": 6.301702261877335, "learning_rate": 2.92897335054069e-06, "loss": 0.059, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 208074284.0, "step": 251 }, { "entropy": 0.5258865356445312, "epoch": 2.8636363636363638, "grad_norm": 6.437422190596041, "learning_rate": 2.913520914555572e-06, "loss": 0.057, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 208920165.0, "step": 252 }, { "entropy": 0.5269393920898438, "epoch": 2.875, "grad_norm": 4.473468381789689, "learning_rate": 2.8980522185776065e-06, "loss": 0.0429, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 209763076.0, "step": 253 }, { "entropy": 0.523956298828125, "epoch": 2.8863636363636362, "grad_norm": 3.685173302105583, "learning_rate": 2.882567870849029e-06, "loss": 0.0398, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 210578485.0, "step": 254 }, { "entropy": 0.5370407104492188, "epoch": 2.8977272727272725, "grad_norm": 4.774066475110165, "learning_rate": 2.8670684802275173e-06, "loss": 0.0368, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 211388341.0, "step": 255 }, { "entropy": 0.5209121704101562, "epoch": 2.909090909090909, "grad_norm": 4.220527200517054, "learning_rate": 2.8515546561622464e-06, "loss": 0.0325, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 212229038.0, "step": 256 }, { "entropy": 0.5267486572265625, "epoch": 2.9204545454545454, "grad_norm": 4.5755301854337045, "learning_rate": 2.8360270086699274e-06, "loss": 0.0372, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 213072715.0, "step": 257 }, { "entropy": 0.5250320434570312, "epoch": 2.9318181818181817, "grad_norm": 4.428753877599414, "learning_rate": 2.820486148310822e-06, "loss": 0.0376, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 213912498.0, "step": 258 }, { "entropy": 0.534210205078125, "epoch": 2.9431818181818183, "grad_norm": 4.791248923408223, "learning_rate": 2.8049326861647303e-06, "loss": 0.0454, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 214726839.0, "step": 259 }, { "entropy": 0.5367584228515625, "epoch": 2.9545454545454546, "grad_norm": 4.490486917260571, "learning_rate": 2.7893672338069666e-06, "loss": 0.0418, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 215527901.0, "step": 260 }, { "entropy": 0.5217819213867188, "epoch": 2.965909090909091, "grad_norm": 4.610870917217071, "learning_rate": 2.7737904032843105e-06, "loss": 0.0462, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 216370628.0, "step": 261 }, { "entropy": 0.528656005859375, "epoch": 2.9772727272727275, "grad_norm": 4.762926208072535, "learning_rate": 2.7582028070909415e-06, "loss": 0.0343, "mean_token_accuracy": 0.989583333954215, "num_tokens": 217191647.0, "step": 262 }, { "entropy": 0.5208206176757812, "epoch": 2.9886363636363638, "grad_norm": 2.6855558389598406, "learning_rate": 2.742605058144352e-06, "loss": 0.0187, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 218045769.0, "step": 263 }, { "entropy": 0.5156478881835938, "epoch": 3.0, "grad_norm": 3.8864194579259568, "learning_rate": 2.7269977697612515e-06, "loss": 0.0274, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 218903155.0, "step": 264 }, { "entropy": 0.5219039916992188, "epoch": 3.0113636363636362, "grad_norm": 2.7681144892880765, "learning_rate": 2.7113815556334478e-06, "loss": 0.0143, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 219733127.0, "step": 265 }, { "entropy": 0.5251312255859375, "epoch": 3.022727272727273, "grad_norm": 4.368355859336438, "learning_rate": 2.6957570298037156e-06, "loss": 0.0188, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 220560193.0, "step": 266 }, { "entropy": 0.5105361938476562, "epoch": 3.034090909090909, "grad_norm": 9.088977324112165, "learning_rate": 2.680124806641654e-06, "loss": 0.036, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 221419000.0, "step": 267 }, { "entropy": 0.5338973999023438, "epoch": 3.0454545454545454, "grad_norm": 6.018166776800449, "learning_rate": 2.664485500819527e-06, "loss": 0.0333, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 222209715.0, "step": 268 }, { "entropy": 0.5257644653320312, "epoch": 3.0568181818181817, "grad_norm": 5.468563141530249, "learning_rate": 2.6488397272880943e-06, "loss": 0.0287, "mean_token_accuracy": 0.989583333954215, "num_tokens": 223024902.0, "step": 269 }, { "entropy": 0.5193328857421875, "epoch": 3.0681818181818183, "grad_norm": 7.295106536435693, "learning_rate": 2.633188101252433e-06, "loss": 0.0337, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 223854643.0, "step": 270 }, { "entropy": 0.5175323486328125, "epoch": 3.0795454545454546, "grad_norm": 6.883191149944938, "learning_rate": 2.617531238147744e-06, "loss": 0.0501, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 224719358.0, "step": 271 }, { "entropy": 0.5205154418945312, "epoch": 3.090909090909091, "grad_norm": 5.88154920217092, "learning_rate": 2.6018697536151554e-06, "loss": 0.0381, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 225549159.0, "step": 272 }, { "entropy": 0.5368423461914062, "epoch": 3.102272727272727, "grad_norm": 9.2073262872856, "learning_rate": 2.5862042634775125e-06, "loss": 0.0618, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 226368587.0, "step": 273 }, { "entropy": 0.53558349609375, "epoch": 3.1136363636363638, "grad_norm": 8.618806420796583, "learning_rate": 2.5705353837151655e-06, "loss": 0.0316, "mean_token_accuracy": 0.989583333954215, "num_tokens": 227156622.0, "step": 274 }, { "entropy": 0.53497314453125, "epoch": 3.125, "grad_norm": 5.171416458961046, "learning_rate": 2.554863730441748e-06, "loss": 0.0376, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 227980137.0, "step": 275 }, { "entropy": 0.5346145629882812, "epoch": 3.1363636363636362, "grad_norm": 3.2259069311735753, "learning_rate": 2.5391899198799475e-06, "loss": 0.0217, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 228789801.0, "step": 276 }, { "entropy": 0.5227127075195312, "epoch": 3.147727272727273, "grad_norm": 5.680603759953942, "learning_rate": 2.5235145683372813e-06, "loss": 0.0445, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 229631483.0, "step": 277 }, { "entropy": 0.5258560180664062, "epoch": 3.159090909090909, "grad_norm": 3.7672880245068754, "learning_rate": 2.507838292181858e-06, "loss": 0.0274, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 230473829.0, "step": 278 }, { "entropy": 0.525543212890625, "epoch": 3.1704545454545454, "grad_norm": 4.446400889769841, "learning_rate": 2.4921617078181425e-06, "loss": 0.0295, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 231297503.0, "step": 279 }, { "entropy": 0.5142059326171875, "epoch": 3.1818181818181817, "grad_norm": 8.655584591637458, "learning_rate": 2.47648543166272e-06, "loss": 0.053, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 232167203.0, "step": 280 }, { "entropy": 0.53271484375, "epoch": 3.1931818181818183, "grad_norm": 8.687831510758203, "learning_rate": 2.4608100801200533e-06, "loss": 0.0455, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 232982992.0, "step": 281 }, { "entropy": 0.5253372192382812, "epoch": 3.2045454545454546, "grad_norm": 4.055331476139395, "learning_rate": 2.445136269558254e-06, "loss": 0.0185, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 233794951.0, "step": 282 }, { "entropy": 0.5300216674804688, "epoch": 3.215909090909091, "grad_norm": 5.006857935808983, "learning_rate": 2.4294646162848353e-06, "loss": 0.0418, "mean_token_accuracy": 0.989583333954215, "num_tokens": 234627063.0, "step": 283 }, { "entropy": 0.530303955078125, "epoch": 3.227272727272727, "grad_norm": 4.2488937867763585, "learning_rate": 2.413795736522489e-06, "loss": 0.0286, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 235422472.0, "step": 284 }, { "entropy": 0.5381851196289062, "epoch": 3.2386363636363638, "grad_norm": 3.8677868707482665, "learning_rate": 2.3981302463848454e-06, "loss": 0.0239, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 236250854.0, "step": 285 }, { "entropy": 0.5477142333984375, "epoch": 3.25, "grad_norm": 2.4908003048655005, "learning_rate": 2.3824687618522567e-06, "loss": 0.019, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 237025724.0, "step": 286 }, { "entropy": 0.5311508178710938, "epoch": 3.2613636363636362, "grad_norm": 2.8300800369105423, "learning_rate": 2.366811898747568e-06, "loss": 0.0324, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 237847006.0, "step": 287 }, { "entropy": 0.5348968505859375, "epoch": 3.2727272727272725, "grad_norm": 3.74649110100955, "learning_rate": 2.351160272711907e-06, "loss": 0.0265, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 238659400.0, "step": 288 }, { "entropy": 0.5213699340820312, "epoch": 3.284090909090909, "grad_norm": 2.2370472232591556, "learning_rate": 2.3355144991804736e-06, "loss": 0.0146, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 239520566.0, "step": 289 }, { "entropy": 0.5295486450195312, "epoch": 3.2954545454545454, "grad_norm": 4.645357526431567, "learning_rate": 2.3198751933583463e-06, "loss": 0.0251, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 240338169.0, "step": 290 }, { "entropy": 0.5078277587890625, "epoch": 3.3068181818181817, "grad_norm": 3.977909281667933, "learning_rate": 2.304242970196285e-06, "loss": 0.0157, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 241205913.0, "step": 291 }, { "entropy": 0.518096923828125, "epoch": 3.3181818181818183, "grad_norm": 3.705571268224213, "learning_rate": 2.2886184443665522e-06, "loss": 0.017, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 242031860.0, "step": 292 }, { "entropy": 0.51983642578125, "epoch": 3.3295454545454546, "grad_norm": 3.153417353690183, "learning_rate": 2.2730022302387493e-06, "loss": 0.0209, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 242863430.0, "step": 293 }, { "entropy": 0.5169601440429688, "epoch": 3.340909090909091, "grad_norm": 5.244316414765351, "learning_rate": 2.257394941855648e-06, "loss": 0.0194, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 243706209.0, "step": 294 }, { "entropy": 0.5330963134765625, "epoch": 3.3522727272727275, "grad_norm": 5.974620564211638, "learning_rate": 2.2417971929090593e-06, "loss": 0.0176, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 244495154.0, "step": 295 }, { "entropy": 0.5119476318359375, "epoch": 3.3636363636363638, "grad_norm": 6.28569760192753, "learning_rate": 2.2262095967156895e-06, "loss": 0.0232, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 245361556.0, "step": 296 }, { "entropy": 0.5241546630859375, "epoch": 3.375, "grad_norm": 2.976749613690859, "learning_rate": 2.2106327661930343e-06, "loss": 0.0129, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 246179032.0, "step": 297 }, { "entropy": 0.5186386108398438, "epoch": 3.3863636363636362, "grad_norm": 5.205476897606577, "learning_rate": 2.19506731383527e-06, "loss": 0.0174, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 247044591.0, "step": 298 }, { "entropy": 0.5159530639648438, "epoch": 3.3977272727272725, "grad_norm": 4.885033948373779, "learning_rate": 2.1795138516891786e-06, "loss": 0.0263, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 247849642.0, "step": 299 }, { "entropy": 0.5101547241210938, "epoch": 3.409090909090909, "grad_norm": 4.276142732793436, "learning_rate": 2.163972991330073e-06, "loss": 0.0117, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 248687645.0, "step": 300 }, { "entropy": 0.512420654296875, "epoch": 3.4204545454545454, "grad_norm": 5.6616807718007065, "learning_rate": 2.148445343837755e-06, "loss": 0.0187, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 249523174.0, "step": 301 }, { "entropy": 0.5093612670898438, "epoch": 3.4318181818181817, "grad_norm": 7.495530375469061, "learning_rate": 2.1329315197724835e-06, "loss": 0.0137, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 250381087.0, "step": 302 }, { "entropy": 0.5103759765625, "epoch": 3.4431818181818183, "grad_norm": 1.4405232803470114, "learning_rate": 2.1174321291509716e-06, "loss": 0.0053, "mean_token_accuracy": 1.0, "num_tokens": 251231901.0, "step": 303 }, { "entropy": 0.5128860473632812, "epoch": 3.4545454545454546, "grad_norm": 9.845854165742995, "learning_rate": 2.1019477814223943e-06, "loss": 0.0327, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 252066427.0, "step": 304 }, { "entropy": 0.5172500610351562, "epoch": 3.465909090909091, "grad_norm": 6.8859140835256545, "learning_rate": 2.086479085444429e-06, "loss": 0.0267, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 252898682.0, "step": 305 }, { "entropy": 0.4980316162109375, "epoch": 3.4772727272727275, "grad_norm": 4.267657568764952, "learning_rate": 2.071026649459311e-06, "loss": 0.0092, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 253766783.0, "step": 306 }, { "entropy": 0.5067977905273438, "epoch": 3.4886363636363638, "grad_norm": 7.713129362260799, "learning_rate": 2.055591081069922e-06, "loss": 0.0193, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 254619003.0, "step": 307 }, { "entropy": 0.5159912109375, "epoch": 3.5, "grad_norm": 5.82481587389578, "learning_rate": 2.040172987215893e-06, "loss": 0.0164, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 255449310.0, "step": 308 }, { "entropy": 0.5168304443359375, "epoch": 3.5113636363636362, "grad_norm": 9.77767804352304, "learning_rate": 2.024772974149741e-06, "loss": 0.0319, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 256269971.0, "step": 309 }, { "entropy": 0.508331298828125, "epoch": 3.5227272727272725, "grad_norm": 3.110919919007407, "learning_rate": 2.0093916474130354e-06, "loss": 0.0083, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 257110990.0, "step": 310 }, { "entropy": 0.49806976318359375, "epoch": 3.534090909090909, "grad_norm": 4.028515631346399, "learning_rate": 1.9940296118125776e-06, "loss": 0.0178, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 257969152.0, "step": 311 }, { "entropy": 0.5093307495117188, "epoch": 3.5454545454545454, "grad_norm": 4.838995412482845, "learning_rate": 1.9786874713966293e-06, "loss": 0.0413, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 258787610.0, "step": 312 }, { "entropy": 0.5196762084960938, "epoch": 3.5568181818181817, "grad_norm": 6.530173083527098, "learning_rate": 1.9633658294311535e-06, "loss": 0.0311, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 259591761.0, "step": 313 }, { "entropy": 0.5156402587890625, "epoch": 3.5681818181818183, "grad_norm": 6.929601010585583, "learning_rate": 1.9480652883761007e-06, "loss": 0.0292, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 260397894.0, "step": 314 }, { "entropy": 0.5229644775390625, "epoch": 3.5795454545454546, "grad_norm": 7.831867296735421, "learning_rate": 1.9327864498617114e-06, "loss": 0.0298, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 261200683.0, "step": 315 }, { "entropy": 0.5133056640625, "epoch": 3.590909090909091, "grad_norm": 3.5816335430908777, "learning_rate": 1.9175299146648672e-06, "loss": 0.0201, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 262040136.0, "step": 316 }, { "entropy": 0.5169219970703125, "epoch": 3.6022727272727275, "grad_norm": 3.2198926229429823, "learning_rate": 1.9022962826854619e-06, "loss": 0.0212, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 262885485.0, "step": 317 }, { "entropy": 0.5167617797851562, "epoch": 3.6136363636363638, "grad_norm": 2.5731365142380587, "learning_rate": 1.887086152922818e-06, "loss": 0.024, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 263732921.0, "step": 318 }, { "entropy": 0.5399856567382812, "epoch": 3.625, "grad_norm": 5.5206910993901355, "learning_rate": 1.8719001234521283e-06, "loss": 0.0325, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 264525028.0, "step": 319 }, { "entropy": 0.5193099975585938, "epoch": 3.6363636363636362, "grad_norm": 2.9209940154455545, "learning_rate": 1.8567387914009432e-06, "loss": 0.0138, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 265358646.0, "step": 320 }, { "entropy": 0.508575439453125, "epoch": 3.6477272727272725, "grad_norm": 2.5817093209649444, "learning_rate": 1.8416027529256885e-06, "loss": 0.0191, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 266215215.0, "step": 321 }, { "entropy": 0.5082931518554688, "epoch": 3.659090909090909, "grad_norm": 2.503376524092804, "learning_rate": 1.8264926031882274e-06, "loss": 0.0151, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 267080581.0, "step": 322 }, { "entropy": 0.5336380004882812, "epoch": 3.6704545454545454, "grad_norm": 2.601198540739444, "learning_rate": 1.8114089363324525e-06, "loss": 0.0181, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 267854230.0, "step": 323 }, { "entropy": 0.5327835083007812, "epoch": 3.6818181818181817, "grad_norm": 3.063054591464437, "learning_rate": 1.7963523454609317e-06, "loss": 0.0094, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 268647574.0, "step": 324 }, { "entropy": 0.5183258056640625, "epoch": 3.6931818181818183, "grad_norm": 2.758926546908894, "learning_rate": 1.7813234226115767e-06, "loss": 0.0077, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 269479857.0, "step": 325 }, { "entropy": 0.5243988037109375, "epoch": 3.7045454545454546, "grad_norm": 4.055775400118274, "learning_rate": 1.766322758734374e-06, "loss": 0.0242, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 270278460.0, "step": 326 }, { "entropy": 0.5056533813476562, "epoch": 3.715909090909091, "grad_norm": 3.6447943171210877, "learning_rate": 1.75135094366814e-06, "loss": 0.0181, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 271125235.0, "step": 327 }, { "entropy": 0.518310546875, "epoch": 3.7272727272727275, "grad_norm": 6.528608409582633, "learning_rate": 1.7364085661173346e-06, "loss": 0.0219, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 271947014.0, "step": 328 }, { "entropy": 0.5148544311523438, "epoch": 3.7386363636363638, "grad_norm": 4.846437727927079, "learning_rate": 1.721496213628906e-06, "loss": 0.0179, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 272778931.0, "step": 329 }, { "entropy": 0.5037765502929688, "epoch": 3.75, "grad_norm": 5.614029654133794, "learning_rate": 1.7066144725691933e-06, "loss": 0.0209, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 273609373.0, "step": 330 }, { "entropy": 0.5075302124023438, "epoch": 3.7613636363636362, "grad_norm": 6.557376782306085, "learning_rate": 1.6917639281008703e-06, "loss": 0.0171, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 274452240.0, "step": 331 }, { "entropy": 0.512725830078125, "epoch": 3.7727272727272725, "grad_norm": 2.752525740999078, "learning_rate": 1.6769451641599305e-06, "loss": 0.0112, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 275279923.0, "step": 332 }, { "entropy": 0.5047988891601562, "epoch": 3.784090909090909, "grad_norm": 3.2366011635785368, "learning_rate": 1.6621587634327328e-06, "loss": 0.01, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 276099706.0, "step": 333 }, { "entropy": 0.5012435913085938, "epoch": 3.7954545454545454, "grad_norm": 6.289968133807858, "learning_rate": 1.647405307333085e-06, "loss": 0.028, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 276948323.0, "step": 334 }, { "entropy": 0.5199050903320312, "epoch": 3.8068181818181817, "grad_norm": 5.07228106259114, "learning_rate": 1.6326853759793878e-06, "loss": 0.0227, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 277752317.0, "step": 335 }, { "entropy": 0.5204315185546875, "epoch": 3.8181818181818183, "grad_norm": 1.8589073814632076, "learning_rate": 1.6179995481718165e-06, "loss": 0.017, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 278553380.0, "step": 336 }, { "entropy": 0.5152053833007812, "epoch": 3.8295454545454546, "grad_norm": 5.580405395858779, "learning_rate": 1.6033484013695688e-06, "loss": 0.0216, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 279380180.0, "step": 337 }, { "entropy": 0.51837158203125, "epoch": 3.840909090909091, "grad_norm": 1.8203808497415166, "learning_rate": 1.588732511668153e-06, "loss": 0.0107, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 280190463.0, "step": 338 }, { "entropy": 0.5064926147460938, "epoch": 3.8522727272727275, "grad_norm": 0.6183424685913169, "learning_rate": 1.5741524537767427e-06, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 281038954.0, "step": 339 }, { "entropy": 0.5031051635742188, "epoch": 3.8636363636363638, "grad_norm": 5.5233826679734195, "learning_rate": 1.5596088009955695e-06, "loss": 0.0152, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 281892856.0, "step": 340 }, { "entropy": 0.5198516845703125, "epoch": 3.875, "grad_norm": 5.607369212208562, "learning_rate": 1.5451021251933895e-06, "loss": 0.025, "mean_token_accuracy": 0.989583333954215, "num_tokens": 282714631.0, "step": 341 }, { "entropy": 0.4959564208984375, "epoch": 3.8863636363636362, "grad_norm": 4.099811537737254, "learning_rate": 1.5306329967849887e-06, "loss": 0.0242, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 283568255.0, "step": 342 }, { "entropy": 0.507598876953125, "epoch": 3.8977272727272725, "grad_norm": 2.3164448440263428, "learning_rate": 1.5162019847087616e-06, "loss": 0.0054, "mean_token_accuracy": 1.0, "num_tokens": 284396996.0, "step": 343 }, { "entropy": 0.507171630859375, "epoch": 3.909090909090909, "grad_norm": 2.8517936981521945, "learning_rate": 1.5018096564043333e-06, "loss": 0.0097, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 285208621.0, "step": 344 }, { "entropy": 0.50006103515625, "epoch": 3.9204545454545454, "grad_norm": 4.123129780590149, "learning_rate": 1.4874565777902518e-06, "loss": 0.0107, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 286068274.0, "step": 345 }, { "entropy": 0.513397216796875, "epoch": 3.9318181818181817, "grad_norm": 3.52925414394255, "learning_rate": 1.4731433132417316e-06, "loss": 0.0119, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 286884909.0, "step": 346 }, { "entropy": 0.5006332397460938, "epoch": 3.9431818181818183, "grad_norm": 4.776748695721814, "learning_rate": 1.4588704255684697e-06, "loss": 0.0165, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 287730733.0, "step": 347 }, { "entropy": 0.504791259765625, "epoch": 3.9545454545454546, "grad_norm": 2.537558079581804, "learning_rate": 1.4446384759925024e-06, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 288550016.0, "step": 348 }, { "entropy": 0.5134658813476562, "epoch": 3.965909090909091, "grad_norm": 1.085201575355781, "learning_rate": 1.4304480241261529e-06, "loss": 0.0044, "mean_token_accuracy": 1.0, "num_tokens": 289378888.0, "step": 349 }, { "entropy": 0.47869110107421875, "epoch": 3.9772727272727275, "grad_norm": 1.4589087196629151, "learning_rate": 1.4162996279500158e-06, "loss": 0.0051, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 290281736.0, "step": 350 }, { "entropy": 0.5169143676757812, "epoch": 3.9886363636363638, "grad_norm": 3.4093651803696177, "learning_rate": 1.4021938437910181e-06, "loss": 0.0221, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 291064628.0, "step": 351 }, { "entropy": 0.5065155029296875, "epoch": 4.0, "grad_norm": 3.137419095072893, "learning_rate": 1.388131226300552e-06, "loss": 0.0179, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 291890322.0, "step": 352 }, { "entropy": 0.5136337280273438, "epoch": 4.011363636363637, "grad_norm": 1.4892912968011076, "learning_rate": 1.374112328432652e-06, "loss": 0.0047, "mean_token_accuracy": 1.0, "num_tokens": 292701363.0, "step": 353 }, { "entropy": 0.5075454711914062, "epoch": 4.0227272727272725, "grad_norm": 2.654487334818569, "learning_rate": 1.3601377014222688e-06, "loss": 0.0149, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 293532137.0, "step": 354 }, { "entropy": 0.49842071533203125, "epoch": 4.034090909090909, "grad_norm": 3.028217204521398, "learning_rate": 1.3462078947635781e-06, "loss": 0.0063, "mean_token_accuracy": 1.0, "num_tokens": 294375049.0, "step": 355 }, { "entropy": 0.5129852294921875, "epoch": 4.045454545454546, "grad_norm": 1.9243706005279761, "learning_rate": 1.3323234561883847e-06, "loss": 0.0041, "mean_token_accuracy": 1.0, "num_tokens": 295187394.0, "step": 356 }, { "entropy": 0.5176849365234375, "epoch": 4.056818181818182, "grad_norm": 2.696144952101861, "learning_rate": 1.318484931644582e-06, "loss": 0.0131, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 295972208.0, "step": 357 }, { "entropy": 0.48602294921875, "epoch": 4.068181818181818, "grad_norm": 6.427614860445277, "learning_rate": 1.3046928652746833e-06, "loss": 0.0062, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 296841049.0, "step": 358 }, { "entropy": 0.48738861083984375, "epoch": 4.079545454545454, "grad_norm": 4.960528563198844, "learning_rate": 1.2909477993944286e-06, "loss": 0.0264, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 297687674.0, "step": 359 }, { "entropy": 0.4840240478515625, "epoch": 4.090909090909091, "grad_norm": 6.171675343056278, "learning_rate": 1.2772502744714592e-06, "loss": 0.0211, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 298539579.0, "step": 360 }, { "entropy": 0.49788665771484375, "epoch": 4.1022727272727275, "grad_norm": 7.239641501067476, "learning_rate": 1.2636008291040618e-06, "loss": 0.0096, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 299370858.0, "step": 361 }, { "entropy": 0.5033340454101562, "epoch": 4.113636363636363, "grad_norm": 5.124511979744234, "learning_rate": 1.2500000000000007e-06, "loss": 0.0206, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 300193508.0, "step": 362 }, { "entropy": 0.494537353515625, "epoch": 4.125, "grad_norm": 4.156621336632535, "learning_rate": 1.236448321955401e-06, "loss": 0.0187, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 301045850.0, "step": 363 }, { "entropy": 0.5119476318359375, "epoch": 4.136363636363637, "grad_norm": 2.0665220666683353, "learning_rate": 1.222946327833731e-06, "loss": 0.0037, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 301841078.0, "step": 364 }, { "entropy": 0.4991302490234375, "epoch": 4.1477272727272725, "grad_norm": 7.318250997291169, "learning_rate": 1.2094945485448424e-06, "loss": 0.0076, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 302689150.0, "step": 365 }, { "entropy": 0.528778076171875, "epoch": 4.159090909090909, "grad_norm": 2.876075984900272, "learning_rate": 1.196093513024099e-06, "loss": 0.0131, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 303488060.0, "step": 366 }, { "entropy": 0.501953125, "epoch": 4.170454545454546, "grad_norm": 7.049810570291405, "learning_rate": 1.182743748211576e-06, "loss": 0.0165, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 304341157.0, "step": 367 }, { "entropy": 0.496307373046875, "epoch": 4.181818181818182, "grad_norm": 8.371870138705338, "learning_rate": 1.1694457790313403e-06, "loss": 0.013, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 305186459.0, "step": 368 }, { "entropy": 0.5236282348632812, "epoch": 4.193181818181818, "grad_norm": 2.7232406189892493, "learning_rate": 1.15620012837081e-06, "loss": 0.0105, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 305978157.0, "step": 369 }, { "entropy": 0.5146942138671875, "epoch": 4.204545454545454, "grad_norm": 1.032258263050008, "learning_rate": 1.1430073170601968e-06, "loss": 0.01, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 306769045.0, "step": 370 }, { "entropy": 0.5118789672851562, "epoch": 4.215909090909091, "grad_norm": 3.0442908381820417, "learning_rate": 1.1298678638520247e-06, "loss": 0.0062, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 307562505.0, "step": 371 }, { "entropy": 0.496978759765625, "epoch": 4.2272727272727275, "grad_norm": 2.0148794276847513, "learning_rate": 1.1167822854007265e-06, "loss": 0.0069, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 308441553.0, "step": 372 }, { "entropy": 0.5107421875, "epoch": 4.238636363636363, "grad_norm": 0.9950819388070626, "learning_rate": 1.1037510962423425e-06, "loss": 0.0074, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 309253697.0, "step": 373 }, { "entropy": 0.5004348754882812, "epoch": 4.25, "grad_norm": 0.4801227777140967, "learning_rate": 1.0907748087742716e-06, "loss": 0.0027, "mean_token_accuracy": 1.0, "num_tokens": 310096440.0, "step": 374 }, { "entropy": 0.4990081787109375, "epoch": 4.261363636363637, "grad_norm": 1.5559270159153842, "learning_rate": 1.0778539332351374e-06, "loss": 0.005, "mean_token_accuracy": 1.0, "num_tokens": 310946699.0, "step": 375 }, { "entropy": 0.5043411254882812, "epoch": 4.2727272727272725, "grad_norm": 1.6535039299735041, "learning_rate": 1.0649889776847161e-06, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 311780327.0, "step": 376 }, { "entropy": 0.5130233764648438, "epoch": 4.284090909090909, "grad_norm": 0.9666561955269652, "learning_rate": 1.0521804479839651e-06, "loss": 0.0107, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 312599293.0, "step": 377 }, { "entropy": 0.506927490234375, "epoch": 4.295454545454546, "grad_norm": 1.0467474251432374, "learning_rate": 1.0394288477751274e-06, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 313433046.0, "step": 378 }, { "entropy": 0.52001953125, "epoch": 4.306818181818182, "grad_norm": 1.9504543167360786, "learning_rate": 1.0267346784619324e-06, "loss": 0.0082, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 314231321.0, "step": 379 }, { "entropy": 0.4952545166015625, "epoch": 4.318181818181818, "grad_norm": 1.1273016731224978, "learning_rate": 1.0140984391898744e-06, "loss": 0.0104, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 315090907.0, "step": 380 }, { "entropy": 0.49346160888671875, "epoch": 4.329545454545454, "grad_norm": 1.1570735678574222, "learning_rate": 1.0015206268265948e-06, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 315934937.0, "step": 381 }, { "entropy": 0.5069198608398438, "epoch": 4.340909090909091, "grad_norm": 1.645370918123862, "learning_rate": 9.890017359423326e-07, "loss": 0.0152, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 316744484.0, "step": 382 }, { "entropy": 0.5043182373046875, "epoch": 4.3522727272727275, "grad_norm": 4.167752578085236, "learning_rate": 9.765422587904919e-07, "loss": 0.0084, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 317569584.0, "step": 383 }, { "entropy": 0.511566162109375, "epoch": 4.363636363636363, "grad_norm": 4.169768059903615, "learning_rate": 9.641426852882717e-07, "loss": 0.0078, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 318380963.0, "step": 384 }, { "entropy": 0.5172653198242188, "epoch": 4.375, "grad_norm": 1.1444633617257265, "learning_rate": 9.518035029974127e-07, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 319177808.0, "step": 385 }, { "entropy": 0.5063858032226562, "epoch": 4.386363636363637, "grad_norm": 3.231652165172302, "learning_rate": 9.395251971050206e-07, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 319991752.0, "step": 386 }, { "entropy": 0.5062026977539062, "epoch": 4.3977272727272725, "grad_norm": 0.4562096042156392, "learning_rate": 9.273082504044903e-07, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 320823781.0, "step": 387 }, { "entropy": 0.4913787841796875, "epoch": 4.409090909090909, "grad_norm": 4.6988922150451975, "learning_rate": 9.151531432765204e-07, "loss": 0.015, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 321672380.0, "step": 388 }, { "entropy": 0.508453369140625, "epoch": 4.420454545454546, "grad_norm": 1.933648265502234, "learning_rate": 9.030603536702254e-07, "loss": 0.0109, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 322475535.0, "step": 389 }, { "entropy": 0.49637603759765625, "epoch": 4.431818181818182, "grad_norm": 0.4749255618924114, "learning_rate": 8.910303570843423e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 323285742.0, "step": 390 }, { "entropy": 0.5011215209960938, "epoch": 4.443181818181818, "grad_norm": 0.4458695094810891, "learning_rate": 8.790636265485333e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 324121368.0, "step": 391 }, { "entropy": 0.49810791015625, "epoch": 4.454545454545454, "grad_norm": 1.5274219135461808, "learning_rate": 8.67160632604786e-07, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 324973947.0, "step": 392 }, { "entropy": 0.49318695068359375, "epoch": 4.465909090909091, "grad_norm": 1.1132782159428174, "learning_rate": 8.553218432889091e-07, "loss": 0.0026, "mean_token_accuracy": 1.0, "num_tokens": 325804782.0, "step": 393 }, { "entropy": 0.510528564453125, "epoch": 4.4772727272727275, "grad_norm": 3.6263901612660328, "learning_rate": 8.435477241121354e-07, "loss": 0.0049, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 326603097.0, "step": 394 }, { "entropy": 0.48946380615234375, "epoch": 4.488636363636363, "grad_norm": 0.47538702415452083, "learning_rate": 8.31838738042808e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 327455117.0, "step": 395 }, { "entropy": 0.49471282958984375, "epoch": 4.5, "grad_norm": 5.432484108713521, "learning_rate": 8.201953454881844e-07, "loss": 0.0191, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 328271844.0, "step": 396 }, { "entropy": 0.49318695068359375, "epoch": 4.511363636363637, "grad_norm": 2.7195053826099134, "learning_rate": 8.086180042763284e-07, "loss": 0.0057, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 329105170.0, "step": 397 }, { "entropy": 0.5019912719726562, "epoch": 4.5227272727272725, "grad_norm": 1.6339953775013658, "learning_rate": 7.971071696381089e-07, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 329922811.0, "step": 398 }, { "entropy": 0.5027236938476562, "epoch": 4.534090909090909, "grad_norm": 1.9153786326147693, "learning_rate": 7.856632941893e-07, "loss": 0.0028, "mean_token_accuracy": 1.0, "num_tokens": 330722929.0, "step": 399 }, { "entropy": 0.4850311279296875, "epoch": 4.545454545454545, "grad_norm": 1.6886270091316586, "learning_rate": 7.74286827912785e-07, "loss": 0.0025, "mean_token_accuracy": 1.0, "num_tokens": 331559428.0, "step": 400 }, { "entropy": 0.48662567138671875, "epoch": 4.556818181818182, "grad_norm": 1.5068421366606104, "learning_rate": 7.629782181408574e-07, "loss": 0.0065, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 332413515.0, "step": 401 }, { "entropy": 0.482879638671875, "epoch": 4.568181818181818, "grad_norm": 0.47925368613938213, "learning_rate": 7.517379095376418e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 333265076.0, "step": 402 }, { "entropy": 0.49607086181640625, "epoch": 4.579545454545455, "grad_norm": 1.440219771154895, "learning_rate": 7.405663440815968e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 334094385.0, "step": 403 }, { "entropy": 0.490692138671875, "epoch": 4.590909090909091, "grad_norm": 3.7567750667717665, "learning_rate": 7.294639610481461e-07, "loss": 0.0037, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 334947835.0, "step": 404 }, { "entropy": 0.49617767333984375, "epoch": 4.6022727272727275, "grad_norm": 5.25000234097866, "learning_rate": 7.184311969924002e-07, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 335753110.0, "step": 405 }, { "entropy": 0.4870147705078125, "epoch": 4.613636363636363, "grad_norm": 0.2953367174000916, "learning_rate": 7.074684857319928e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 336600931.0, "step": 406 }, { "entropy": 0.47736358642578125, "epoch": 4.625, "grad_norm": 2.92622111202191, "learning_rate": 6.965762583300223e-07, "loss": 0.004, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 337462613.0, "step": 407 }, { "entropy": 0.494781494140625, "epoch": 4.636363636363637, "grad_norm": 0.28460107939061985, "learning_rate": 6.85754943078103e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 338298847.0, "step": 408 }, { "entropy": 0.49102783203125, "epoch": 4.6477272727272725, "grad_norm": 0.6146968002181714, "learning_rate": 6.750049654795199e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 339113689.0, "step": 409 }, { "entropy": 0.492462158203125, "epoch": 4.659090909090909, "grad_norm": 3.809716843433178, "learning_rate": 6.643267482325061e-07, "loss": 0.003, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 339940988.0, "step": 410 }, { "entropy": 0.5035018920898438, "epoch": 4.670454545454545, "grad_norm": 1.467212122986146, "learning_rate": 6.537207112136143e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 340752578.0, "step": 411 }, { "entropy": 0.5028610229492188, "epoch": 4.681818181818182, "grad_norm": 5.889457746996348, "learning_rate": 6.431872714612072e-07, "loss": 0.0105, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 341554983.0, "step": 412 }, { "entropy": 0.5011444091796875, "epoch": 4.693181818181818, "grad_norm": 5.019388657039859, "learning_rate": 6.327268431590664e-07, "loss": 0.0115, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 342358183.0, "step": 413 }, { "entropy": 0.48370361328125, "epoch": 4.704545454545455, "grad_norm": 3.72166366036877, "learning_rate": 6.223398376200956e-07, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 343205856.0, "step": 414 }, { "entropy": 0.48567962646484375, "epoch": 4.715909090909091, "grad_norm": 1.9949701003957454, "learning_rate": 6.1202666327016e-07, "loss": 0.0106, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 344033478.0, "step": 415 }, { "entropy": 0.49942779541015625, "epoch": 4.7272727272727275, "grad_norm": 1.9263163276444206, "learning_rate": 6.017877256320132e-07, "loss": 0.0102, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 344827009.0, "step": 416 }, { "entropy": 0.4829559326171875, "epoch": 4.738636363636363, "grad_norm": 0.34749654268128166, "learning_rate": 5.916234273093624e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 345670205.0, "step": 417 }, { "entropy": 0.49527740478515625, "epoch": 4.75, "grad_norm": 1.7904763213822166, "learning_rate": 5.815341679710327e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 346471682.0, "step": 418 }, { "entropy": 0.485931396484375, "epoch": 4.761363636363637, "grad_norm": 0.45454733529461383, "learning_rate": 5.715203443352526e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 347308458.0, "step": 419 }, { "entropy": 0.4832916259765625, "epoch": 4.7727272727272725, "grad_norm": 0.4231547605381226, "learning_rate": 5.615823501540546e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 348141445.0, "step": 420 }, { "entropy": 0.4787139892578125, "epoch": 4.784090909090909, "grad_norm": 3.404461235555557, "learning_rate": 5.51720576197794e-07, "loss": 0.0031, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 349005773.0, "step": 421 }, { "entropy": 0.4886322021484375, "epoch": 4.795454545454545, "grad_norm": 2.694747189659317, "learning_rate": 5.419354102397792e-07, "loss": 0.0171, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 349829599.0, "step": 422 }, { "entropy": 0.48751068115234375, "epoch": 4.806818181818182, "grad_norm": 1.4354262979624335, "learning_rate": 5.32227237041032e-07, "loss": 0.0098, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 350647234.0, "step": 423 }, { "entropy": 0.46784210205078125, "epoch": 4.818181818181818, "grad_norm": 0.7211976403917981, "learning_rate": 5.22596438335149e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 351529720.0, "step": 424 }, { "entropy": 0.48293304443359375, "epoch": 4.829545454545455, "grad_norm": 0.7921740483961287, "learning_rate": 5.130433928132983e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 352366018.0, "step": 425 }, { "entropy": 0.4916839599609375, "epoch": 4.840909090909091, "grad_norm": 3.255008493711048, "learning_rate": 5.035684761093273e-07, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 353187956.0, "step": 426 }, { "entropy": 0.493743896484375, "epoch": 4.8522727272727275, "grad_norm": 0.4639379515372835, "learning_rate": 4.941720607849912e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 354005342.0, "step": 427 }, { "entropy": 0.48528289794921875, "epoch": 4.863636363636363, "grad_norm": 2.7318226665423335, "learning_rate": 4.848545163153048e-07, "loss": 0.0106, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 354847121.0, "step": 428 }, { "entropy": 0.47846221923828125, "epoch": 4.875, "grad_norm": 3.3364962212306226, "learning_rate": 4.756162090740135e-07, "loss": 0.004, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 355685163.0, "step": 429 }, { "entropy": 0.49137115478515625, "epoch": 4.886363636363637, "grad_norm": 0.6805978064704994, "learning_rate": 4.6645750231918864e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 356523319.0, "step": 430 }, { "entropy": 0.4839019775390625, "epoch": 4.8977272727272725, "grad_norm": 0.33583284419497017, "learning_rate": 4.5737875617894225e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 357363691.0, "step": 431 }, { "entropy": 0.49709320068359375, "epoch": 4.909090909090909, "grad_norm": 0.3853403035900075, "learning_rate": 4.4838032763726806e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 358180072.0, "step": 432 }, { "entropy": 0.47998809814453125, "epoch": 4.920454545454545, "grad_norm": 1.590063239763804, "learning_rate": 4.394625705200012e-07, "loss": 0.0111, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 359024240.0, "step": 433 }, { "entropy": 0.49835205078125, "epoch": 4.931818181818182, "grad_norm": 5.266170835548258, "learning_rate": 4.3062583548091256e-07, "loss": 0.0119, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 359847230.0, "step": 434 }, { "entropy": 0.4920501708984375, "epoch": 4.943181818181818, "grad_norm": 0.27517320814335366, "learning_rate": 4.218704699879117e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 360685374.0, "step": 435 }, { "entropy": 0.47795867919921875, "epoch": 4.954545454545455, "grad_norm": 4.084902711490606, "learning_rate": 4.1319681830939124e-07, "loss": 0.0131, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 361526820.0, "step": 436 }, { "entropy": 0.48442840576171875, "epoch": 4.965909090909091, "grad_norm": 0.6198177710303161, "learning_rate": 4.0460522150068684e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 362340123.0, "step": 437 }, { "entropy": 0.48004913330078125, "epoch": 4.9772727272727275, "grad_norm": 0.5029069058309816, "learning_rate": 3.9609601739066664e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 363177369.0, "step": 438 }, { "entropy": 0.48892974853515625, "epoch": 4.988636363636363, "grad_norm": 0.29134437168907484, "learning_rate": 3.876695405684486e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 364019483.0, "step": 439 }, { "entropy": 0.5021286010742188, "epoch": 5.0, "grad_norm": 1.9225403101164527, "learning_rate": 3.793261223702441e-07, "loss": 0.008, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 364810814.0, "step": 440 }, { "entropy": 0.4940185546875, "epoch": 5.011363636363637, "grad_norm": 2.938178788349801, "learning_rate": 3.7106609086632635e-07, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 365630038.0, "step": 441 }, { "entropy": 0.489227294921875, "epoch": 5.0227272727272725, "grad_norm": 0.291480299250409, "learning_rate": 3.628897708481377e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 366463667.0, "step": 442 }, { "entropy": 0.4832611083984375, "epoch": 5.034090909090909, "grad_norm": 3.0078477238914507, "learning_rate": 3.5479748381550855e-07, "loss": 0.0094, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 367311117.0, "step": 443 }, { "entropy": 0.490234375, "epoch": 5.045454545454546, "grad_norm": 3.510570730894089, "learning_rate": 3.4678954796402624e-07, "loss": 0.0035, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 368129489.0, "step": 444 }, { "entropy": 0.4967803955078125, "epoch": 5.056818181818182, "grad_norm": 0.7674575202655312, "learning_rate": 3.388662781725141e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 368926469.0, "step": 445 }, { "entropy": 0.49871826171875, "epoch": 5.068181818181818, "grad_norm": 0.3305389935908899, "learning_rate": 3.310279859906565e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 369725096.0, "step": 446 }, { "entropy": 0.48119354248046875, "epoch": 5.079545454545454, "grad_norm": 0.6603801299623112, "learning_rate": 3.232749796267451e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 370550137.0, "step": 447 }, { "entropy": 0.48421478271484375, "epoch": 5.090909090909091, "grad_norm": 5.247118592826336, "learning_rate": 3.1560756393556187e-07, "loss": 0.0057, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 371400483.0, "step": 448 }, { "entropy": 0.49166107177734375, "epoch": 5.1022727272727275, "grad_norm": 0.27569457066734937, "learning_rate": 3.0802604040639034e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 372240572.0, "step": 449 }, { "entropy": 0.47963714599609375, "epoch": 5.113636363636363, "grad_norm": 0.311204056940979, "learning_rate": 3.0053070715116153e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 373094230.0, "step": 450 }, { "entropy": 0.488861083984375, "epoch": 5.125, "grad_norm": 0.24417698499128285, "learning_rate": 2.9312185889273147e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 373920609.0, "step": 451 }, { "entropy": 0.4911956787109375, "epoch": 5.136363636363637, "grad_norm": 0.2718455632587909, "learning_rate": 2.8579978695329386e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 374735909.0, "step": 452 }, { "entropy": 0.4797515869140625, "epoch": 5.1477272727272725, "grad_norm": 0.23825012117768773, "learning_rate": 2.785647792429233e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 375593425.0, "step": 453 }, { "entropy": 0.48360443115234375, "epoch": 5.159090909090909, "grad_norm": 0.23818462066480345, "learning_rate": 2.714171202482538e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 376407379.0, "step": 454 }, { "entropy": 0.48807525634765625, "epoch": 5.170454545454546, "grad_norm": 3.2507164294292457, "learning_rate": 2.6435709102129727e-07, "loss": 0.0077, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 377243881.0, "step": 455 }, { "entropy": 0.49269866943359375, "epoch": 5.181818181818182, "grad_norm": 1.259251187795761, "learning_rate": 2.5738496916838524e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 378061945.0, "step": 456 }, { "entropy": 0.489898681640625, "epoch": 5.193181818181818, "grad_norm": 0.24830082315404475, "learning_rate": 2.505010288392587e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 378873096.0, "step": 457 }, { "entropy": 0.48850250244140625, "epoch": 5.204545454545454, "grad_norm": 5.651716128783114, "learning_rate": 2.4370554071628613e-07, "loss": 0.0112, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 379691075.0, "step": 458 }, { "entropy": 0.494598388671875, "epoch": 5.215909090909091, "grad_norm": 0.2395804306738229, "learning_rate": 2.3699877200382026e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 380488242.0, "step": 459 }, { "entropy": 0.4767303466796875, "epoch": 5.2272727272727275, "grad_norm": 0.2389717565975432, "learning_rate": 2.303809864176909e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 381344689.0, "step": 460 }, { "entropy": 0.47606658935546875, "epoch": 5.238636363636363, "grad_norm": 0.23243670447716458, "learning_rate": 2.2385244417483743e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 382191810.0, "step": 461 }, { "entropy": 0.490386962890625, "epoch": 5.25, "grad_norm": 2.071131604510824, "learning_rate": 2.174134019830726e-07, "loss": 0.0048, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 383013514.0, "step": 462 }, { "entropy": 0.4799957275390625, "epoch": 5.261363636363637, "grad_norm": 0.23311333167810783, "learning_rate": 2.1106411303099455e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 383851533.0, "step": 463 }, { "entropy": 0.47631072998046875, "epoch": 5.2727272727272725, "grad_norm": 0.3585355393010696, "learning_rate": 2.0480482697802507e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 384690195.0, "step": 464 }, { "entropy": 0.4852447509765625, "epoch": 5.284090909090909, "grad_norm": 3.2075162933183266, "learning_rate": 1.986357899445976e-07, "loss": 0.0075, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 385510439.0, "step": 465 }, { "entropy": 0.4665679931640625, "epoch": 5.295454545454546, "grad_norm": 0.24275030501776346, "learning_rate": 1.9255724450247676e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 386394511.0, "step": 466 }, { "entropy": 0.48211669921875, "epoch": 5.306818181818182, "grad_norm": 2.8992951132080074, "learning_rate": 1.8656942966522124e-07, "loss": 0.0189, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 387221719.0, "step": 467 }, { "entropy": 0.49072265625, "epoch": 5.318181818181818, "grad_norm": 5.332513171073772, "learning_rate": 1.8067258087878597e-07, "loss": 0.005, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 388021747.0, "step": 468 }, { "entropy": 0.495758056640625, "epoch": 5.329545454545454, "grad_norm": 2.471809529485113, "learning_rate": 1.748669300122627e-07, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 388858436.0, "step": 469 }, { "entropy": 0.47252655029296875, "epoch": 5.340909090909091, "grad_norm": 0.25195781137704687, "learning_rate": 1.691527053487646e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 389713106.0, "step": 470 }, { "entropy": 0.47857666015625, "epoch": 5.3522727272727275, "grad_norm": 0.35830977915651785, "learning_rate": 1.635301315764484e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 390560663.0, "step": 471 }, { "entropy": 0.4821929931640625, "epoch": 5.363636363636363, "grad_norm": 1.1731732549746987, "learning_rate": 1.579994297796808e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 391378014.0, "step": 472 }, { "entropy": 0.47939300537109375, "epoch": 5.375, "grad_norm": 0.2964542659586891, "learning_rate": 1.5256081743034336e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 392217186.0, "step": 473 }, { "entropy": 0.490966796875, "epoch": 5.386363636363637, "grad_norm": 0.36855686939198057, "learning_rate": 1.472145083792842e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 393043338.0, "step": 474 }, { "entropy": 0.49106597900390625, "epoch": 5.3977272727272725, "grad_norm": 0.24516913749219083, "learning_rate": 1.419607128479053e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 393861984.0, "step": 475 }, { "entropy": 0.4851837158203125, "epoch": 5.409090909090909, "grad_norm": 0.30692899589107897, "learning_rate": 1.3679963741990127e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 394694902.0, "step": 476 }, { "entropy": 0.48297882080078125, "epoch": 5.420454545454546, "grad_norm": 0.2871018170154113, "learning_rate": 1.317314850331314e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 395512370.0, "step": 477 }, { "entropy": 0.481048583984375, "epoch": 5.431818181818182, "grad_norm": 0.23400972272255227, "learning_rate": 1.2675645497164352e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 396351709.0, "step": 478 }, { "entropy": 0.4695892333984375, "epoch": 5.443181818181818, "grad_norm": 0.23441130155017773, "learning_rate": 1.2187474285783623e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 397205354.0, "step": 479 }, { "entropy": 0.4716339111328125, "epoch": 5.454545454545454, "grad_norm": 0.2363995447234875, "learning_rate": 1.1708654064476743e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 398068964.0, "step": 480 }, { "entropy": 0.47133636474609375, "epoch": 5.465909090909091, "grad_norm": 0.24166295350052908, "learning_rate": 1.1239203660860648e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 398931333.0, "step": 481 }, { "entropy": 0.48919677734375, "epoch": 5.4772727272727275, "grad_norm": 0.23376095292532034, "learning_rate": 1.0779141534123127e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 399743561.0, "step": 482 }, { "entropy": 0.4713287353515625, "epoch": 5.488636363636363, "grad_norm": 3.7980897041869994, "learning_rate": 1.0328485774296875e-07, "loss": 0.0068, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 400597274.0, "step": 483 }, { "entropy": 0.47472381591796875, "epoch": 5.5, "grad_norm": 0.22904406267035865, "learning_rate": 9.887254101548422e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 401443536.0, "step": 484 }, { "entropy": 0.49383544921875, "epoch": 5.511363636363637, "grad_norm": 1.9032152568586889, "learning_rate": 9.455463865481019e-08, "loss": 0.0118, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 402253531.0, "step": 485 }, { "entropy": 0.47495269775390625, "epoch": 5.5227272727272725, "grad_norm": 2.058118970687396, "learning_rate": 9.033132044452775e-08, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 403096077.0, "step": 486 }, { "entropy": 0.4779815673828125, "epoch": 5.534090909090909, "grad_norm": 0.2320773396883086, "learning_rate": 8.620275244908826e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 403931464.0, "step": 487 }, { "entropy": 0.481781005859375, "epoch": 5.545454545454545, "grad_norm": 0.906876727315481, "learning_rate": 8.216909700728498e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 404753168.0, "step": 488 }, { "entropy": 0.49048614501953125, "epoch": 5.556818181818182, "grad_norm": 0.23101403155939168, "learning_rate": 7.823051272586812e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 405574379.0, "step": 489 }, { "entropy": 0.47937774658203125, "epoch": 5.568181818181818, "grad_norm": 0.22825579969208615, "learning_rate": 7.438715447331018e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 406389523.0, "step": 490 }, { "entropy": 0.48981475830078125, "epoch": 5.579545454545455, "grad_norm": 0.2503992325015645, "learning_rate": 7.063917337371495e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 407214371.0, "step": 491 }, { "entropy": 0.48102569580078125, "epoch": 5.590909090909091, "grad_norm": 0.2513608944740575, "learning_rate": 6.698671680087643e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 408046275.0, "step": 492 }, { "entropy": 0.47139739990234375, "epoch": 5.6022727272727275, "grad_norm": 0.23256492242420046, "learning_rate": 6.342992837248235e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 408911193.0, "step": 493 }, { "entropy": 0.48833465576171875, "epoch": 5.613636363636363, "grad_norm": 0.23168022955887826, "learning_rate": 5.996894794446817e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 409719072.0, "step": 494 }, { "entropy": 0.49280548095703125, "epoch": 5.625, "grad_norm": 0.23122760281603366, "learning_rate": 5.660391160551837e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 410525184.0, "step": 495 }, { "entropy": 0.4806060791015625, "epoch": 5.636363636363637, "grad_norm": 0.22643998190015072, "learning_rate": 5.333495167171354e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 411347407.0, "step": 496 }, { "entropy": 0.48348236083984375, "epoch": 5.6477272727272725, "grad_norm": 0.2258812067160099, "learning_rate": 5.016219668132871e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 412178040.0, "step": 497 }, { "entropy": 0.48406982421875, "epoch": 5.659090909090909, "grad_norm": 0.22725534443329942, "learning_rate": 4.708577138977932e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 413017601.0, "step": 498 }, { "entropy": 0.4803466796875, "epoch": 5.670454545454545, "grad_norm": 0.22801318528268766, "learning_rate": 4.410579676471571e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 413853566.0, "step": 499 }, { "entropy": 0.47000885009765625, "epoch": 5.681818181818182, "grad_norm": 0.2325930970029929, "learning_rate": 4.1222389981265546e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 414704010.0, "step": 500 }, { "entropy": 0.47524261474609375, "epoch": 5.693181818181818, "grad_norm": 0.22453266382199571, "learning_rate": 3.843566441742774e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 415551512.0, "step": 501 }, { "entropy": 0.479248046875, "epoch": 5.704545454545455, "grad_norm": 0.23083370781694745, "learning_rate": 3.574572964961304e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 416393119.0, "step": 502 }, { "entropy": 0.47689056396484375, "epoch": 5.715909090909091, "grad_norm": 2.1901642978456235, "learning_rate": 3.3152691448336825e-08, "loss": 0.003, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 417211364.0, "step": 503 }, { "entropy": 0.4746856689453125, "epoch": 5.7272727272727275, "grad_norm": 0.22865275732966547, "learning_rate": 3.065665177405808e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 418047976.0, "step": 504 }, { "entropy": 0.487152099609375, "epoch": 5.738636363636363, "grad_norm": 0.22775595614971952, "learning_rate": 2.825770877317363e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 418866422.0, "step": 505 }, { "entropy": 0.49505615234375, "epoch": 5.75, "grad_norm": 0.23101305104816183, "learning_rate": 2.5955956774154633e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 419667732.0, "step": 506 }, { "entropy": 0.48188018798828125, "epoch": 5.761363636363637, "grad_norm": 0.22916418909007583, "learning_rate": 2.3751486283840884e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 420500583.0, "step": 507 }, { "entropy": 0.4888153076171875, "epoch": 5.7727272727272725, "grad_norm": 0.2258574174809483, "learning_rate": 2.1644383983880356e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 421327571.0, "step": 508 }, { "entropy": 0.47490692138671875, "epoch": 5.784090909090909, "grad_norm": 0.22690266496506178, "learning_rate": 1.9634732727321636e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 422199122.0, "step": 509 }, { "entropy": 0.4826202392578125, "epoch": 5.795454545454545, "grad_norm": 0.2321274055889175, "learning_rate": 1.7722611535355426e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 423032124.0, "step": 510 }, { "entropy": 0.4724884033203125, "epoch": 5.806818181818182, "grad_norm": 1.107169238484095, "learning_rate": 1.5908095594207585e-08, "loss": 0.0086, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 423876920.0, "step": 511 }, { "entropy": 0.48011016845703125, "epoch": 5.818181818181818, "grad_norm": 0.22872107811700226, "learning_rate": 1.4191256252182595e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 424725606.0, "step": 512 }, { "entropy": 0.479339599609375, "epoch": 5.829545454545455, "grad_norm": 0.222813351912687, "learning_rate": 1.2572161016858874e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 425566413.0, "step": 513 }, { "entropy": 0.48807525634765625, "epoch": 5.840909090909091, "grad_norm": 0.23276414117276925, "learning_rate": 1.1050873552433394e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 426383910.0, "step": 514 }, { "entropy": 0.47275543212890625, "epoch": 5.8522727272727275, "grad_norm": 0.23492927422242632, "learning_rate": 9.627453677218402e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 427233159.0, "step": 515 }, { "entropy": 0.4876251220703125, "epoch": 5.863636363636363, "grad_norm": 0.2293692620353427, "learning_rate": 8.301957361289969e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 428045659.0, "step": 516 }, { "entropy": 0.4870147705078125, "epoch": 5.875, "grad_norm": 2.4244775640221263, "learning_rate": 7.074436724286704e-09, "loss": 0.0064, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 428849891.0, "step": 517 }, { "entropy": 0.4868316650390625, "epoch": 5.886363636363637, "grad_norm": 0.22674009504309858, "learning_rate": 5.944940033360269e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 429665132.0, "step": 518 }, { "entropy": 0.48014068603515625, "epoch": 5.8977272727272725, "grad_norm": 0.2269960396547218, "learning_rate": 4.913511701278017e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 430492556.0, "step": 519 }, { "entropy": 0.4708099365234375, "epoch": 5.909090909090909, "grad_norm": 0.229788356926301, "learning_rate": 3.98019228467661e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 431344266.0, "step": 520 }, { "entropy": 0.4788055419921875, "epoch": 5.920454545454545, "grad_norm": 2.692434219230262, "learning_rate": 3.1450184824657892e-09, "loss": 0.0072, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 432174654.0, "step": 521 }, { "entropy": 0.48198699951171875, "epoch": 5.931818181818182, "grad_norm": 5.577294262893831, "learning_rate": 2.408023134387871e-09, "loss": 0.0035, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 433016456.0, "step": 522 }, { "entropy": 0.47763824462890625, "epoch": 5.943181818181818, "grad_norm": 0.22567980456899045, "learning_rate": 1.7692352197240525e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 433844975.0, "step": 523 }, { "entropy": 0.47437286376953125, "epoch": 5.954545454545455, "grad_norm": 0.22722227788364677, "learning_rate": 1.2286798561572666e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 434715113.0, "step": 524 }, { "entropy": 0.487579345703125, "epoch": 5.965909090909091, "grad_norm": 0.22818571422197098, "learning_rate": 7.863782987821422e-10, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 435516656.0, "step": 525 }, { "entropy": 0.4957275390625, "epoch": 5.9772727272727275, "grad_norm": 0.2305305154875918, "learning_rate": 4.4234793927094845e-10, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 436285686.0, "step": 526 }, { "entropy": 0.49588775634765625, "epoch": 5.988636363636363, "grad_norm": 2.880139289310222, "learning_rate": 1.9660230518886436e-10, "loss": 0.0078, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 437058962.0, "step": 527 }, { "entropy": 0.4855499267578125, "epoch": 6.0, "grad_norm": 0.22660971052663118, "learning_rate": 4.915105946246002e-11, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 437879517.0, "step": 528 }, { "epoch": 6.0, "step": 528, "total_flos": 515196244262912.0, "train_loss": 0.5535088468757088, "train_runtime": 94446.5755, "train_samples_per_second": 2.607, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 528, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 44, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 515196244262912.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }