{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5571823120117188, "epoch": 0.011363636363636364, "grad_norm": 385.21640368234523, "learning_rate": 0.0, "loss": 8.3268, "mean_token_accuracy": 0.0, "num_tokens": 822388.0, "step": 1 }, { "entropy": 0.5536270141601562, "epoch": 0.022727272727272728, "grad_norm": 384.42809469067583, "learning_rate": 1.8518518518518518e-07, "loss": 8.3143, "mean_token_accuracy": 0.0, "num_tokens": 1663780.0, "step": 2 }, { "entropy": 0.5437393188476562, "epoch": 0.03409090909090909, "grad_norm": 381.67155444176905, "learning_rate": 3.7037037037037036e-07, "loss": 8.3398, "mean_token_accuracy": 0.0, "num_tokens": 2509387.0, "step": 3 }, { "entropy": 0.565765380859375, "epoch": 0.045454545454545456, "grad_norm": 393.0428633816126, "learning_rate": 5.555555555555555e-07, "loss": 8.2353, "mean_token_accuracy": 0.0, "num_tokens": 3309476.0, "step": 4 }, { "entropy": 0.569061279296875, "epoch": 0.056818181818181816, "grad_norm": 393.9524081345757, "learning_rate": 7.407407407407407e-07, "loss": 8.0805, "mean_token_accuracy": 0.0, "num_tokens": 4091951.0, "step": 5 }, { "entropy": 0.5697250366210938, "epoch": 0.06818181818181818, "grad_norm": 393.9329273505747, "learning_rate": 9.259259259259259e-07, "loss": 8.0282, "mean_token_accuracy": 0.0, "num_tokens": 4914273.0, "step": 6 }, { "entropy": 0.5641937255859375, "epoch": 0.07954545454545454, "grad_norm": 395.153279708828, "learning_rate": 1.111111111111111e-06, "loss": 7.4123, "mean_token_accuracy": 0.0, "num_tokens": 5731174.0, "step": 7 }, { "entropy": 0.5568161010742188, "epoch": 0.09090909090909091, "grad_norm": 268.0751377771076, "learning_rate": 1.2962962962962962e-06, "loss": 5.8473, "mean_token_accuracy": 0.0013020833721384406, "num_tokens": 6548593.0, "step": 8 }, { "entropy": 0.5587234497070312, "epoch": 0.10227272727272728, "grad_norm": 224.3841134938914, "learning_rate": 1.4814814814814815e-06, "loss": 5.5565, "mean_token_accuracy": 0.0026041667442768812, "num_tokens": 7368193.0, "step": 9 }, { "entropy": 0.5561370849609375, "epoch": 0.11363636363636363, "grad_norm": 189.00265530913885, "learning_rate": 1.6666666666666667e-06, "loss": 5.2765, "mean_token_accuracy": 0.02343750069849193, "num_tokens": 8218117.0, "step": 10 }, { "entropy": 0.5667953491210938, "epoch": 0.125, "grad_norm": 103.65546608546263, "learning_rate": 1.8518518518518519e-06, "loss": 4.1131, "mean_token_accuracy": 0.5013020982732996, "num_tokens": 9033810.0, "step": 11 }, { "entropy": 0.5578842163085938, "epoch": 0.13636363636363635, "grad_norm": 96.59175301418695, "learning_rate": 2.037037037037037e-06, "loss": 4.0319, "mean_token_accuracy": 0.5195312654832378, "num_tokens": 9874463.0, "step": 12 }, { "entropy": 0.5625457763671875, "epoch": 0.14772727272727273, "grad_norm": 82.36322891422334, "learning_rate": 2.222222222222222e-06, "loss": 3.8226, "mean_token_accuracy": 0.5182291821110994, "num_tokens": 10695671.0, "step": 13 }, { "entropy": 0.5607070922851562, "epoch": 0.1590909090909091, "grad_norm": 74.48472875071353, "learning_rate": 2.4074074074074075e-06, "loss": 3.7081, "mean_token_accuracy": 0.5078125151339918, "num_tokens": 11502999.0, "step": 14 }, { "entropy": 0.544952392578125, "epoch": 0.17045454545454544, "grad_norm": 59.80968159915808, "learning_rate": 2.5925925925925925e-06, "loss": 3.269, "mean_token_accuracy": 0.514322931994684, "num_tokens": 12334457.0, "step": 15 }, { "entropy": 0.5462646484375, "epoch": 0.18181818181818182, "grad_norm": 58.86687930413585, "learning_rate": 2.7777777777777783e-06, "loss": 3.1993, "mean_token_accuracy": 0.5312500158324838, "num_tokens": 13183738.0, "step": 16 }, { "entropy": 0.556304931640625, "epoch": 0.19318181818181818, "grad_norm": 57.620691853575465, "learning_rate": 2.962962962962963e-06, "loss": 3.1523, "mean_token_accuracy": 0.5429687661817297, "num_tokens": 14017046.0, "step": 17 }, { "entropy": 0.5455474853515625, "epoch": 0.20454545454545456, "grad_norm": 57.480386770069764, "learning_rate": 3.1481481481481483e-06, "loss": 3.0979, "mean_token_accuracy": 0.5234375155996531, "num_tokens": 14864077.0, "step": 18 }, { "entropy": 0.5444412231445312, "epoch": 0.2159090909090909, "grad_norm": 57.75046967405615, "learning_rate": 3.3333333333333333e-06, "loss": 3.0502, "mean_token_accuracy": 0.5299479324603453, "num_tokens": 15703491.0, "step": 19 }, { "entropy": 0.54150390625, "epoch": 0.22727272727272727, "grad_norm": 58.06698268429491, "learning_rate": 3.5185185185185187e-06, "loss": 2.9672, "mean_token_accuracy": 0.5299479324603453, "num_tokens": 16547715.0, "step": 20 }, { "entropy": 0.532958984375, "epoch": 0.23863636363636365, "grad_norm": 57.73279218676568, "learning_rate": 3.7037037037037037e-06, "loss": 2.9237, "mean_token_accuracy": 0.5481770996702835, "num_tokens": 17398906.0, "step": 21 }, { "entropy": 0.5381011962890625, "epoch": 0.25, "grad_norm": 58.76356982867235, "learning_rate": 3.88888888888889e-06, "loss": 2.9056, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 18233243.0, "step": 22 }, { "entropy": 0.5452651977539062, "epoch": 0.26136363636363635, "grad_norm": 60.813397297916524, "learning_rate": 4.074074074074074e-06, "loss": 2.8986, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 19088014.0, "step": 23 }, { "entropy": 0.535552978515625, "epoch": 0.2727272727272727, "grad_norm": 58.575483868055635, "learning_rate": 4.2592592592592596e-06, "loss": 2.8673, "mean_token_accuracy": 0.5442708495538682, "num_tokens": 19910897.0, "step": 24 }, { "entropy": 0.551544189453125, "epoch": 0.2840909090909091, "grad_norm": 58.059741108606026, "learning_rate": 4.444444444444444e-06, "loss": 2.8368, "mean_token_accuracy": 0.5416666828095913, "num_tokens": 20704060.0, "step": 25 }, { "entropy": 0.5462570190429688, "epoch": 0.29545454545454547, "grad_norm": 57.74145163004164, "learning_rate": 4.62962962962963e-06, "loss": 2.8138, "mean_token_accuracy": 0.5559895999031141, "num_tokens": 21502809.0, "step": 26 }, { "entropy": 0.535614013671875, "epoch": 0.3068181818181818, "grad_norm": 57.087826374867525, "learning_rate": 4.814814814814815e-06, "loss": 2.7772, "mean_token_accuracy": 0.5546875165309757, "num_tokens": 22341880.0, "step": 27 }, { "entropy": 0.5427932739257812, "epoch": 0.3181818181818182, "grad_norm": 57.19764036404579, "learning_rate": 5e-06, "loss": 2.7402, "mean_token_accuracy": 0.5625000167638063, "num_tokens": 23163457.0, "step": 28 }, { "entropy": 0.5347366333007812, "epoch": 0.32954545454545453, "grad_norm": 57.175262078414924, "learning_rate": 4.999950848940538e-06, "loss": 2.7043, "mean_token_accuracy": 0.5598958500195295, "num_tokens": 24015452.0, "step": 29 }, { "entropy": 0.5446624755859375, "epoch": 0.3409090909090909, "grad_norm": 58.34082962604848, "learning_rate": 4.999803397694811e-06, "loss": 2.6615, "mean_token_accuracy": 0.5807291839737445, "num_tokens": 24837556.0, "step": 30 }, { "entropy": 0.5349578857421875, "epoch": 0.3522727272727273, "grad_norm": 59.10124128246407, "learning_rate": 4.999557652060729e-06, "loss": 2.6609, "mean_token_accuracy": 0.5494791830424219, "num_tokens": 25669799.0, "step": 31 }, { "entropy": 0.5413665771484375, "epoch": 0.36363636363636365, "grad_norm": 58.793667611594415, "learning_rate": 4.9992136217012184e-06, "loss": 2.6378, "mean_token_accuracy": 0.5533854331588373, "num_tokens": 26474626.0, "step": 32 }, { "entropy": 0.5285873413085938, "epoch": 0.375, "grad_norm": 58.352494787635415, "learning_rate": 4.998771320143843e-06, "loss": 2.5815, "mean_token_accuracy": 0.5755208504851907, "num_tokens": 27323183.0, "step": 33 }, { "entropy": 0.5407485961914062, "epoch": 0.38636363636363635, "grad_norm": 59.10192767158911, "learning_rate": 4.998230764780277e-06, "loss": 2.5841, "mean_token_accuracy": 0.5559895999031141, "num_tokens": 28160848.0, "step": 34 }, { "entropy": 0.5376205444335938, "epoch": 0.3977272727272727, "grad_norm": 58.51220654461849, "learning_rate": 4.9975919768656125e-06, "loss": 2.5609, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 28984769.0, "step": 35 }, { "entropy": 0.5457916259765625, "epoch": 0.4090909090909091, "grad_norm": 58.821677282544606, "learning_rate": 4.996854981517535e-06, "loss": 2.5264, "mean_token_accuracy": 0.5690104336244985, "num_tokens": 29793313.0, "step": 36 }, { "entropy": 0.5445785522460938, "epoch": 0.42045454545454547, "grad_norm": 58.75541166163536, "learning_rate": 4.996019807715324e-06, "loss": 2.483, "mean_token_accuracy": 0.582031267345883, "num_tokens": 30612686.0, "step": 37 }, { "entropy": 0.5450210571289062, "epoch": 0.4318181818181818, "grad_norm": 59.079272782778474, "learning_rate": 4.995086488298723e-06, "loss": 2.4537, "mean_token_accuracy": 0.5781250172294676, "num_tokens": 31438404.0, "step": 38 }, { "entropy": 0.5361175537109375, "epoch": 0.4431818181818182, "grad_norm": 59.24200293608633, "learning_rate": 4.994055059966641e-06, "loss": 2.449, "mean_token_accuracy": 0.5651041835080832, "num_tokens": 32287533.0, "step": 39 }, { "entropy": 0.5356903076171875, "epoch": 0.45454545454545453, "grad_norm": 59.44643683264253, "learning_rate": 4.992925563275714e-06, "loss": 2.4143, "mean_token_accuracy": 0.5716146003687754, "num_tokens": 33108278.0, "step": 40 }, { "entropy": 0.5239410400390625, "epoch": 0.4659090909090909, "grad_norm": 59.81603078314444, "learning_rate": 4.991698042638711e-06, "loss": 2.395, "mean_token_accuracy": 0.5742187671130523, "num_tokens": 33976204.0, "step": 41 }, { "entropy": 0.5448150634765625, "epoch": 0.4772727272727273, "grad_norm": 59.85712972440978, "learning_rate": 4.990372546322782e-06, "loss": 2.3735, "mean_token_accuracy": 0.5742187671130523, "num_tokens": 34776355.0, "step": 42 }, { "entropy": 0.536407470703125, "epoch": 0.48863636363636365, "grad_norm": 59.778479025488444, "learning_rate": 4.988949126447567e-06, "loss": 2.3306, "mean_token_accuracy": 0.5846354340901598, "num_tokens": 35613147.0, "step": 43 }, { "entropy": 0.539764404296875, "epoch": 0.5, "grad_norm": 60.09274289295303, "learning_rate": 4.987427838983141e-06, "loss": 2.3192, "mean_token_accuracy": 0.5898437672294676, "num_tokens": 36433167.0, "step": 44 }, { "entropy": 0.5478591918945312, "epoch": 0.5113636363636364, "grad_norm": 60.55403859339502, "learning_rate": 4.985808743747817e-06, "loss": 2.3149, "mean_token_accuracy": 0.6705729302484542, "num_tokens": 37236428.0, "step": 45 }, { "entropy": 0.5397415161132812, "epoch": 0.5227272727272727, "grad_norm": 62.35805884322216, "learning_rate": 4.984091904405793e-06, "loss": 2.2891, "mean_token_accuracy": 0.8437500081490725, "num_tokens": 38067267.0, "step": 46 }, { "entropy": 0.5498504638671875, "epoch": 0.5340909090909091, "grad_norm": 60.83879041574092, "learning_rate": 4.9822773884646444e-06, "loss": 2.2473, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 38875935.0, "step": 47 }, { "entropy": 0.5675277709960938, "epoch": 0.5454545454545454, "grad_norm": 61.64315026315899, "learning_rate": 4.980365267272679e-06, "loss": 2.2178, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 39647451.0, "step": 48 }, { "entropy": 0.5543670654296875, "epoch": 0.5568181818181818, "grad_norm": 60.46219588564338, "learning_rate": 4.97835561601612e-06, "loss": 2.2005, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 40443361.0, "step": 49 }, { "entropy": 0.5416412353515625, "epoch": 0.5681818181818182, "grad_norm": 61.28005392495543, "learning_rate": 4.97624851371616e-06, "loss": 2.1746, "mean_token_accuracy": 0.912760421866551, "num_tokens": 41287350.0, "step": 50 }, { "entropy": 0.5378799438476562, "epoch": 0.5795454545454546, "grad_norm": 60.86193896824303, "learning_rate": 4.974044043225846e-06, "loss": 2.1638, "mean_token_accuracy": 0.8971354227978736, "num_tokens": 42124083.0, "step": 51 }, { "entropy": 0.5416336059570312, "epoch": 0.5909090909090909, "grad_norm": 60.167542832450046, "learning_rate": 4.9717422912268265e-06, "loss": 2.1153, "mean_token_accuracy": 0.923177087912336, "num_tokens": 42953412.0, "step": 52 }, { "entropy": 0.5258102416992188, "epoch": 0.6022727272727273, "grad_norm": 60.83496750149086, "learning_rate": 4.969343348225942e-06, "loss": 2.1027, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 43836493.0, "step": 53 }, { "entropy": 0.5307540893554688, "epoch": 0.6136363636363636, "grad_norm": 59.703581210859504, "learning_rate": 4.966847308551664e-06, "loss": 2.0617, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 44713961.0, "step": 54 }, { "entropy": 0.5646438598632812, "epoch": 0.625, "grad_norm": 59.70907545001026, "learning_rate": 4.9642542703503874e-06, "loss": 2.0297, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 45494752.0, "step": 55 }, { "entropy": 0.5508499145507812, "epoch": 0.6363636363636364, "grad_norm": 60.33452301695544, "learning_rate": 4.961564335582572e-06, "loss": 2.0186, "mean_token_accuracy": 0.9101562553551048, "num_tokens": 46313051.0, "step": 56 }, { "entropy": 0.5439376831054688, "epoch": 0.6477272727272727, "grad_norm": 59.417420203908236, "learning_rate": 4.958777610018734e-06, "loss": 1.9711, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 47131891.0, "step": 57 }, { "entropy": 0.535064697265625, "epoch": 0.6590909090909091, "grad_norm": 61.1612600556266, "learning_rate": 4.955894203235285e-06, "loss": 1.987, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 47986676.0, "step": 58 }, { "entropy": 0.5515365600585938, "epoch": 0.6704545454545454, "grad_norm": 59.294618242081526, "learning_rate": 4.952914228610221e-06, "loss": 1.925, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 48787244.0, "step": 59 }, { "entropy": 0.5373687744140625, "epoch": 0.6818181818181818, "grad_norm": 59.999785375745, "learning_rate": 4.949837803318672e-06, "loss": 1.9208, "mean_token_accuracy": 0.9075520888436586, "num_tokens": 49618007.0, "step": 60 }, { "entropy": 0.5391998291015625, "epoch": 0.6931818181818182, "grad_norm": 59.46795359368021, "learning_rate": 4.946665048328288e-06, "loss": 1.9003, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 50454992.0, "step": 61 }, { "entropy": 0.5372238159179688, "epoch": 0.7045454545454546, "grad_norm": 58.901412847118884, "learning_rate": 4.943396088394482e-06, "loss": 1.8635, "mean_token_accuracy": 0.923177087912336, "num_tokens": 51281637.0, "step": 62 }, { "entropy": 0.5271835327148438, "epoch": 0.7159090909090909, "grad_norm": 59.33608755577397, "learning_rate": 4.940031052055532e-06, "loss": 1.8429, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 52144982.0, "step": 63 }, { "entropy": 0.5448532104492188, "epoch": 0.7272727272727273, "grad_norm": 58.68009408640386, "learning_rate": 4.936570071627517e-06, "loss": 1.8119, "mean_token_accuracy": 0.923177087912336, "num_tokens": 52944511.0, "step": 64 }, { "entropy": 0.5371246337890625, "epoch": 0.7386363636363636, "grad_norm": 58.322502737167724, "learning_rate": 4.933013283199124e-06, "loss": 1.7839, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 53800640.0, "step": 65 }, { "entropy": 0.5559005737304688, "epoch": 0.75, "grad_norm": 58.84329551172846, "learning_rate": 4.929360826626286e-06, "loss": 1.7529, "mean_token_accuracy": 0.923177087912336, "num_tokens": 54608145.0, "step": 66 }, { "entropy": 0.5375823974609375, "epoch": 0.7613636363636364, "grad_norm": 58.43819701446535, "learning_rate": 4.925612845526691e-06, "loss": 1.7149, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 55447213.0, "step": 67 }, { "entropy": 0.53643798828125, "epoch": 0.7727272727272727, "grad_norm": 58.71021880546486, "learning_rate": 4.921769487274132e-06, "loss": 1.7003, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 56264710.0, "step": 68 }, { "entropy": 0.5373077392578125, "epoch": 0.7840909090909091, "grad_norm": 59.089983009907726, "learning_rate": 4.917830902992716e-06, "loss": 1.6821, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 57110504.0, "step": 69 }, { "entropy": 0.5417709350585938, "epoch": 0.7954545454545454, "grad_norm": 59.291978039995975, "learning_rate": 4.913797247550912e-06, "loss": 1.6536, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 57947180.0, "step": 70 }, { "entropy": 0.5328216552734375, "epoch": 0.8068181818181818, "grad_norm": 59.78514618831787, "learning_rate": 4.9096686795554725e-06, "loss": 1.6315, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 58807727.0, "step": 71 }, { "entropy": 0.5300979614257812, "epoch": 0.8181818181818182, "grad_norm": 58.70313760166556, "learning_rate": 4.90544536134519e-06, "loss": 1.6079, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 59648751.0, "step": 72 }, { "entropy": 0.5442657470703125, "epoch": 0.8295454545454546, "grad_norm": 58.89538681848992, "learning_rate": 4.901127458984516e-06, "loss": 1.57, "mean_token_accuracy": 0.923177087912336, "num_tokens": 60449685.0, "step": 73 }, { "entropy": 0.5391616821289062, "epoch": 0.8409090909090909, "grad_norm": 58.164567635067016, "learning_rate": 4.8967151422570314e-06, "loss": 1.5271, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 61277350.0, "step": 74 }, { "entropy": 0.5380706787109375, "epoch": 0.8522727272727273, "grad_norm": 58.50553706931795, "learning_rate": 4.89220858465877e-06, "loss": 1.5198, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 62103032.0, "step": 75 }, { "entropy": 0.5351943969726562, "epoch": 0.8636363636363636, "grad_norm": 58.68232205702871, "learning_rate": 4.887607963391394e-06, "loss": 1.4977, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 62933713.0, "step": 76 }, { "entropy": 0.5309371948242188, "epoch": 0.875, "grad_norm": 58.4965215154889, "learning_rate": 4.882913459355233e-06, "loss": 1.4475, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 63777329.0, "step": 77 }, { "entropy": 0.5414886474609375, "epoch": 0.8863636363636364, "grad_norm": 58.945500460294106, "learning_rate": 4.878125257142165e-06, "loss": 1.406, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 64599815.0, "step": 78 }, { "entropy": 0.5440673828125, "epoch": 0.8977272727272727, "grad_norm": 59.64424314623865, "learning_rate": 4.873243545028356e-06, "loss": 1.3944, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 65404581.0, "step": 79 }, { "entropy": 0.5271148681640625, "epoch": 0.9090909090909091, "grad_norm": 58.8986649354466, "learning_rate": 4.868268514966869e-06, "loss": 1.3574, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 66249371.0, "step": 80 }, { "entropy": 0.533050537109375, "epoch": 0.9204545454545454, "grad_norm": 58.86258736268561, "learning_rate": 4.8632003625800995e-06, "loss": 1.3254, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 67065160.0, "step": 81 }, { "entropy": 0.5338363647460938, "epoch": 0.9318181818181818, "grad_norm": 58.54026874474276, "learning_rate": 4.858039287152095e-06, "loss": 1.3278, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 67910596.0, "step": 82 }, { "entropy": 0.54443359375, "epoch": 0.9431818181818182, "grad_norm": 59.28551515148561, "learning_rate": 4.852785491620716e-06, "loss": 1.2784, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 68714945.0, "step": 83 }, { "entropy": 0.5306396484375, "epoch": 0.9545454545454546, "grad_norm": 58.110179443715445, "learning_rate": 4.847439182569656e-06, "loss": 1.2402, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 69553156.0, "step": 84 }, { "entropy": 0.5545120239257812, "epoch": 0.9659090909090909, "grad_norm": 57.89545411687433, "learning_rate": 4.84200057022032e-06, "loss": 1.2466, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 70339835.0, "step": 85 }, { "entropy": 0.5556106567382812, "epoch": 0.9772727272727273, "grad_norm": 57.867886870613994, "learning_rate": 4.836469868423552e-06, "loss": 1.19, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 71112665.0, "step": 86 }, { "entropy": 0.5272903442382812, "epoch": 0.9886363636363636, "grad_norm": 57.47960696717133, "learning_rate": 4.830847294651236e-06, "loss": 1.1546, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 71973001.0, "step": 87 }, { "entropy": 0.5232162475585938, "epoch": 1.0, "grad_norm": 57.363764683028826, "learning_rate": 4.825133069987737e-06, "loss": 1.128, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 72849046.0, "step": 88 }, { "entropy": 0.5514373779296875, "epoch": 1.0113636363636365, "grad_norm": 58.53909714804268, "learning_rate": 4.819327419121215e-06, "loss": 1.1251, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 73657560.0, "step": 89 }, { "entropy": 0.5366439819335938, "epoch": 1.0227272727272727, "grad_norm": 57.5800616711963, "learning_rate": 4.81343057033478e-06, "loss": 1.0801, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 74476899.0, "step": 90 }, { "entropy": 0.5482177734375, "epoch": 1.0340909090909092, "grad_norm": 56.91254331845363, "learning_rate": 4.8074427554975235e-06, "loss": 1.0439, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 75271183.0, "step": 91 }, { "entropy": 0.5299148559570312, "epoch": 1.0454545454545454, "grad_norm": 56.66420629218523, "learning_rate": 4.8013642100554034e-06, "loss": 1.0211, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 76125797.0, "step": 92 }, { "entropy": 0.5338363647460938, "epoch": 1.0568181818181819, "grad_norm": 57.11090670537613, "learning_rate": 4.795195173021976e-06, "loss": 0.9976, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 76945222.0, "step": 93 }, { "entropy": 0.5279388427734375, "epoch": 1.0681818181818181, "grad_norm": 56.299495025749735, "learning_rate": 4.7889358869690065e-06, "loss": 0.9768, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 77779661.0, "step": 94 }, { "entropy": 0.535430908203125, "epoch": 1.0795454545454546, "grad_norm": 56.34968738969898, "learning_rate": 4.782586598016928e-06, "loss": 0.9661, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 78597732.0, "step": 95 }, { "entropy": 0.5536880493164062, "epoch": 1.0909090909090908, "grad_norm": 55.85444993026245, "learning_rate": 4.776147555825164e-06, "loss": 0.9308, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 79368044.0, "step": 96 }, { "entropy": 0.53662109375, "epoch": 1.1022727272727273, "grad_norm": 55.50053901427309, "learning_rate": 4.769619013582309e-06, "loss": 0.8934, "mean_token_accuracy": 0.955729169305414, "num_tokens": 80185996.0, "step": 97 }, { "entropy": 0.5202407836914062, "epoch": 1.1136363636363635, "grad_norm": 55.28328962448184, "learning_rate": 4.7630012279961805e-06, "loss": 0.886, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 81053015.0, "step": 98 }, { "entropy": 0.5313262939453125, "epoch": 1.125, "grad_norm": 54.8320410044771, "learning_rate": 4.7562944592837145e-06, "loss": 0.8601, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 81874229.0, "step": 99 }, { "entropy": 0.52691650390625, "epoch": 1.1363636363636362, "grad_norm": 54.964844470984715, "learning_rate": 4.749498971160742e-06, "loss": 0.8504, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 82712812.0, "step": 100 }, { "entropy": 0.524139404296875, "epoch": 1.1477272727272727, "grad_norm": 54.2272069397531, "learning_rate": 4.742615030831615e-06, "loss": 0.8163, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 83530069.0, "step": 101 }, { "entropy": 0.5243072509765625, "epoch": 1.1590909090909092, "grad_norm": 53.79974981157999, "learning_rate": 4.735642908978704e-06, "loss": 0.7875, "mean_token_accuracy": 0.945312503259629, "num_tokens": 84350790.0, "step": 102 }, { "entropy": 0.5246505737304688, "epoch": 1.1704545454545454, "grad_norm": 53.51593166643302, "learning_rate": 4.728582879751746e-06, "loss": 0.7576, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 85161819.0, "step": 103 }, { "entropy": 0.509765625, "epoch": 1.1818181818181819, "grad_norm": 53.138333179917204, "learning_rate": 4.721435220757078e-06, "loss": 0.7282, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 86006089.0, "step": 104 }, { "entropy": 0.527130126953125, "epoch": 1.1931818181818181, "grad_norm": 52.99230989719208, "learning_rate": 4.714200213046707e-06, "loss": 0.741, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 86836190.0, "step": 105 }, { "entropy": 0.5215072631835938, "epoch": 1.2045454545454546, "grad_norm": 52.31920733776493, "learning_rate": 4.706878141107269e-06, "loss": 0.7269, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 87654111.0, "step": 106 }, { "entropy": 0.5284652709960938, "epoch": 1.2159090909090908, "grad_norm": 51.7538829709675, "learning_rate": 4.699469292848839e-06, "loss": 0.6748, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 88442734.0, "step": 107 }, { "entropy": 0.5117950439453125, "epoch": 1.2272727272727273, "grad_norm": 50.822310194810335, "learning_rate": 4.691973959593609e-06, "loss": 0.6566, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 89296093.0, "step": 108 }, { "entropy": 0.5231399536132812, "epoch": 1.2386363636363638, "grad_norm": 50.17015940796914, "learning_rate": 4.6843924360644385e-06, "loss": 0.6354, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 90133661.0, "step": 109 }, { "entropy": 0.5147552490234375, "epoch": 1.25, "grad_norm": 48.95194138585349, "learning_rate": 4.676725020373255e-06, "loss": 0.6273, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 90980602.0, "step": 110 }, { "entropy": 0.52349853515625, "epoch": 1.2613636363636362, "grad_norm": 47.39724961748752, "learning_rate": 4.6689720140093445e-06, "loss": 0.5975, "mean_token_accuracy": 0.945312503259629, "num_tokens": 91807048.0, "step": 111 }, { "entropy": 0.521514892578125, "epoch": 1.2727272727272727, "grad_norm": 46.42644291070126, "learning_rate": 4.661133721827487e-06, "loss": 0.5747, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 92647140.0, "step": 112 }, { "entropy": 0.5202713012695312, "epoch": 1.2840909090909092, "grad_norm": 46.2730968549775, "learning_rate": 4.653210452035974e-06, "loss": 0.5663, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 93498584.0, "step": 113 }, { "entropy": 0.52655029296875, "epoch": 1.2954545454545454, "grad_norm": 46.71747285833351, "learning_rate": 4.645202516184492e-06, "loss": 0.5568, "mean_token_accuracy": 0.945312503259629, "num_tokens": 94324355.0, "step": 114 }, { "entropy": 0.5159225463867188, "epoch": 1.3068181818181819, "grad_norm": 45.84890607800894, "learning_rate": 4.6371102291518635e-06, "loss": 0.5298, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 95183202.0, "step": 115 }, { "entropy": 0.5389862060546875, "epoch": 1.3181818181818181, "grad_norm": 42.59073288918198, "learning_rate": 4.628933909133674e-06, "loss": 0.5097, "mean_token_accuracy": 0.945312503259629, "num_tokens": 95998136.0, "step": 116 }, { "entropy": 0.529327392578125, "epoch": 1.3295454545454546, "grad_norm": 41.670866693165614, "learning_rate": 4.620673877629757e-06, "loss": 0.4863, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 96842943.0, "step": 117 }, { "entropy": 0.5238037109375, "epoch": 1.3409090909090908, "grad_norm": 40.26261711766899, "learning_rate": 4.612330459431552e-06, "loss": 0.4633, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 97710263.0, "step": 118 }, { "entropy": 0.5423736572265625, "epoch": 1.3522727272727273, "grad_norm": 42.256122892341864, "learning_rate": 4.603903982609334e-06, "loss": 0.4653, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 98497301.0, "step": 119 }, { "entropy": 0.5260009765625, "epoch": 1.3636363636363638, "grad_norm": 43.312904169159644, "learning_rate": 4.595394778499314e-06, "loss": 0.5063, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 99332885.0, "step": 120 }, { "entropy": 0.5403060913085938, "epoch": 1.375, "grad_norm": 35.515421668024835, "learning_rate": 4.586803181690609e-06, "loss": 0.4049, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 100153118.0, "step": 121 }, { "entropy": 0.530426025390625, "epoch": 1.3863636363636362, "grad_norm": 39.33381625799498, "learning_rate": 4.5781295300120885e-06, "loss": 0.4432, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 101017186.0, "step": 122 }, { "entropy": 0.54278564453125, "epoch": 1.3977272727272727, "grad_norm": 33.567114056620284, "learning_rate": 4.569374164519088e-06, "loss": 0.3836, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 101837538.0, "step": 123 }, { "entropy": 0.5357742309570312, "epoch": 1.4090909090909092, "grad_norm": 34.647421460830614, "learning_rate": 4.560537429479998e-06, "loss": 0.4015, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 102674523.0, "step": 124 }, { "entropy": 0.5452957153320312, "epoch": 1.4204545454545454, "grad_norm": 31.848760000487612, "learning_rate": 4.5516196723627325e-06, "loss": 0.3631, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 103498410.0, "step": 125 }, { "entropy": 0.5371170043945312, "epoch": 1.4318181818181819, "grad_norm": 32.82915500567515, "learning_rate": 4.542621243821058e-06, "loss": 0.3459, "mean_token_accuracy": 0.945312503259629, "num_tokens": 104317389.0, "step": 126 }, { "entropy": 0.5457000732421875, "epoch": 1.4431818181818181, "grad_norm": 30.802170293048523, "learning_rate": 4.533542497680811e-06, "loss": 0.3474, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 105130635.0, "step": 127 }, { "entropy": 0.5292892456054688, "epoch": 1.4545454545454546, "grad_norm": 28.598798267034123, "learning_rate": 4.524383790925987e-06, "loss": 0.2939, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 105967667.0, "step": 128 }, { "entropy": 0.541412353515625, "epoch": 1.4659090909090908, "grad_norm": 31.427843756705663, "learning_rate": 4.515145483684696e-06, "loss": 0.3418, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 106764890.0, "step": 129 }, { "entropy": 0.540252685546875, "epoch": 1.4772727272727273, "grad_norm": 26.796560576120022, "learning_rate": 4.505827939215009e-06, "loss": 0.2719, "mean_token_accuracy": 0.967447918606922, "num_tokens": 107567415.0, "step": 130 }, { "entropy": 0.5262680053710938, "epoch": 1.4886363636363638, "grad_norm": 28.763274799154782, "learning_rate": 4.496431523890673e-06, "loss": 0.3127, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 108411240.0, "step": 131 }, { "entropy": 0.5342483520507812, "epoch": 1.5, "grad_norm": 25.48018544383524, "learning_rate": 4.486956607186702e-06, "loss": 0.2803, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 109210428.0, "step": 132 }, { "entropy": 0.5379180908203125, "epoch": 1.5113636363636362, "grad_norm": 24.901297620374315, "learning_rate": 4.477403561664852e-06, "loss": 0.2872, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 110045310.0, "step": 133 }, { "entropy": 0.5262680053710938, "epoch": 1.5227272727272727, "grad_norm": 22.091806427948622, "learning_rate": 4.467772762958968e-06, "loss": 0.2496, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 110882589.0, "step": 134 }, { "entropy": 0.5295639038085938, "epoch": 1.5340909090909092, "grad_norm": 23.292778405119115, "learning_rate": 4.458064589760221e-06, "loss": 0.2408, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 111705631.0, "step": 135 }, { "entropy": 0.5286483764648438, "epoch": 1.5454545454545454, "grad_norm": 21.213889535771415, "learning_rate": 4.448279423802207e-06, "loss": 0.2283, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 112539225.0, "step": 136 }, { "entropy": 0.5333480834960938, "epoch": 1.5568181818181817, "grad_norm": 19.684812094647675, "learning_rate": 4.438417649845946e-06, "loss": 0.2291, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 113362874.0, "step": 137 }, { "entropy": 0.5320663452148438, "epoch": 1.5681818181818183, "grad_norm": 18.51754571632508, "learning_rate": 4.428479655664748e-06, "loss": 0.1981, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 114182515.0, "step": 138 }, { "entropy": 0.5383987426757812, "epoch": 1.5795454545454546, "grad_norm": 17.723585060157205, "learning_rate": 4.4184658320289675e-06, "loss": 0.2078, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 115003365.0, "step": 139 }, { "entropy": 0.5347518920898438, "epoch": 1.5909090909090908, "grad_norm": 16.78330133242101, "learning_rate": 4.408376572690638e-06, "loss": 0.2172, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 115805016.0, "step": 140 }, { "entropy": 0.5168228149414062, "epoch": 1.6022727272727273, "grad_norm": 15.362502774932778, "learning_rate": 4.3982122743679875e-06, "loss": 0.1947, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 116648818.0, "step": 141 }, { "entropy": 0.5343856811523438, "epoch": 1.6136363636363638, "grad_norm": 18.024974195199757, "learning_rate": 4.387973336729841e-06, "loss": 0.2382, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 117458175.0, "step": 142 }, { "entropy": 0.521270751953125, "epoch": 1.625, "grad_norm": 14.018027983518943, "learning_rate": 4.377660162379904e-06, "loss": 0.1929, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 118312779.0, "step": 143 }, { "entropy": 0.5243148803710938, "epoch": 1.6363636363636362, "grad_norm": 14.99949121416359, "learning_rate": 4.3672731568409344e-06, "loss": 0.1898, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 119140778.0, "step": 144 }, { "entropy": 0.5383453369140625, "epoch": 1.6477272727272727, "grad_norm": 13.695194386624797, "learning_rate": 4.3568127285387925e-06, "loss": 0.1841, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 119957153.0, "step": 145 }, { "entropy": 0.5297164916992188, "epoch": 1.6590909090909092, "grad_norm": 12.975994099921467, "learning_rate": 4.346279288786387e-06, "loss": 0.1681, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 120773292.0, "step": 146 }, { "entropy": 0.5347518920898438, "epoch": 1.6704545454545454, "grad_norm": 21.362591160453682, "learning_rate": 4.3356732517674935e-06, "loss": 0.2171, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 121574882.0, "step": 147 }, { "entropy": 0.5269546508789062, "epoch": 1.6818181818181817, "grad_norm": 19.616475421681326, "learning_rate": 4.32499503452048e-06, "loss": 0.1828, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 122412333.0, "step": 148 }, { "entropy": 0.5350723266601562, "epoch": 1.6931818181818183, "grad_norm": 10.633055149272264, "learning_rate": 4.314245056921899e-06, "loss": 0.1523, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 123241270.0, "step": 149 }, { "entropy": 0.5313796997070312, "epoch": 1.7045454545454546, "grad_norm": 11.24894351109355, "learning_rate": 4.303423741669978e-06, "loss": 0.1697, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 124075099.0, "step": 150 }, { "entropy": 0.536651611328125, "epoch": 1.7159090909090908, "grad_norm": 15.692069667523771, "learning_rate": 4.292531514268008e-06, "loss": 0.182, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 124928980.0, "step": 151 }, { "entropy": 0.5392379760742188, "epoch": 1.7272727272727273, "grad_norm": 9.664745819966296, "learning_rate": 4.281568803007601e-06, "loss": 0.1791, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 125748569.0, "step": 152 }, { "entropy": 0.54058837890625, "epoch": 1.7386363636363638, "grad_norm": 15.28963396758985, "learning_rate": 4.270536038951855e-06, "loss": 0.1828, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 126583565.0, "step": 153 }, { "entropy": 0.5325546264648438, "epoch": 1.75, "grad_norm": 8.152421877410646, "learning_rate": 4.259433655918404e-06, "loss": 0.1505, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 127434652.0, "step": 154 }, { "entropy": 0.5367050170898438, "epoch": 1.7613636363636362, "grad_norm": 13.314659677750605, "learning_rate": 4.24826209046236e-06, "loss": 0.1763, "mean_token_accuracy": 0.945312503259629, "num_tokens": 128265274.0, "step": 155 }, { "entropy": 0.5238265991210938, "epoch": 1.7727272727272727, "grad_norm": 8.732379857922384, "learning_rate": 4.237021781859143e-06, "loss": 0.1685, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 129118921.0, "step": 156 }, { "entropy": 0.5306396484375, "epoch": 1.7840909090909092, "grad_norm": 13.453938351329127, "learning_rate": 4.225713172087216e-06, "loss": 0.1672, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 129924616.0, "step": 157 }, { "entropy": 0.5287246704101562, "epoch": 1.7954545454545454, "grad_norm": 8.201845038506509, "learning_rate": 4.2143367058107e-06, "loss": 0.1545, "mean_token_accuracy": 0.955729169305414, "num_tokens": 130759784.0, "step": 158 }, { "entropy": 0.5428314208984375, "epoch": 1.8068181818181817, "grad_norm": 10.976889686141423, "learning_rate": 4.202892830361892e-06, "loss": 0.1581, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 131523961.0, "step": 159 }, { "entropy": 0.5345458984375, "epoch": 1.8181818181818183, "grad_norm": 8.166044940510316, "learning_rate": 4.191381995723672e-06, "loss": 0.1324, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 132329209.0, "step": 160 }, { "entropy": 0.5333099365234375, "epoch": 1.8295454545454546, "grad_norm": 12.143258833540356, "learning_rate": 4.179804654511816e-06, "loss": 0.1465, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 133146370.0, "step": 161 }, { "entropy": 0.5122833251953125, "epoch": 1.8409090909090908, "grad_norm": 6.838641869872832, "learning_rate": 4.168161261957192e-06, "loss": 0.1375, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 133988967.0, "step": 162 }, { "entropy": 0.5211181640625, "epoch": 1.8522727272727273, "grad_norm": 15.025449078819449, "learning_rate": 4.1564522758878656e-06, "loss": 0.1562, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 134816536.0, "step": 163 }, { "entropy": 0.5217361450195312, "epoch": 1.8636363636363638, "grad_norm": 7.58342476695868, "learning_rate": 4.144678156711091e-06, "loss": 0.1333, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 135660302.0, "step": 164 }, { "entropy": 0.5208892822265625, "epoch": 1.875, "grad_norm": 12.39285081760315, "learning_rate": 4.132839367395215e-06, "loss": 0.144, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 136493782.0, "step": 165 }, { "entropy": 0.533935546875, "epoch": 1.8863636363636362, "grad_norm": 11.441864197889792, "learning_rate": 4.120936373451467e-06, "loss": 0.1625, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 137296430.0, "step": 166 }, { "entropy": 0.5355453491210938, "epoch": 1.8977272727272727, "grad_norm": 6.843215535272569, "learning_rate": 4.108969642915658e-06, "loss": 0.1353, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 138111216.0, "step": 167 }, { "entropy": 0.5361480712890625, "epoch": 1.9090909090909092, "grad_norm": 8.869401349710477, "learning_rate": 4.096939646329775e-06, "loss": 0.1442, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 138921744.0, "step": 168 }, { "entropy": 0.5290679931640625, "epoch": 1.9204545454545454, "grad_norm": 4.78181815999074, "learning_rate": 4.08484685672348e-06, "loss": 0.1167, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 139736919.0, "step": 169 }, { "entropy": 0.5113754272460938, "epoch": 1.9318181818181817, "grad_norm": 8.861363241151182, "learning_rate": 4.07269174959551e-06, "loss": 0.1207, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 140617027.0, "step": 170 }, { "entropy": 0.5159988403320312, "epoch": 1.9431818181818183, "grad_norm": 8.006569945370714, "learning_rate": 4.06047480289498e-06, "loss": 0.1212, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 141443086.0, "step": 171 }, { "entropy": 0.51470947265625, "epoch": 1.9545454545454546, "grad_norm": 10.431183762698996, "learning_rate": 4.0481964970025885e-06, "loss": 0.1442, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 142285456.0, "step": 172 }, { "entropy": 0.5068130493164062, "epoch": 1.9659090909090908, "grad_norm": 8.414148072333015, "learning_rate": 4.035857314711729e-06, "loss": 0.1396, "mean_token_accuracy": 0.945312503259629, "num_tokens": 143142101.0, "step": 173 }, { "entropy": 0.5181655883789062, "epoch": 1.9772727272727273, "grad_norm": 7.281260662915632, "learning_rate": 4.023457741209509e-06, "loss": 0.1226, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 143972519.0, "step": 174 }, { "entropy": 0.52288818359375, "epoch": 1.9886363636363638, "grad_norm": 4.149697190907548, "learning_rate": 4.0109982640576676e-06, "loss": 0.1123, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 144814806.0, "step": 175 }, { "entropy": 0.517059326171875, "epoch": 2.0, "grad_norm": 7.882059155042237, "learning_rate": 3.998479373173406e-06, "loss": 0.1111, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 145648585.0, "step": 176 }, { "entropy": 0.5061798095703125, "epoch": 2.0113636363636362, "grad_norm": 3.755333968867442, "learning_rate": 3.985901560810126e-06, "loss": 0.0993, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 146516717.0, "step": 177 }, { "entropy": 0.509613037109375, "epoch": 2.022727272727273, "grad_norm": 7.578047639747768, "learning_rate": 3.973265321538069e-06, "loss": 0.1273, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 147377138.0, "step": 178 }, { "entropy": 0.526214599609375, "epoch": 2.034090909090909, "grad_norm": 9.248690622373502, "learning_rate": 3.960571152224872e-06, "loss": 0.0844, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 148164451.0, "step": 179 }, { "entropy": 0.51904296875, "epoch": 2.0454545454545454, "grad_norm": 4.995096147331217, "learning_rate": 3.9478195520160355e-06, "loss": 0.0826, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 148952463.0, "step": 180 }, { "entropy": 0.5115966796875, "epoch": 2.0568181818181817, "grad_norm": 9.976065288968513, "learning_rate": 3.935011022315284e-06, "loss": 0.1184, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 149789754.0, "step": 181 }, { "entropy": 0.5167007446289062, "epoch": 2.0681818181818183, "grad_norm": 6.753980258357957, "learning_rate": 3.922146066764863e-06, "loss": 0.1101, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 150576600.0, "step": 182 }, { "entropy": 0.5221023559570312, "epoch": 2.0795454545454546, "grad_norm": 4.233617937880569, "learning_rate": 3.9092251912257286e-06, "loss": 0.073, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 151392147.0, "step": 183 }, { "entropy": 0.5233230590820312, "epoch": 2.090909090909091, "grad_norm": 6.412674458392547, "learning_rate": 3.896248903757658e-06, "loss": 0.0898, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 152228107.0, "step": 184 }, { "entropy": 0.53240966796875, "epoch": 2.102272727272727, "grad_norm": 6.31471950629154, "learning_rate": 3.883217714599273e-06, "loss": 0.1037, "mean_token_accuracy": 0.967447918606922, "num_tokens": 153053726.0, "step": 185 }, { "entropy": 0.5299835205078125, "epoch": 2.1136363636363638, "grad_norm": 5.524811254106886, "learning_rate": 3.870132136147977e-06, "loss": 0.0859, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 153905918.0, "step": 186 }, { "entropy": 0.5373382568359375, "epoch": 2.125, "grad_norm": 5.959063680482344, "learning_rate": 3.856992682939803e-06, "loss": 0.0936, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 154726918.0, "step": 187 }, { "entropy": 0.5167999267578125, "epoch": 2.1363636363636362, "grad_norm": 4.047026610074852, "learning_rate": 3.84379987162919e-06, "loss": 0.088, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 155571272.0, "step": 188 }, { "entropy": 0.5206451416015625, "epoch": 2.147727272727273, "grad_norm": 14.950526464718012, "learning_rate": 3.830554220968661e-06, "loss": 0.117, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 156393962.0, "step": 189 }, { "entropy": 0.5257492065429688, "epoch": 2.159090909090909, "grad_norm": 6.350323940638027, "learning_rate": 3.817256251788425e-06, "loss": 0.0684, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 157224989.0, "step": 190 }, { "entropy": 0.5198822021484375, "epoch": 2.1704545454545454, "grad_norm": 17.607176029933502, "learning_rate": 3.803906486975901e-06, "loss": 0.1604, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 158088785.0, "step": 191 }, { "entropy": 0.5287246704101562, "epoch": 2.1818181818181817, "grad_norm": 17.168183781109484, "learning_rate": 3.790505451455158e-06, "loss": 0.1742, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 158924638.0, "step": 192 }, { "entropy": 0.5184326171875, "epoch": 2.1931818181818183, "grad_norm": 6.269834392958401, "learning_rate": 3.77705367216627e-06, "loss": 0.0732, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 159745234.0, "step": 193 }, { "entropy": 0.5220794677734375, "epoch": 2.2045454545454546, "grad_norm": 15.721070801554754, "learning_rate": 3.7635516780446e-06, "loss": 0.1183, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 160611881.0, "step": 194 }, { "entropy": 0.53619384765625, "epoch": 2.215909090909091, "grad_norm": 18.33813662745301, "learning_rate": 3.7500000000000005e-06, "loss": 0.1767, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 161426112.0, "step": 195 }, { "entropy": 0.5299606323242188, "epoch": 2.227272727272727, "grad_norm": 11.189196081242297, "learning_rate": 3.7363991708959386e-06, "loss": 0.1316, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 162252165.0, "step": 196 }, { "entropy": 0.5212249755859375, "epoch": 2.2386363636363638, "grad_norm": 3.775620350441528, "learning_rate": 3.7227497255285416e-06, "loss": 0.1001, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 163113356.0, "step": 197 }, { "entropy": 0.5298538208007812, "epoch": 2.25, "grad_norm": 10.56752963682954, "learning_rate": 3.709052200605572e-06, "loss": 0.1411, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 163956631.0, "step": 198 }, { "entropy": 0.5439224243164062, "epoch": 2.2613636363636362, "grad_norm": 9.338301436041878, "learning_rate": 3.6953071347253167e-06, "loss": 0.1117, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 164735604.0, "step": 199 }, { "entropy": 0.530059814453125, "epoch": 2.2727272727272725, "grad_norm": 2.888565812031598, "learning_rate": 3.6815150683554187e-06, "loss": 0.0845, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 165562321.0, "step": 200 }, { "entropy": 0.5339889526367188, "epoch": 2.284090909090909, "grad_norm": 6.631223942318063, "learning_rate": 3.6676765438116157e-06, "loss": 0.1074, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 166385295.0, "step": 201 }, { "entropy": 0.5253219604492188, "epoch": 2.2954545454545454, "grad_norm": 14.061780677797357, "learning_rate": 3.6537921052364223e-06, "loss": 0.1289, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 167193522.0, "step": 202 }, { "entropy": 0.532470703125, "epoch": 2.3068181818181817, "grad_norm": 6.341361614121059, "learning_rate": 3.6398622985777314e-06, "loss": 0.0977, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 168023123.0, "step": 203 }, { "entropy": 0.5181808471679688, "epoch": 2.3181818181818183, "grad_norm": 7.085053762621007, "learning_rate": 3.6258876715673475e-06, "loss": 0.1024, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 168884198.0, "step": 204 }, { "entropy": 0.5181198120117188, "epoch": 2.3295454545454546, "grad_norm": 6.0840959915959445, "learning_rate": 3.611868773699449e-06, "loss": 0.0818, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 169721369.0, "step": 205 }, { "entropy": 0.525115966796875, "epoch": 2.340909090909091, "grad_norm": 3.8765193636226587, "learning_rate": 3.597806156208982e-06, "loss": 0.0713, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 170586563.0, "step": 206 }, { "entropy": 0.5304183959960938, "epoch": 2.3522727272727275, "grad_norm": 8.283485896461336, "learning_rate": 3.5837003720499853e-06, "loss": 0.0828, "mean_token_accuracy": 0.967447918606922, "num_tokens": 171389861.0, "step": 207 }, { "entropy": 0.521881103515625, "epoch": 2.3636363636363638, "grad_norm": 8.549193433703433, "learning_rate": 3.569551975873847e-06, "loss": 0.0994, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 172247665.0, "step": 208 }, { "entropy": 0.52642822265625, "epoch": 2.375, "grad_norm": 4.731480811392632, "learning_rate": 3.555361524007498e-06, "loss": 0.0764, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 173082355.0, "step": 209 }, { "entropy": 0.5319061279296875, "epoch": 2.3863636363636362, "grad_norm": 4.2015486425538855, "learning_rate": 3.541129574431532e-06, "loss": 0.0615, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 173912416.0, "step": 210 }, { "entropy": 0.51190185546875, "epoch": 2.3977272727272725, "grad_norm": 2.509367450469538, "learning_rate": 3.526856686758269e-06, "loss": 0.0456, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 174774730.0, "step": 211 }, { "entropy": 0.5276336669921875, "epoch": 2.409090909090909, "grad_norm": 3.6294914532585163, "learning_rate": 3.51254342220975e-06, "loss": 0.0551, "mean_token_accuracy": 0.977864584652707, "num_tokens": 175600690.0, "step": 212 }, { "entropy": 0.51702880859375, "epoch": 2.4204545454545454, "grad_norm": 6.099017771755878, "learning_rate": 3.4981903435956675e-06, "loss": 0.0561, "mean_token_accuracy": 0.977864584652707, "num_tokens": 176433236.0, "step": 213 }, { "entropy": 0.5221633911132812, "epoch": 2.4318181818181817, "grad_norm": 5.444792509114168, "learning_rate": 3.4837980152912393e-06, "loss": 0.0638, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 177252798.0, "step": 214 }, { "entropy": 0.51513671875, "epoch": 2.4431818181818183, "grad_norm": 6.166295424467747, "learning_rate": 3.4693670032150117e-06, "loss": 0.0598, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 178087586.0, "step": 215 }, { "entropy": 0.5172805786132812, "epoch": 2.4545454545454546, "grad_norm": 7.216293257274722, "learning_rate": 3.4548978748066115e-06, "loss": 0.059, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 178904966.0, "step": 216 }, { "entropy": 0.5154190063476562, "epoch": 2.465909090909091, "grad_norm": 5.774270384427778, "learning_rate": 3.440391199004431e-06, "loss": 0.0573, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 179732100.0, "step": 217 }, { "entropy": 0.5367050170898438, "epoch": 2.4772727272727275, "grad_norm": 5.71656956287317, "learning_rate": 3.4258475462232586e-06, "loss": 0.0531, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 180499306.0, "step": 218 }, { "entropy": 0.5283737182617188, "epoch": 2.4886363636363638, "grad_norm": 5.454253375411191, "learning_rate": 3.4112674883318477e-06, "loss": 0.0508, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 181302762.0, "step": 219 }, { "entropy": 0.5136947631835938, "epoch": 2.5, "grad_norm": 4.865211217099932, "learning_rate": 3.3966515986304322e-06, "loss": 0.0641, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 182132901.0, "step": 220 }, { "entropy": 0.5163040161132812, "epoch": 2.5113636363636362, "grad_norm": 7.9460874989938075, "learning_rate": 3.3820004518281835e-06, "loss": 0.0641, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 182956676.0, "step": 221 }, { "entropy": 0.5112228393554688, "epoch": 2.5227272727272725, "grad_norm": 3.996587612081756, "learning_rate": 3.367314624020613e-06, "loss": 0.0414, "mean_token_accuracy": 0.989583333954215, "num_tokens": 183817292.0, "step": 222 }, { "entropy": 0.5117111206054688, "epoch": 2.534090909090909, "grad_norm": 9.534113890080645, "learning_rate": 3.352594692666915e-06, "loss": 0.0903, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 184637423.0, "step": 223 }, { "entropy": 0.5112152099609375, "epoch": 2.5454545454545454, "grad_norm": 10.168817989484715, "learning_rate": 3.337841236567268e-06, "loss": 0.0771, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 185469944.0, "step": 224 }, { "entropy": 0.5121002197265625, "epoch": 2.5568181818181817, "grad_norm": 3.9431029399036825, "learning_rate": 3.32305483584007e-06, "loss": 0.0611, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 186292612.0, "step": 225 }, { "entropy": 0.515106201171875, "epoch": 2.5681818181818183, "grad_norm": 4.814525321844357, "learning_rate": 3.30823607189913e-06, "loss": 0.064, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 187101708.0, "step": 226 }, { "entropy": 0.5209121704101562, "epoch": 2.5795454545454546, "grad_norm": 4.496547790724185, "learning_rate": 3.2933855274308067e-06, "loss": 0.0629, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 187907131.0, "step": 227 }, { "entropy": 0.5145339965820312, "epoch": 2.590909090909091, "grad_norm": 4.755114257664067, "learning_rate": 3.278503786371095e-06, "loss": 0.044, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 188726193.0, "step": 228 }, { "entropy": 0.5033798217773438, "epoch": 2.6022727272727275, "grad_norm": 7.053025964916729, "learning_rate": 3.2635914338826665e-06, "loss": 0.0641, "mean_token_accuracy": 0.977864584652707, "num_tokens": 189574751.0, "step": 229 }, { "entropy": 0.5178298950195312, "epoch": 2.6136363636363638, "grad_norm": 2.732036872044337, "learning_rate": 3.2486490563318605e-06, "loss": 0.0449, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 190392439.0, "step": 230 }, { "entropy": 0.5087661743164062, "epoch": 2.625, "grad_norm": 3.7713000538715247, "learning_rate": 3.233677241265627e-06, "loss": 0.0564, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 191230801.0, "step": 231 }, { "entropy": 0.5154266357421875, "epoch": 2.6363636363636362, "grad_norm": 3.296081610191933, "learning_rate": 3.218676577388424e-06, "loss": 0.0474, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 192055235.0, "step": 232 }, { "entropy": 0.5103530883789062, "epoch": 2.6477272727272725, "grad_norm": 3.5282268907437073, "learning_rate": 3.2036476545390695e-06, "loss": 0.0377, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 192908727.0, "step": 233 }, { "entropy": 0.521148681640625, "epoch": 2.659090909090909, "grad_norm": 6.036365365275266, "learning_rate": 3.188591063667548e-06, "loss": 0.0493, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 193727407.0, "step": 234 }, { "entropy": 0.511871337890625, "epoch": 2.6704545454545454, "grad_norm": 5.617886246430549, "learning_rate": 3.1735073968117743e-06, "loss": 0.0452, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 194567749.0, "step": 235 }, { "entropy": 0.5137405395507812, "epoch": 2.6818181818181817, "grad_norm": 5.613230841959403, "learning_rate": 3.1583972470743123e-06, "loss": 0.039, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 195403493.0, "step": 236 }, { "entropy": 0.5176010131835938, "epoch": 2.6931818181818183, "grad_norm": 4.811211184785151, "learning_rate": 3.1432612085990576e-06, "loss": 0.0585, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 196225816.0, "step": 237 }, { "entropy": 0.517425537109375, "epoch": 2.7045454545454546, "grad_norm": 6.026414758455728, "learning_rate": 3.1280998765478725e-06, "loss": 0.0449, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 197066021.0, "step": 238 }, { "entropy": 0.519012451171875, "epoch": 2.715909090909091, "grad_norm": 5.323494774872785, "learning_rate": 3.1129138470771823e-06, "loss": 0.0466, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 197881192.0, "step": 239 }, { "entropy": 0.5157241821289062, "epoch": 2.7272727272727275, "grad_norm": 3.7572493546476897, "learning_rate": 3.0977037173145387e-06, "loss": 0.0351, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 198696942.0, "step": 240 }, { "entropy": 0.5295562744140625, "epoch": 2.7386363636363638, "grad_norm": 4.351563113236933, "learning_rate": 3.082470085335133e-06, "loss": 0.0379, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 199506760.0, "step": 241 }, { "entropy": 0.5326080322265625, "epoch": 2.75, "grad_norm": 4.3434728185279114, "learning_rate": 3.0672135501382894e-06, "loss": 0.0426, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 200314554.0, "step": 242 }, { "entropy": 0.5197906494140625, "epoch": 2.7613636363636362, "grad_norm": 4.23758686798832, "learning_rate": 3.0519347116239e-06, "loss": 0.0496, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 201160682.0, "step": 243 }, { "entropy": 0.5222244262695312, "epoch": 2.7727272727272725, "grad_norm": 3.3925018109786977, "learning_rate": 3.036634170568847e-06, "loss": 0.0397, "mean_token_accuracy": 0.989583333954215, "num_tokens": 201970697.0, "step": 244 }, { "entropy": 0.5278549194335938, "epoch": 2.784090909090909, "grad_norm": 6.75368386794985, "learning_rate": 3.021312528603371e-06, "loss": 0.0616, "mean_token_accuracy": 0.977864584652707, "num_tokens": 202775841.0, "step": 245 }, { "entropy": 0.5267410278320312, "epoch": 2.7954545454545454, "grad_norm": 3.0938038819070037, "learning_rate": 3.0059703881874232e-06, "loss": 0.0357, "mean_token_accuracy": 0.989583333954215, "num_tokens": 203595456.0, "step": 246 }, { "entropy": 0.518524169921875, "epoch": 2.8068181818181817, "grad_norm": 3.220031225997158, "learning_rate": 2.990608352586965e-06, "loss": 0.0279, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 204424629.0, "step": 247 }, { "entropy": 0.5166702270507812, "epoch": 2.8181818181818183, "grad_norm": 2.5096440894100414, "learning_rate": 2.9752270258502593e-06, "loss": 0.0275, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 205278442.0, "step": 248 }, { "entropy": 0.5141220092773438, "epoch": 2.8295454545454546, "grad_norm": 3.0289759848019924, "learning_rate": 2.959827012784108e-06, "loss": 0.0258, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 206118798.0, "step": 249 }, { "entropy": 0.5139236450195312, "epoch": 2.840909090909091, "grad_norm": 5.597031296469579, "learning_rate": 2.9444089189300783e-06, "loss": 0.0384, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 206937788.0, "step": 250 }, { "entropy": 0.513916015625, "epoch": 2.8522727272727275, "grad_norm": 6.324459117331001, "learning_rate": 2.92897335054069e-06, "loss": 0.0346, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 207777954.0, "step": 251 }, { "entropy": 0.5144729614257812, "epoch": 2.8636363636363638, "grad_norm": 10.825936923114694, "learning_rate": 2.913520914555572e-06, "loss": 0.0537, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 208632133.0, "step": 252 }, { "entropy": 0.5220108032226562, "epoch": 2.875, "grad_norm": 8.746301873134408, "learning_rate": 2.8980522185776065e-06, "loss": 0.0627, "mean_token_accuracy": 0.977864584652707, "num_tokens": 209471247.0, "step": 253 }, { "entropy": 0.5236358642578125, "epoch": 2.8863636363636362, "grad_norm": 1.998986620458919, "learning_rate": 2.882567870849029e-06, "loss": 0.0236, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 210282643.0, "step": 254 }, { "entropy": 0.5129623413085938, "epoch": 2.8977272727272725, "grad_norm": 4.1198700432367605, "learning_rate": 2.8670684802275173e-06, "loss": 0.0269, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 211126378.0, "step": 255 }, { "entropy": 0.5134735107421875, "epoch": 2.909090909090909, "grad_norm": 5.348962259064157, "learning_rate": 2.8515546561622464e-06, "loss": 0.0302, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 211953021.0, "step": 256 }, { "entropy": 0.5159378051757812, "epoch": 2.9204545454545454, "grad_norm": 5.307710488632053, "learning_rate": 2.8360270086699274e-06, "loss": 0.0365, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 212784194.0, "step": 257 }, { "entropy": 0.5262603759765625, "epoch": 2.9318181818181817, "grad_norm": 7.506532419063644, "learning_rate": 2.820486148310822e-06, "loss": 0.0421, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 213585134.0, "step": 258 }, { "entropy": 0.537200927734375, "epoch": 2.9431818181818183, "grad_norm": 3.3635404051746, "learning_rate": 2.8049326861647303e-06, "loss": 0.0233, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 214369055.0, "step": 259 }, { "entropy": 0.5199203491210938, "epoch": 2.9545454545454546, "grad_norm": 3.0956317736212777, "learning_rate": 2.7893672338069666e-06, "loss": 0.026, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 215189129.0, "step": 260 }, { "entropy": 0.5302810668945312, "epoch": 2.965909090909091, "grad_norm": 5.926197968230742, "learning_rate": 2.7737904032843105e-06, "loss": 0.0382, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 215982699.0, "step": 261 }, { "entropy": 0.5257034301757812, "epoch": 2.9772727272727275, "grad_norm": 4.940529587588766, "learning_rate": 2.7582028070909415e-06, "loss": 0.0226, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 216773800.0, "step": 262 }, { "entropy": 0.5219879150390625, "epoch": 2.9886363636363638, "grad_norm": 3.317011621794809, "learning_rate": 2.742605058144352e-06, "loss": 0.024, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 217577013.0, "step": 263 }, { "entropy": 0.5041275024414062, "epoch": 3.0, "grad_norm": 4.375219675071472, "learning_rate": 2.7269977697612515e-06, "loss": 0.0336, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 218445070.0, "step": 264 }, { "entropy": 0.5074462890625, "epoch": 3.0113636363636362, "grad_norm": 5.638061777523044, "learning_rate": 2.7113815556334478e-06, "loss": 0.0541, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 219295636.0, "step": 265 }, { "entropy": 0.5047531127929688, "epoch": 3.022727272727273, "grad_norm": 7.001969084082727, "learning_rate": 2.6957570298037156e-06, "loss": 0.0236, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 220144284.0, "step": 266 }, { "entropy": 0.5129165649414062, "epoch": 3.034090909090909, "grad_norm": 5.988487899098891, "learning_rate": 2.680124806641654e-06, "loss": 0.0352, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 220981512.0, "step": 267 }, { "entropy": 0.5201339721679688, "epoch": 3.0454545454545454, "grad_norm": 5.136716069392655, "learning_rate": 2.664485500819527e-06, "loss": 0.0311, "mean_token_accuracy": 0.989583333954215, "num_tokens": 221793779.0, "step": 268 }, { "entropy": 0.5125503540039062, "epoch": 3.0568181818181817, "grad_norm": 5.2949927163413895, "learning_rate": 2.6488397272880943e-06, "loss": 0.0365, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 222639289.0, "step": 269 }, { "entropy": 0.5237808227539062, "epoch": 3.0681818181818183, "grad_norm": 4.3434336811837335, "learning_rate": 2.633188101252433e-06, "loss": 0.0431, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 223441006.0, "step": 270 }, { "entropy": 0.5213165283203125, "epoch": 3.0795454545454546, "grad_norm": 3.3532803929089723, "learning_rate": 2.617531238147744e-06, "loss": 0.0167, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 224253520.0, "step": 271 }, { "entropy": 0.5045852661132812, "epoch": 3.090909090909091, "grad_norm": 9.04867592578753, "learning_rate": 2.6018697536151554e-06, "loss": 0.034, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 225092756.0, "step": 272 }, { "entropy": 0.5112762451171875, "epoch": 3.102272727272727, "grad_norm": 6.089856114683977, "learning_rate": 2.5862042634775125e-06, "loss": 0.0266, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 225914161.0, "step": 273 }, { "entropy": 0.5180816650390625, "epoch": 3.1136363636363638, "grad_norm": 2.670172765084978, "learning_rate": 2.5705353837151655e-06, "loss": 0.0147, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 226727320.0, "step": 274 }, { "entropy": 0.5142440795898438, "epoch": 3.125, "grad_norm": 4.087182451197898, "learning_rate": 2.554863730441748e-06, "loss": 0.0319, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 227543923.0, "step": 275 }, { "entropy": 0.5253143310546875, "epoch": 3.1363636363636362, "grad_norm": 3.238450710346099, "learning_rate": 2.5391899198799475e-06, "loss": 0.018, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 228354592.0, "step": 276 }, { "entropy": 0.5085906982421875, "epoch": 3.147727272727273, "grad_norm": 4.310933841924269, "learning_rate": 2.5235145683372813e-06, "loss": 0.03, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 229189324.0, "step": 277 }, { "entropy": 0.5243377685546875, "epoch": 3.159090909090909, "grad_norm": 5.982970780385657, "learning_rate": 2.507838292181858e-06, "loss": 0.0273, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 229990721.0, "step": 278 }, { "entropy": 0.5168304443359375, "epoch": 3.1704545454545454, "grad_norm": 5.044799482329287, "learning_rate": 2.4921617078181425e-06, "loss": 0.0214, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 230813428.0, "step": 279 }, { "entropy": 0.5148696899414062, "epoch": 3.1818181818181817, "grad_norm": 4.358921357863228, "learning_rate": 2.47648543166272e-06, "loss": 0.0365, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 231638664.0, "step": 280 }, { "entropy": 0.5061111450195312, "epoch": 3.1931818181818183, "grad_norm": 3.3414807488803993, "learning_rate": 2.4608100801200533e-06, "loss": 0.0167, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 232502400.0, "step": 281 }, { "entropy": 0.5233993530273438, "epoch": 3.2045454545454546, "grad_norm": 2.5835308691930488, "learning_rate": 2.445136269558254e-06, "loss": 0.019, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 233292296.0, "step": 282 }, { "entropy": 0.5108566284179688, "epoch": 3.215909090909091, "grad_norm": 3.063861729689834, "learning_rate": 2.4294646162848353e-06, "loss": 0.0224, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 234121658.0, "step": 283 }, { "entropy": 0.5061569213867188, "epoch": 3.227272727272727, "grad_norm": 4.3710805919644935, "learning_rate": 2.413795736522489e-06, "loss": 0.0199, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 234953163.0, "step": 284 }, { "entropy": 0.49341583251953125, "epoch": 3.2386363636363638, "grad_norm": 5.3087603859535974, "learning_rate": 2.3981302463848454e-06, "loss": 0.0206, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 235818597.0, "step": 285 }, { "entropy": 0.518768310546875, "epoch": 3.25, "grad_norm": 4.697159032722577, "learning_rate": 2.3824687618522567e-06, "loss": 0.0285, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 236602971.0, "step": 286 }, { "entropy": 0.5102920532226562, "epoch": 3.2613636363636362, "grad_norm": 2.1053875542266964, "learning_rate": 2.366811898747568e-06, "loss": 0.0155, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 237421849.0, "step": 287 }, { "entropy": 0.5016021728515625, "epoch": 3.2727272727272725, "grad_norm": 5.007825413263214, "learning_rate": 2.351160272711907e-06, "loss": 0.03, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 238264490.0, "step": 288 }, { "entropy": 0.5040969848632812, "epoch": 3.284090909090909, "grad_norm": 4.54185116627051, "learning_rate": 2.3355144991804736e-06, "loss": 0.0249, "mean_token_accuracy": 0.989583333954215, "num_tokens": 239083537.0, "step": 289 }, { "entropy": 0.5108413696289062, "epoch": 3.2954545454545454, "grad_norm": 3.6101103237544443, "learning_rate": 2.3198751933583463e-06, "loss": 0.0175, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 239887927.0, "step": 290 }, { "entropy": 0.5096359252929688, "epoch": 3.3068181818181817, "grad_norm": 5.673455850680201, "learning_rate": 2.304242970196285e-06, "loss": 0.0176, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 240692045.0, "step": 291 }, { "entropy": 0.5037689208984375, "epoch": 3.3181818181818183, "grad_norm": 4.418027168086691, "learning_rate": 2.2886184443665522e-06, "loss": 0.0154, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 241490985.0, "step": 292 }, { "entropy": 0.5098648071289062, "epoch": 3.3295454545454546, "grad_norm": 4.29411735744527, "learning_rate": 2.2730022302387493e-06, "loss": 0.0237, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 242287672.0, "step": 293 }, { "entropy": 0.49761199951171875, "epoch": 3.340909090909091, "grad_norm": 2.6648692660994255, "learning_rate": 2.257394941855648e-06, "loss": 0.0121, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 243126330.0, "step": 294 }, { "entropy": 0.5023422241210938, "epoch": 3.3522727272727275, "grad_norm": 6.819480101828647, "learning_rate": 2.2417971929090593e-06, "loss": 0.0408, "mean_token_accuracy": 0.989583333954215, "num_tokens": 243934709.0, "step": 295 }, { "entropy": 0.49542236328125, "epoch": 3.3636363636363638, "grad_norm": 4.254807585399875, "learning_rate": 2.2262095967156895e-06, "loss": 0.0236, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 244774180.0, "step": 296 }, { "entropy": 0.5053787231445312, "epoch": 3.375, "grad_norm": 3.7942834183716627, "learning_rate": 2.2106327661930343e-06, "loss": 0.013, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 245590040.0, "step": 297 }, { "entropy": 0.49605560302734375, "epoch": 3.3863636363636362, "grad_norm": 2.942700286159109, "learning_rate": 2.19506731383527e-06, "loss": 0.0181, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 246440516.0, "step": 298 }, { "entropy": 0.5043182373046875, "epoch": 3.3977272727272725, "grad_norm": 3.503113960805014, "learning_rate": 2.1795138516891786e-06, "loss": 0.0184, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 247249584.0, "step": 299 }, { "entropy": 0.5015716552734375, "epoch": 3.409090909090909, "grad_norm": 5.5439080197236095, "learning_rate": 2.163972991330073e-06, "loss": 0.0132, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 248078340.0, "step": 300 }, { "entropy": 0.49102783203125, "epoch": 3.4204545454545454, "grad_norm": 5.088980442111073, "learning_rate": 2.148445343837755e-06, "loss": 0.0129, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 248927712.0, "step": 301 }, { "entropy": 0.49953460693359375, "epoch": 3.4318181818181817, "grad_norm": 5.204176680601301, "learning_rate": 2.1329315197724835e-06, "loss": 0.0273, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 249746453.0, "step": 302 }, { "entropy": 0.49965667724609375, "epoch": 3.4431818181818183, "grad_norm": 5.045844298995128, "learning_rate": 2.1174321291509716e-06, "loss": 0.023, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 250589656.0, "step": 303 }, { "entropy": 0.48998260498046875, "epoch": 3.4545454545454546, "grad_norm": 4.046888074688225, "learning_rate": 2.1019477814223943e-06, "loss": 0.0121, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 251475034.0, "step": 304 }, { "entropy": 0.5137939453125, "epoch": 3.465909090909091, "grad_norm": 5.163377258776933, "learning_rate": 2.086479085444429e-06, "loss": 0.0384, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 252281615.0, "step": 305 }, { "entropy": 0.498199462890625, "epoch": 3.4772727272727275, "grad_norm": 5.648873679939671, "learning_rate": 2.071026649459311e-06, "loss": 0.015, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 253112081.0, "step": 306 }, { "entropy": 0.5233306884765625, "epoch": 3.4886363636363638, "grad_norm": 3.589697703897856, "learning_rate": 2.055591081069922e-06, "loss": 0.0101, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 253899681.0, "step": 307 }, { "entropy": 0.5106887817382812, "epoch": 3.5, "grad_norm": 5.074293050610865, "learning_rate": 2.040172987215893e-06, "loss": 0.0127, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 254708112.0, "step": 308 }, { "entropy": 0.495269775390625, "epoch": 3.5113636363636362, "grad_norm": 3.675745737048211, "learning_rate": 2.024772974149741e-06, "loss": 0.0125, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 255563485.0, "step": 309 }, { "entropy": 0.5013885498046875, "epoch": 3.5227272727272725, "grad_norm": 4.817798054202063, "learning_rate": 2.0093916474130354e-06, "loss": 0.0203, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 256413263.0, "step": 310 }, { "entropy": 0.5038299560546875, "epoch": 3.534090909090909, "grad_norm": 3.1418020211628033, "learning_rate": 1.9940296118125776e-06, "loss": 0.0116, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 257245420.0, "step": 311 }, { "entropy": 0.5105819702148438, "epoch": 3.5454545454545454, "grad_norm": 3.456624325090506, "learning_rate": 1.9786874713966293e-06, "loss": 0.0143, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 258077483.0, "step": 312 }, { "entropy": 0.5098724365234375, "epoch": 3.5568181818181817, "grad_norm": 3.4342539832370003, "learning_rate": 1.9633658294311535e-06, "loss": 0.0101, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 258880273.0, "step": 313 }, { "entropy": 0.4929656982421875, "epoch": 3.5681818181818183, "grad_norm": 3.764505067295885, "learning_rate": 1.9480652883761007e-06, "loss": 0.0197, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 259750176.0, "step": 314 }, { "entropy": 0.516937255859375, "epoch": 3.5795454545454546, "grad_norm": 2.561449184590668, "learning_rate": 1.9327864498617114e-06, "loss": 0.0157, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 260566530.0, "step": 315 }, { "entropy": 0.5121307373046875, "epoch": 3.590909090909091, "grad_norm": 1.0671362172663923, "learning_rate": 1.9175299146648672e-06, "loss": 0.0053, "mean_token_accuracy": 1.0, "num_tokens": 261367907.0, "step": 316 }, { "entropy": 0.5068359375, "epoch": 3.6022727272727275, "grad_norm": 3.381064113252212, "learning_rate": 1.9022962826854619e-06, "loss": 0.019, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 262183663.0, "step": 317 }, { "entropy": 0.514404296875, "epoch": 3.6136363636363638, "grad_norm": 3.890719744287091, "learning_rate": 1.887086152922818e-06, "loss": 0.0305, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 262991003.0, "step": 318 }, { "entropy": 0.497894287109375, "epoch": 3.625, "grad_norm": 4.459677614690958, "learning_rate": 1.8719001234521283e-06, "loss": 0.0158, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 263838798.0, "step": 319 }, { "entropy": 0.5207977294921875, "epoch": 3.6363636363636362, "grad_norm": 3.3095750180357797, "learning_rate": 1.8567387914009432e-06, "loss": 0.0268, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 264623780.0, "step": 320 }, { "entropy": 0.4974212646484375, "epoch": 3.6477272727272725, "grad_norm": 4.065611989330324, "learning_rate": 1.8416027529256885e-06, "loss": 0.0203, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 265460612.0, "step": 321 }, { "entropy": 0.49669647216796875, "epoch": 3.659090909090909, "grad_norm": 2.750458063856211, "learning_rate": 1.8264926031882274e-06, "loss": 0.0133, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 266306607.0, "step": 322 }, { "entropy": 0.497406005859375, "epoch": 3.6704545454545454, "grad_norm": 2.8131906720041067, "learning_rate": 1.8114089363324525e-06, "loss": 0.0111, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 267163529.0, "step": 323 }, { "entropy": 0.5118255615234375, "epoch": 3.6818181818181817, "grad_norm": 5.003100497642272, "learning_rate": 1.7963523454609317e-06, "loss": 0.0145, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 267949078.0, "step": 324 }, { "entropy": 0.48583984375, "epoch": 3.6931818181818183, "grad_norm": 2.0483918553702356, "learning_rate": 1.7813234226115767e-06, "loss": 0.006, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 268835407.0, "step": 325 }, { "entropy": 0.499237060546875, "epoch": 3.7045454545454546, "grad_norm": 2.86400752347896, "learning_rate": 1.766322758734374e-06, "loss": 0.0085, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 269641194.0, "step": 326 }, { "entropy": 0.50311279296875, "epoch": 3.715909090909091, "grad_norm": 4.259573160972011, "learning_rate": 1.75135094366814e-06, "loss": 0.0268, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 270450463.0, "step": 327 }, { "entropy": 0.499053955078125, "epoch": 3.7272727272727275, "grad_norm": 3.4557767603144476, "learning_rate": 1.7364085661173346e-06, "loss": 0.0115, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 271255011.0, "step": 328 }, { "entropy": 0.4916534423828125, "epoch": 3.7386363636363638, "grad_norm": 3.049050929462369, "learning_rate": 1.721496213628906e-06, "loss": 0.0187, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 272118194.0, "step": 329 }, { "entropy": 0.4965972900390625, "epoch": 3.75, "grad_norm": 3.2555463571102043, "learning_rate": 1.7066144725691933e-06, "loss": 0.0227, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 272929742.0, "step": 330 }, { "entropy": 0.52386474609375, "epoch": 3.7613636363636362, "grad_norm": 4.093269827080145, "learning_rate": 1.6917639281008703e-06, "loss": 0.0126, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 273675439.0, "step": 331 }, { "entropy": 0.5012054443359375, "epoch": 3.7727272727272725, "grad_norm": 1.3792964976746966, "learning_rate": 1.6769451641599305e-06, "loss": 0.0048, "mean_token_accuracy": 1.0, "num_tokens": 274521549.0, "step": 332 }, { "entropy": 0.5058517456054688, "epoch": 3.784090909090909, "grad_norm": 3.589443939404163, "learning_rate": 1.6621587634327328e-06, "loss": 0.0217, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 275329538.0, "step": 333 }, { "entropy": 0.5105743408203125, "epoch": 3.7954545454545454, "grad_norm": 3.9777266239767712, "learning_rate": 1.647405307333085e-06, "loss": 0.0091, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 276142960.0, "step": 334 }, { "entropy": 0.5018463134765625, "epoch": 3.8068181818181817, "grad_norm": 3.3505185228236614, "learning_rate": 1.6326853759793878e-06, "loss": 0.0138, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 276958754.0, "step": 335 }, { "entropy": 0.50335693359375, "epoch": 3.8181818181818183, "grad_norm": 2.099970477822091, "learning_rate": 1.6179995481718165e-06, "loss": 0.0128, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 277760913.0, "step": 336 }, { "entropy": 0.4929351806640625, "epoch": 3.8295454545454546, "grad_norm": 2.610807645269833, "learning_rate": 1.6033484013695688e-06, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 278576783.0, "step": 337 }, { "entropy": 0.49352264404296875, "epoch": 3.840909090909091, "grad_norm": 3.847350242670301, "learning_rate": 1.588732511668153e-06, "loss": 0.0082, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 279418488.0, "step": 338 }, { "entropy": 0.49383544921875, "epoch": 3.8522727272727275, "grad_norm": 4.737949727080085, "learning_rate": 1.5741524537767427e-06, "loss": 0.0108, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 280251812.0, "step": 339 }, { "entropy": 0.495880126953125, "epoch": 3.8636363636363638, "grad_norm": 2.54135506307979, "learning_rate": 1.5596088009955695e-06, "loss": 0.0066, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 281080773.0, "step": 340 }, { "entropy": 0.49477386474609375, "epoch": 3.875, "grad_norm": 2.7265304592667223, "learning_rate": 1.5451021251933895e-06, "loss": 0.0061, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 281904123.0, "step": 341 }, { "entropy": 0.4793701171875, "epoch": 3.8863636363636362, "grad_norm": 1.7759375021299968, "learning_rate": 1.5306329967849887e-06, "loss": 0.0116, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 282775309.0, "step": 342 }, { "entropy": 0.4952545166015625, "epoch": 3.8977272727272725, "grad_norm": 7.042229803217046, "learning_rate": 1.5162019847087616e-06, "loss": 0.0087, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 283606678.0, "step": 343 }, { "entropy": 0.488067626953125, "epoch": 3.909090909090909, "grad_norm": 2.4721993122123305, "learning_rate": 1.5018096564043333e-06, "loss": 0.0052, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 284454938.0, "step": 344 }, { "entropy": 0.48111724853515625, "epoch": 3.9204545454545454, "grad_norm": 2.7747076531230332, "learning_rate": 1.4874565777902518e-06, "loss": 0.0075, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 285309488.0, "step": 345 }, { "entropy": 0.5005111694335938, "epoch": 3.9318181818181817, "grad_norm": 6.503072352593269, "learning_rate": 1.4731433132417316e-06, "loss": 0.0168, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 286135129.0, "step": 346 }, { "entropy": 0.49593353271484375, "epoch": 3.9431818181818183, "grad_norm": 2.587049362174535, "learning_rate": 1.4588704255684697e-06, "loss": 0.0174, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 286958859.0, "step": 347 }, { "entropy": 0.48793792724609375, "epoch": 3.9545454545454546, "grad_norm": 6.747296911451112, "learning_rate": 1.4446384759925024e-06, "loss": 0.0149, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 287805002.0, "step": 348 }, { "entropy": 0.5005645751953125, "epoch": 3.965909090909091, "grad_norm": 4.205331165935632, "learning_rate": 1.4304480241261529e-06, "loss": 0.011, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 288642537.0, "step": 349 }, { "entropy": 0.48992156982421875, "epoch": 3.9772727272727275, "grad_norm": 4.787823771311458, "learning_rate": 1.4162996279500158e-06, "loss": 0.0297, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 289472032.0, "step": 350 }, { "entropy": 0.505859375, "epoch": 3.9886363636363638, "grad_norm": 5.574571691374813, "learning_rate": 1.4021938437910181e-06, "loss": 0.0143, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 290273198.0, "step": 351 }, { "entropy": 0.47916412353515625, "epoch": 4.0, "grad_norm": 3.0588352539730232, "learning_rate": 1.388131226300552e-06, "loss": 0.0081, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 291144353.0, "step": 352 }, { "entropy": 0.4926605224609375, "epoch": 4.011363636363637, "grad_norm": 3.025749527554892, "learning_rate": 1.374112328432652e-06, "loss": 0.014, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 291972475.0, "step": 353 }, { "entropy": 0.4936981201171875, "epoch": 4.0227272727272725, "grad_norm": 1.7451625346318853, "learning_rate": 1.3601377014222688e-06, "loss": 0.0153, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 292787195.0, "step": 354 }, { "entropy": 0.5028610229492188, "epoch": 4.034090909090909, "grad_norm": 2.9232824281274037, "learning_rate": 1.3462078947635781e-06, "loss": 0.01, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 293578363.0, "step": 355 }, { "entropy": 0.5084609985351562, "epoch": 4.045454545454546, "grad_norm": 4.536859602349459, "learning_rate": 1.3323234561883847e-06, "loss": 0.0102, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 294378786.0, "step": 356 }, { "entropy": 0.49819183349609375, "epoch": 4.056818181818182, "grad_norm": 3.746680387955585, "learning_rate": 1.318484931644582e-06, "loss": 0.0071, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 295215438.0, "step": 357 }, { "entropy": 0.5009613037109375, "epoch": 4.068181818181818, "grad_norm": 3.712740822459945, "learning_rate": 1.3046928652746833e-06, "loss": 0.0172, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 296029918.0, "step": 358 }, { "entropy": 0.4818267822265625, "epoch": 4.079545454545454, "grad_norm": 2.8940045765908753, "learning_rate": 1.2909477993944286e-06, "loss": 0.0066, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 296891697.0, "step": 359 }, { "entropy": 0.490875244140625, "epoch": 4.090909090909091, "grad_norm": 2.347888529432484, "learning_rate": 1.2772502744714592e-06, "loss": 0.014, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 297738951.0, "step": 360 }, { "entropy": 0.4999847412109375, "epoch": 4.1022727272727275, "grad_norm": 3.5228155116652844, "learning_rate": 1.2636008291040618e-06, "loss": 0.013, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 298593160.0, "step": 361 }, { "entropy": 0.49713134765625, "epoch": 4.113636363636363, "grad_norm": 3.446218217523892, "learning_rate": 1.2500000000000007e-06, "loss": 0.0141, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 299423053.0, "step": 362 }, { "entropy": 0.49144744873046875, "epoch": 4.125, "grad_norm": 3.0365054145039037, "learning_rate": 1.236448321955401e-06, "loss": 0.0086, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 300256502.0, "step": 363 }, { "entropy": 0.5076980590820312, "epoch": 4.136363636363637, "grad_norm": 0.9932804984104635, "learning_rate": 1.222946327833731e-06, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 301066005.0, "step": 364 }, { "entropy": 0.48675537109375, "epoch": 4.1477272727272725, "grad_norm": 2.3517119546250465, "learning_rate": 1.2094945485448424e-06, "loss": 0.0071, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 301919771.0, "step": 365 }, { "entropy": 0.5002212524414062, "epoch": 4.159090909090909, "grad_norm": 0.9715596419931048, "learning_rate": 1.196093513024099e-06, "loss": 0.0046, "mean_token_accuracy": 1.0, "num_tokens": 302745145.0, "step": 366 }, { "entropy": 0.4864044189453125, "epoch": 4.170454545454546, "grad_norm": 3.7968011078794586, "learning_rate": 1.182743748211576e-06, "loss": 0.0078, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 303616597.0, "step": 367 }, { "entropy": 0.49373626708984375, "epoch": 4.181818181818182, "grad_norm": 2.874821117414963, "learning_rate": 1.1694457790313403e-06, "loss": 0.02, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 304444351.0, "step": 368 }, { "entropy": 0.486968994140625, "epoch": 4.193181818181818, "grad_norm": 2.074445350579162, "learning_rate": 1.15620012837081e-06, "loss": 0.0041, "mean_token_accuracy": 1.0, "num_tokens": 305286990.0, "step": 369 }, { "entropy": 0.49383544921875, "epoch": 4.204545454545454, "grad_norm": 1.9418684032272442, "learning_rate": 1.1430073170601968e-06, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 306109641.0, "step": 370 }, { "entropy": 0.507232666015625, "epoch": 4.215909090909091, "grad_norm": 4.069513556231147, "learning_rate": 1.1298678638520247e-06, "loss": 0.0053, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 306902232.0, "step": 371 }, { "entropy": 0.4990386962890625, "epoch": 4.2272727272727275, "grad_norm": 3.444347669415785, "learning_rate": 1.1167822854007265e-06, "loss": 0.0242, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 307719255.0, "step": 372 }, { "entropy": 0.480438232421875, "epoch": 4.238636363636363, "grad_norm": 0.6683414614854636, "learning_rate": 1.1037510962423425e-06, "loss": 0.0117, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 308575072.0, "step": 373 }, { "entropy": 0.484466552734375, "epoch": 4.25, "grad_norm": 6.59972703836071, "learning_rate": 1.0907748087742716e-06, "loss": 0.0168, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 309421914.0, "step": 374 }, { "entropy": 0.500274658203125, "epoch": 4.261363636363637, "grad_norm": 5.5296244355600015, "learning_rate": 1.0778539332351374e-06, "loss": 0.0066, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 310225102.0, "step": 375 }, { "entropy": 0.48038482666015625, "epoch": 4.2727272727272725, "grad_norm": 2.7332507609875982, "learning_rate": 1.0649889776847161e-06, "loss": 0.0073, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 311092524.0, "step": 376 }, { "entropy": 0.49211883544921875, "epoch": 4.284090909090909, "grad_norm": 6.20627247496486, "learning_rate": 1.0521804479839651e-06, "loss": 0.0165, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 311920496.0, "step": 377 }, { "entropy": 0.5010910034179688, "epoch": 4.295454545454546, "grad_norm": 1.9825033722155454, "learning_rate": 1.0394288477751274e-06, "loss": 0.006, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 312734021.0, "step": 378 }, { "entropy": 0.49372100830078125, "epoch": 4.306818181818182, "grad_norm": 2.062597317498596, "learning_rate": 1.0267346784619324e-06, "loss": 0.0045, "mean_token_accuracy": 1.0, "num_tokens": 313570038.0, "step": 379 }, { "entropy": 0.4984283447265625, "epoch": 4.318181818181818, "grad_norm": 3.0372016073487567, "learning_rate": 1.0140984391898744e-06, "loss": 0.0048, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 314385328.0, "step": 380 }, { "entropy": 0.4894866943359375, "epoch": 4.329545454545454, "grad_norm": 2.327251786892091, "learning_rate": 1.0015206268265948e-06, "loss": 0.0042, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 315206928.0, "step": 381 }, { "entropy": 0.49160003662109375, "epoch": 4.340909090909091, "grad_norm": 2.974267125288258, "learning_rate": 9.890017359423326e-07, "loss": 0.0038, "mean_token_accuracy": 1.0, "num_tokens": 316035622.0, "step": 382 }, { "entropy": 0.5143508911132812, "epoch": 4.3522727272727275, "grad_norm": 1.7357788012156292, "learning_rate": 9.765422587904919e-07, "loss": 0.0126, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 316811648.0, "step": 383 }, { "entropy": 0.4999237060546875, "epoch": 4.363636363636363, "grad_norm": 2.6366697025032337, "learning_rate": 9.641426852882717e-07, "loss": 0.0176, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 317623918.0, "step": 384 }, { "entropy": 0.495269775390625, "epoch": 4.375, "grad_norm": 2.501283842298434, "learning_rate": 9.518035029974127e-07, "loss": 0.0045, "mean_token_accuracy": 1.0, "num_tokens": 318428730.0, "step": 385 }, { "entropy": 0.4707489013671875, "epoch": 4.386363636363637, "grad_norm": 1.0161739496619286, "learning_rate": 9.395251971050206e-07, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 319302783.0, "step": 386 }, { "entropy": 0.5036773681640625, "epoch": 4.3977272727272725, "grad_norm": 0.5414812981393075, "learning_rate": 9.273082504044903e-07, "loss": 0.0025, "mean_token_accuracy": 1.0, "num_tokens": 320101409.0, "step": 387 }, { "entropy": 0.4913787841796875, "epoch": 4.409090909090909, "grad_norm": 7.733854824346476, "learning_rate": 9.151531432765204e-07, "loss": 0.0102, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 320913133.0, "step": 388 }, { "entropy": 0.47967529296875, "epoch": 4.420454545454546, "grad_norm": 2.3580133114107587, "learning_rate": 9.030603536702254e-07, "loss": 0.0039, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 321768392.0, "step": 389 }, { "entropy": 0.49045562744140625, "epoch": 4.431818181818182, "grad_norm": 4.210313755601456, "learning_rate": 8.910303570843423e-07, "loss": 0.0041, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 322599634.0, "step": 390 }, { "entropy": 0.493011474609375, "epoch": 4.443181818181818, "grad_norm": 3.8913982883862963, "learning_rate": 8.790636265485333e-07, "loss": 0.0162, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 323395015.0, "step": 391 }, { "entropy": 0.495269775390625, "epoch": 4.454545454545454, "grad_norm": 0.346768689760395, "learning_rate": 8.67160632604786e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 324210995.0, "step": 392 }, { "entropy": 0.49155426025390625, "epoch": 4.465909090909091, "grad_norm": 0.5274973505562733, "learning_rate": 8.553218432889091e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 325018202.0, "step": 393 }, { "entropy": 0.48851776123046875, "epoch": 4.4772727272727275, "grad_norm": 3.449091621793021, "learning_rate": 8.435477241121354e-07, "loss": 0.0036, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 325836067.0, "step": 394 }, { "entropy": 0.48577880859375, "epoch": 4.488636363636363, "grad_norm": 1.7526399488203852, "learning_rate": 8.31838738042808e-07, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 326648663.0, "step": 395 }, { "entropy": 0.49868011474609375, "epoch": 4.5, "grad_norm": 3.8013694140306917, "learning_rate": 8.201953454881844e-07, "loss": 0.0074, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 327428989.0, "step": 396 }, { "entropy": 0.48467254638671875, "epoch": 4.511363636363637, "grad_norm": 10.511325463365447, "learning_rate": 8.086180042763284e-07, "loss": 0.0091, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 328263854.0, "step": 397 }, { "entropy": 0.4808349609375, "epoch": 4.5227272727272725, "grad_norm": 2.4305720746807515, "learning_rate": 7.971071696381089e-07, "loss": 0.0141, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 329087106.0, "step": 398 }, { "entropy": 0.497528076171875, "epoch": 4.534090909090909, "grad_norm": 0.675792729286102, "learning_rate": 7.856632941893e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 329907669.0, "step": 399 }, { "entropy": 0.48723602294921875, "epoch": 4.545454545454545, "grad_norm": 2.4628320022302175, "learning_rate": 7.74286827912785e-07, "loss": 0.0031, "mean_token_accuracy": 1.0, "num_tokens": 330727069.0, "step": 400 }, { "entropy": 0.471282958984375, "epoch": 4.556818181818182, "grad_norm": 5.265304674225523, "learning_rate": 7.629782181408574e-07, "loss": 0.0147, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 331590906.0, "step": 401 }, { "entropy": 0.46993255615234375, "epoch": 4.568181818181818, "grad_norm": 4.67378034474964, "learning_rate": 7.517379095376418e-07, "loss": 0.0048, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 332461949.0, "step": 402 }, { "entropy": 0.5018539428710938, "epoch": 4.579545454545455, "grad_norm": 2.09672088064596, "learning_rate": 7.405663440815968e-07, "loss": 0.0082, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 333263860.0, "step": 403 }, { "entropy": 0.48856353759765625, "epoch": 4.590909090909091, "grad_norm": 2.0511373755169506, "learning_rate": 7.294639610481461e-07, "loss": 0.0083, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 334055426.0, "step": 404 }, { "entropy": 0.47692108154296875, "epoch": 4.6022727272727275, "grad_norm": 1.5653117450616783, "learning_rate": 7.184311969924002e-07, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 334899503.0, "step": 405 }, { "entropy": 0.47054290771484375, "epoch": 4.613636363636363, "grad_norm": 0.3087773860358458, "learning_rate": 7.074684857319928e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 335767817.0, "step": 406 }, { "entropy": 0.479583740234375, "epoch": 4.625, "grad_norm": 2.99732336372377, "learning_rate": 6.965762583300223e-07, "loss": 0.0032, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 336621709.0, "step": 407 }, { "entropy": 0.4801177978515625, "epoch": 4.636363636363637, "grad_norm": 1.3076164367052054, "learning_rate": 6.85754943078103e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 337441489.0, "step": 408 }, { "entropy": 0.49835968017578125, "epoch": 4.6477272727272725, "grad_norm": 1.349284457595706, "learning_rate": 6.750049654795199e-07, "loss": 0.0088, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 338232060.0, "step": 409 }, { "entropy": 0.48317718505859375, "epoch": 4.659090909090909, "grad_norm": 2.1021942804973324, "learning_rate": 6.643267482325061e-07, "loss": 0.0029, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 339051182.0, "step": 410 }, { "entropy": 0.4761505126953125, "epoch": 4.670454545454545, "grad_norm": 4.442088129850368, "learning_rate": 6.537207112136143e-07, "loss": 0.0164, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 339905590.0, "step": 411 }, { "entropy": 0.48587799072265625, "epoch": 4.681818181818182, "grad_norm": 5.065463686315352, "learning_rate": 6.431872714612072e-07, "loss": 0.0129, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 340743579.0, "step": 412 }, { "entropy": 0.46688079833984375, "epoch": 4.693181818181818, "grad_norm": 3.228917361025119, "learning_rate": 6.327268431590664e-07, "loss": 0.0066, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 341596634.0, "step": 413 }, { "entropy": 0.48221588134765625, "epoch": 4.704545454545455, "grad_norm": 0.6719090703949218, "learning_rate": 6.223398376200956e-07, "loss": 0.0026, "mean_token_accuracy": 1.0, "num_tokens": 342426853.0, "step": 414 }, { "entropy": 0.48246002197265625, "epoch": 4.715909090909091, "grad_norm": 0.8384280760870119, "learning_rate": 6.1202666327016e-07, "loss": 0.0025, "mean_token_accuracy": 1.0, "num_tokens": 343277692.0, "step": 415 }, { "entropy": 0.4742889404296875, "epoch": 4.7272727272727275, "grad_norm": 2.4415138149124735, "learning_rate": 6.017877256320132e-07, "loss": 0.0048, "mean_token_accuracy": 1.0, "num_tokens": 344128535.0, "step": 416 }, { "entropy": 0.4818267822265625, "epoch": 4.738636363636363, "grad_norm": 0.8515596128253197, "learning_rate": 5.916234273093624e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 344948795.0, "step": 417 }, { "entropy": 0.49347686767578125, "epoch": 4.75, "grad_norm": 9.346479245259347, "learning_rate": 5.815341679710327e-07, "loss": 0.0115, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 345737929.0, "step": 418 }, { "entropy": 0.48424530029296875, "epoch": 4.761363636363637, "grad_norm": 1.339805656978658, "learning_rate": 5.715203443352526e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 346553828.0, "step": 419 }, { "entropy": 0.48157501220703125, "epoch": 4.7727272727272725, "grad_norm": 0.2701253366161105, "learning_rate": 5.615823501540546e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 347375058.0, "step": 420 }, { "entropy": 0.4778289794921875, "epoch": 4.784090909090909, "grad_norm": 0.36859401605585224, "learning_rate": 5.51720576197794e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 348217272.0, "step": 421 }, { "entropy": 0.47646331787109375, "epoch": 4.795454545454545, "grad_norm": 1.3198516138713523, "learning_rate": 5.419354102397792e-07, "loss": 0.0117, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 349043381.0, "step": 422 }, { "entropy": 0.49961090087890625, "epoch": 4.806818181818182, "grad_norm": 4.2939387242943585, "learning_rate": 5.32227237041032e-07, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 349814498.0, "step": 423 }, { "entropy": 0.4700927734375, "epoch": 4.818181818181818, "grad_norm": 2.183285915853867, "learning_rate": 5.22596438335149e-07, "loss": 0.0072, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 350661477.0, "step": 424 }, { "entropy": 0.47753143310546875, "epoch": 4.829545454545455, "grad_norm": 4.542934050380425, "learning_rate": 5.130433928132983e-07, "loss": 0.0042, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 351498317.0, "step": 425 }, { "entropy": 0.46913909912109375, "epoch": 4.840909090909091, "grad_norm": 0.23800276817196775, "learning_rate": 5.035684761093273e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 352355692.0, "step": 426 }, { "entropy": 0.47409820556640625, "epoch": 4.8522727272727275, "grad_norm": 0.25047198404759846, "learning_rate": 4.941720607849912e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 353190411.0, "step": 427 }, { "entropy": 0.4987030029296875, "epoch": 4.863636363636363, "grad_norm": 10.75981988944821, "learning_rate": 4.848545163153048e-07, "loss": 0.0131, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 353977106.0, "step": 428 }, { "entropy": 0.5035171508789062, "epoch": 4.875, "grad_norm": 0.2384511087444417, "learning_rate": 4.756162090740135e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 354746145.0, "step": 429 }, { "entropy": 0.46907806396484375, "epoch": 4.886363636363637, "grad_norm": 6.102423857117631, "learning_rate": 4.6645750231918864e-07, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 355611270.0, "step": 430 }, { "entropy": 0.47377777099609375, "epoch": 4.8977272727272725, "grad_norm": 0.24001371751604936, "learning_rate": 4.5737875617894225e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 356445109.0, "step": 431 }, { "entropy": 0.48812103271484375, "epoch": 4.909090909090909, "grad_norm": 4.762057280592623, "learning_rate": 4.4838032763726806e-07, "loss": 0.006, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 357235786.0, "step": 432 }, { "entropy": 0.49410247802734375, "epoch": 4.920454545454545, "grad_norm": 3.949138959783489, "learning_rate": 4.394625705200012e-07, "loss": 0.0219, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 358031050.0, "step": 433 }, { "entropy": 0.46722412109375, "epoch": 4.931818181818182, "grad_norm": 0.284180125397139, "learning_rate": 4.3062583548091256e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 358895808.0, "step": 434 }, { "entropy": 0.4991455078125, "epoch": 4.943181818181818, "grad_norm": 6.520535055001159, "learning_rate": 4.218704699879117e-07, "loss": 0.0071, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 359692075.0, "step": 435 }, { "entropy": 0.4780120849609375, "epoch": 4.954545454545455, "grad_norm": 0.7522536313096782, "learning_rate": 4.1319681830939124e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 360552654.0, "step": 436 }, { "entropy": 0.46907806396484375, "epoch": 4.965909090909091, "grad_norm": 2.3964109369615016, "learning_rate": 4.0460522150068684e-07, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 361395351.0, "step": 437 }, { "entropy": 0.476226806640625, "epoch": 4.9772727272727275, "grad_norm": 0.50213990162671, "learning_rate": 3.9609601739066664e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 362224545.0, "step": 438 }, { "entropy": 0.48921966552734375, "epoch": 4.988636363636363, "grad_norm": 1.4592075617151454, "learning_rate": 3.876695405684486e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 363032893.0, "step": 439 }, { "entropy": 0.489410400390625, "epoch": 5.0, "grad_norm": 2.556748367856961, "learning_rate": 3.793261223702441e-07, "loss": 0.0095, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 363838139.0, "step": 440 }, { "entropy": 0.47222137451171875, "epoch": 5.011363636363637, "grad_norm": 4.214353845258516, "learning_rate": 3.7106609086632635e-07, "loss": 0.0032, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 364669029.0, "step": 441 }, { "entropy": 0.48236846923828125, "epoch": 5.0227272727272725, "grad_norm": 0.30780546204673304, "learning_rate": 3.628897708481377e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 365462695.0, "step": 442 }, { "entropy": 0.49261474609375, "epoch": 5.034090909090909, "grad_norm": 1.4822159912495783, "learning_rate": 3.5479748381550855e-07, "loss": 0.0077, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 366246764.0, "step": 443 }, { "entropy": 0.45969390869140625, "epoch": 5.045454545454546, "grad_norm": 0.8370386732840761, "learning_rate": 3.4678954796402624e-07, "loss": 0.0095, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 367104037.0, "step": 444 }, { "entropy": 0.45703887939453125, "epoch": 5.056818181818182, "grad_norm": 0.23893007118099577, "learning_rate": 3.388662781725141e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 367968268.0, "step": 445 }, { "entropy": 0.48757171630859375, "epoch": 5.068181818181818, "grad_norm": 0.2517958929459578, "learning_rate": 3.310279859906565e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 368763452.0, "step": 446 }, { "entropy": 0.475311279296875, "epoch": 5.079545454545454, "grad_norm": 1.7028057764506213, "learning_rate": 3.232749796267451e-07, "loss": 0.0144, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 369589368.0, "step": 447 }, { "entropy": 0.478118896484375, "epoch": 5.090909090909091, "grad_norm": 0.24780863890849855, "learning_rate": 3.1560756393556187e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 370411869.0, "step": 448 }, { "entropy": 0.4748687744140625, "epoch": 5.1022727272727275, "grad_norm": 1.1233286081795228, "learning_rate": 3.0802604040639034e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 371255264.0, "step": 449 }, { "entropy": 0.46393585205078125, "epoch": 5.113636363636363, "grad_norm": 3.08110570654071, "learning_rate": 3.0053070715116153e-07, "loss": 0.0026, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 372135606.0, "step": 450 }, { "entropy": 0.48212432861328125, "epoch": 5.125, "grad_norm": 3.202235664798041, "learning_rate": 2.9312185889273147e-07, "loss": 0.0083, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 372974668.0, "step": 451 }, { "entropy": 0.4832611083984375, "epoch": 5.136363636363637, "grad_norm": 0.7865563069141048, "learning_rate": 2.8579978695329386e-07, "loss": 0.0051, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 373795872.0, "step": 452 }, { "entropy": 0.473236083984375, "epoch": 5.1477272727272725, "grad_norm": 2.506322586954564, "learning_rate": 2.785647792429233e-07, "loss": 0.0035, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 374644330.0, "step": 453 }, { "entropy": 0.4847564697265625, "epoch": 5.159090909090909, "grad_norm": 0.551312091586902, "learning_rate": 2.714171202482538e-07, "loss": 0.0086, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 375451445.0, "step": 454 }, { "entropy": 0.48731231689453125, "epoch": 5.170454545454546, "grad_norm": 6.3859087946178414, "learning_rate": 2.6435709102129727e-07, "loss": 0.0046, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 376254658.0, "step": 455 }, { "entropy": 0.482147216796875, "epoch": 5.181818181818182, "grad_norm": 0.526311488156248, "learning_rate": 2.5738496916838524e-07, "loss": 0.0084, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 377072437.0, "step": 456 }, { "entropy": 0.4810333251953125, "epoch": 5.193181818181818, "grad_norm": 3.7772382358130283, "learning_rate": 2.505010288392587e-07, "loss": 0.012, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 377909031.0, "step": 457 }, { "entropy": 0.471832275390625, "epoch": 5.204545454545454, "grad_norm": 2.329737506097157, "learning_rate": 2.4370554071628613e-07, "loss": 0.0175, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 378749326.0, "step": 458 }, { "entropy": 0.477752685546875, "epoch": 5.215909090909091, "grad_norm": 1.2247209201337799, "learning_rate": 2.3699877200382026e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 379588976.0, "step": 459 }, { "entropy": 0.46756744384765625, "epoch": 5.2272727272727275, "grad_norm": 0.41393867739069445, "learning_rate": 2.303809864176909e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 380444915.0, "step": 460 }, { "entropy": 0.484649658203125, "epoch": 5.238636363636363, "grad_norm": 0.4288087261894904, "learning_rate": 2.2385244417483743e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 381261471.0, "step": 461 }, { "entropy": 0.47428131103515625, "epoch": 5.25, "grad_norm": 2.6805042624913673, "learning_rate": 2.174134019830726e-07, "loss": 0.0031, "mean_token_accuracy": 1.0, "num_tokens": 382101653.0, "step": 462 }, { "entropy": 0.48526763916015625, "epoch": 5.261363636363637, "grad_norm": 0.5789242093428709, "learning_rate": 2.1106411303099455e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 382917569.0, "step": 463 }, { "entropy": 0.4815673828125, "epoch": 5.2727272727272725, "grad_norm": 0.7412441363450578, "learning_rate": 2.0480482697802507e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 383733436.0, "step": 464 }, { "entropy": 0.47106170654296875, "epoch": 5.284090909090909, "grad_norm": 0.37355555384510464, "learning_rate": 1.986357899445976e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 384576794.0, "step": 465 }, { "entropy": 0.48104095458984375, "epoch": 5.295454545454546, "grad_norm": 0.38999280654890595, "learning_rate": 1.9255724450247676e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 385382850.0, "step": 466 }, { "entropy": 0.467498779296875, "epoch": 5.306818181818182, "grad_norm": 0.3255882167657849, "learning_rate": 1.8656942966522124e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 386230967.0, "step": 467 }, { "entropy": 0.49216461181640625, "epoch": 5.318181818181818, "grad_norm": 0.812386735624479, "learning_rate": 1.8067258087878597e-07, "loss": 0.0095, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 387023995.0, "step": 468 }, { "entropy": 0.48313140869140625, "epoch": 5.329545454545454, "grad_norm": 1.5287051130874392, "learning_rate": 1.748669300122627e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 387853013.0, "step": 469 }, { "entropy": 0.48089599609375, "epoch": 5.340909090909091, "grad_norm": 0.5410198602305207, "learning_rate": 1.691527053487646e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 388678242.0, "step": 470 }, { "entropy": 0.48954010009765625, "epoch": 5.3522727272727275, "grad_norm": 0.2573461421988016, "learning_rate": 1.635301315764484e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 389489184.0, "step": 471 }, { "entropy": 0.48332977294921875, "epoch": 5.363636363636363, "grad_norm": 0.2512417872893421, "learning_rate": 1.579994297796808e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 390313965.0, "step": 472 }, { "entropy": 0.484710693359375, "epoch": 5.375, "grad_norm": 0.24272552930424235, "learning_rate": 1.5256081743034336e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 391124884.0, "step": 473 }, { "entropy": 0.474761962890625, "epoch": 5.386363636363637, "grad_norm": 0.2441148940956172, "learning_rate": 1.472145083792842e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 391979859.0, "step": 474 }, { "entropy": 0.48592376708984375, "epoch": 5.3977272727272725, "grad_norm": 0.29622625734441144, "learning_rate": 1.419607128479053e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 392807678.0, "step": 475 }, { "entropy": 0.48085784912109375, "epoch": 5.409090909090909, "grad_norm": 1.1708257311301122, "learning_rate": 1.3679963741990127e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 393628416.0, "step": 476 }, { "entropy": 0.4741973876953125, "epoch": 5.420454545454546, "grad_norm": 3.7263249279288244, "learning_rate": 1.317314850331314e-07, "loss": 0.0061, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 394467813.0, "step": 477 }, { "entropy": 0.46479034423828125, "epoch": 5.431818181818182, "grad_norm": 0.23356626570343833, "learning_rate": 1.2675645497164352e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 395318537.0, "step": 478 }, { "entropy": 0.4804534912109375, "epoch": 5.443181818181818, "grad_norm": 5.27811356549522, "learning_rate": 1.2187474285783623e-07, "loss": 0.0045, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 396117712.0, "step": 479 }, { "entropy": 0.4789581298828125, "epoch": 5.454545454545454, "grad_norm": 0.23205909431366892, "learning_rate": 1.1708654064476743e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 396951671.0, "step": 480 }, { "entropy": 0.46929931640625, "epoch": 5.465909090909091, "grad_norm": 0.23000123108156822, "learning_rate": 1.1239203660860648e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 397801199.0, "step": 481 }, { "entropy": 0.470428466796875, "epoch": 5.4772727272727275, "grad_norm": 0.23372087472096778, "learning_rate": 1.0779141534123127e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 398632709.0, "step": 482 }, { "entropy": 0.47586822509765625, "epoch": 5.488636363636363, "grad_norm": 0.23070523406895965, "learning_rate": 1.0328485774296875e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 399472187.0, "step": 483 }, { "entropy": 0.482421875, "epoch": 5.5, "grad_norm": 0.23318075758795018, "learning_rate": 9.887254101548422e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 400281455.0, "step": 484 }, { "entropy": 0.4844207763671875, "epoch": 5.511363636363637, "grad_norm": 0.23208929439471024, "learning_rate": 9.455463865481019e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 401097680.0, "step": 485 }, { "entropy": 0.48187255859375, "epoch": 5.5227272727272725, "grad_norm": 0.23677025538291055, "learning_rate": 9.033132044452775e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 401927273.0, "step": 486 }, { "entropy": 0.47664642333984375, "epoch": 5.534090909090909, "grad_norm": 5.055301927928181, "learning_rate": 8.620275244908826e-08, "loss": 0.0057, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 402736167.0, "step": 487 }, { "entropy": 0.4710693359375, "epoch": 5.545454545454545, "grad_norm": 0.23085051544257776, "learning_rate": 8.216909700728498e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 403584040.0, "step": 488 }, { "entropy": 0.46595001220703125, "epoch": 5.556818181818182, "grad_norm": 0.2298464991859443, "learning_rate": 7.823051272586812e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 404443639.0, "step": 489 }, { "entropy": 0.490386962890625, "epoch": 5.568181818181818, "grad_norm": 1.7487934276030572, "learning_rate": 7.438715447331018e-08, "loss": 0.0057, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 405219096.0, "step": 490 }, { "entropy": 0.46939849853515625, "epoch": 5.579545454545455, "grad_norm": 3.6849857997279503, "learning_rate": 7.063917337371495e-08, "loss": 0.0032, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 406050370.0, "step": 491 }, { "entropy": 0.4792938232421875, "epoch": 5.590909090909091, "grad_norm": 0.48656609958077124, "learning_rate": 6.698671680087643e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 406883685.0, "step": 492 }, { "entropy": 0.48340606689453125, "epoch": 5.6022727272727275, "grad_norm": 0.2277425646629738, "learning_rate": 6.342992837248235e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 407672236.0, "step": 493 }, { "entropy": 0.482147216796875, "epoch": 5.613636363636363, "grad_norm": 0.23759672451897687, "learning_rate": 5.996894794446817e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 408468181.0, "step": 494 }, { "entropy": 0.4691009521484375, "epoch": 5.625, "grad_norm": 0.8356994624990842, "learning_rate": 5.660391160551837e-08, "loss": 0.0103, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 409310506.0, "step": 495 }, { "entropy": 0.4743499755859375, "epoch": 5.636363636363637, "grad_norm": 0.23173743117233309, "learning_rate": 5.333495167171354e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 410129372.0, "step": 496 }, { "entropy": 0.48233795166015625, "epoch": 5.6477272727272725, "grad_norm": 3.325194453062317, "learning_rate": 5.016219668132871e-08, "loss": 0.0076, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 410938341.0, "step": 497 }, { "entropy": 0.4671173095703125, "epoch": 5.659090909090909, "grad_norm": 0.9759331183243299, "learning_rate": 4.708577138977932e-08, "loss": 0.0109, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 411793767.0, "step": 498 }, { "entropy": 0.46318817138671875, "epoch": 5.670454545454545, "grad_norm": 1.3398280119073476, "learning_rate": 4.410579676471571e-08, "loss": 0.0073, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 412651605.0, "step": 499 }, { "entropy": 0.4657745361328125, "epoch": 5.681818181818182, "grad_norm": 2.55570040229675, "learning_rate": 4.1222389981265546e-08, "loss": 0.0086, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 413496664.0, "step": 500 }, { "entropy": 0.4684906005859375, "epoch": 5.693181818181818, "grad_norm": 6.915908450960148, "learning_rate": 3.843566441742774e-08, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 414322992.0, "step": 501 }, { "entropy": 0.48667144775390625, "epoch": 5.704545454545455, "grad_norm": 0.3864454725818334, "learning_rate": 3.574572964961304e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 415150977.0, "step": 502 }, { "entropy": 0.47731781005859375, "epoch": 5.715909090909091, "grad_norm": 0.5359817931960815, "learning_rate": 3.3152691448336825e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 415967079.0, "step": 503 }, { "entropy": 0.4813232421875, "epoch": 5.7272727272727275, "grad_norm": 0.2512550244047445, "learning_rate": 3.065665177405808e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 416791778.0, "step": 504 }, { "entropy": 0.477447509765625, "epoch": 5.738636363636363, "grad_norm": 0.24573660989925844, "learning_rate": 2.825770877317363e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 417633485.0, "step": 505 }, { "entropy": 0.46212005615234375, "epoch": 5.75, "grad_norm": 0.29916205036328847, "learning_rate": 2.5955956774154633e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 418482660.0, "step": 506 }, { "entropy": 0.48030853271484375, "epoch": 5.761363636363637, "grad_norm": 5.663292172784125, "learning_rate": 2.3751486283840884e-08, "loss": 0.0036, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 419319817.0, "step": 507 }, { "entropy": 0.4896087646484375, "epoch": 5.7727272727272725, "grad_norm": 3.733506086862286, "learning_rate": 2.1644383983880356e-08, "loss": 0.0255, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 420120900.0, "step": 508 }, { "entropy": 0.4809722900390625, "epoch": 5.784090909090909, "grad_norm": 0.9657687987783138, "learning_rate": 1.9634732727321636e-08, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 420957434.0, "step": 509 }, { "entropy": 0.46662139892578125, "epoch": 5.795454545454545, "grad_norm": 2.701021841228621, "learning_rate": 1.7722611535355426e-08, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 421804546.0, "step": 510 }, { "entropy": 0.48058319091796875, "epoch": 5.806818181818182, "grad_norm": 0.24552060618334481, "learning_rate": 1.5908095594207585e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 422624032.0, "step": 511 }, { "entropy": 0.47879791259765625, "epoch": 5.818181818181818, "grad_norm": 0.2313724445434632, "learning_rate": 1.4191256252182595e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 423469195.0, "step": 512 }, { "entropy": 0.4875946044921875, "epoch": 5.829545454545455, "grad_norm": 0.24314245718612798, "learning_rate": 1.2572161016858874e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 424277193.0, "step": 513 }, { "entropy": 0.47444915771484375, "epoch": 5.840909090909091, "grad_norm": 0.2530316763515594, "learning_rate": 1.1050873552433394e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 425106713.0, "step": 514 }, { "entropy": 0.47956085205078125, "epoch": 5.8522727272727275, "grad_norm": 0.24440590571395973, "learning_rate": 9.627453677218402e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 425917836.0, "step": 515 }, { "entropy": 0.47054290771484375, "epoch": 5.863636363636363, "grad_norm": 0.23559264304220282, "learning_rate": 8.301957361289969e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 426756350.0, "step": 516 }, { "entropy": 0.48386383056640625, "epoch": 5.875, "grad_norm": 0.23312439693348005, "learning_rate": 7.074436724286704e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 427592830.0, "step": 517 }, { "entropy": 0.48955535888671875, "epoch": 5.886363636363637, "grad_norm": 0.2588648942422833, "learning_rate": 5.944940033360269e-09, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 428381425.0, "step": 518 }, { "entropy": 0.46274566650390625, "epoch": 5.8977272727272725, "grad_norm": 0.8745898260707156, "learning_rate": 4.913511701278017e-09, "loss": 0.0081, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 429230635.0, "step": 519 }, { "entropy": 0.48191070556640625, "epoch": 5.909090909090909, "grad_norm": 0.27536210456621274, "learning_rate": 3.98019228467661e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 430035732.0, "step": 520 }, { "entropy": 0.48334503173828125, "epoch": 5.920454545454545, "grad_norm": 0.2487439601672152, "learning_rate": 3.1450184824657892e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 430861990.0, "step": 521 }, { "entropy": 0.47314453125, "epoch": 5.931818181818182, "grad_norm": 0.2584974817202786, "learning_rate": 2.408023134387871e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 431688137.0, "step": 522 }, { "entropy": 0.47559356689453125, "epoch": 5.943181818181818, "grad_norm": 0.27212198450046776, "learning_rate": 1.7692352197240525e-09, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 432525084.0, "step": 523 }, { "entropy": 0.464752197265625, "epoch": 5.954545454545455, "grad_norm": 0.2510235822217297, "learning_rate": 1.2286798561572666e-09, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 433386586.0, "step": 524 }, { "entropy": 0.479766845703125, "epoch": 5.965909090909091, "grad_norm": 1.0240552861546037, "learning_rate": 7.863782987821422e-10, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 434211459.0, "step": 525 }, { "entropy": 0.47564697265625, "epoch": 5.9772727272727275, "grad_norm": 0.24500907077317072, "learning_rate": 4.4234793927094845e-10, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 435023458.0, "step": 526 }, { "entropy": 0.47664642333984375, "epoch": 5.988636363636363, "grad_norm": 1.6045447652040448, "learning_rate": 1.9660230518886436e-10, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 435856072.0, "step": 527 }, { "entropy": 0.49124908447265625, "epoch": 6.0, "grad_norm": 0.24762061313479247, "learning_rate": 4.915105946246002e-11, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 436633698.0, "step": 528 }, { "epoch": 6.0, "step": 528, "total_flos": 513726926487552.0, "train_loss": 0.5520123199349848, "train_runtime": 70216.6152, "train_samples_per_second": 3.512, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 528, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 44, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 513726926487552.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }