{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.4132080078125, "epoch": 0.010869565217391304, "grad_norm": 311.4203607984754, "learning_rate": 0.0, "loss": 8.9732, "mean_token_accuracy": 0.0026041667442768812, "num_tokens": 736049.0, "step": 1 }, { "entropy": 0.3986663818359375, "epoch": 0.021739130434782608, "grad_norm": 320.59921807794416, "learning_rate": 3.5714285714285716e-07, "loss": 8.9248, "mean_token_accuracy": 0.0, "num_tokens": 1463777.0, "step": 2 }, { "entropy": 0.4055328369140625, "epoch": 0.03260869565217391, "grad_norm": 307.5231697291134, "learning_rate": 7.142857142857143e-07, "loss": 8.9711, "mean_token_accuracy": 0.0, "num_tokens": 2219300.0, "step": 3 }, { "entropy": 0.3943328857421875, "epoch": 0.043478260869565216, "grad_norm": 323.32099848994, "learning_rate": 1.0714285714285714e-06, "loss": 8.7592, "mean_token_accuracy": 0.0, "num_tokens": 2979743.0, "step": 4 }, { "entropy": 0.420135498046875, "epoch": 0.05434782608695652, "grad_norm": 340.3141229539087, "learning_rate": 1.4285714285714286e-06, "loss": 8.2909, "mean_token_accuracy": 0.0, "num_tokens": 3706150.0, "step": 5 }, { "entropy": 0.408721923828125, "epoch": 0.06521739130434782, "grad_norm": 344.270504919432, "learning_rate": 1.7857142857142859e-06, "loss": 8.0728, "mean_token_accuracy": 0.0, "num_tokens": 4445188.0, "step": 6 }, { "entropy": 0.39560699462890625, "epoch": 0.07608695652173914, "grad_norm": 302.2444503869232, "learning_rate": 2.1428571428571427e-06, "loss": 6.2297, "mean_token_accuracy": 0.031250000931322575, "num_tokens": 5187957.0, "step": 7 }, { "entropy": 0.4117279052734375, "epoch": 0.08695652173913043, "grad_norm": 219.04723364481447, "learning_rate": 2.5e-06, "loss": 5.4674, "mean_token_accuracy": 0.08072916907258332, "num_tokens": 5921002.0, "step": 8 }, { "entropy": 0.417694091796875, "epoch": 0.09782608695652174, "grad_norm": 98.1081259847685, "learning_rate": 2.8571428571428573e-06, "loss": 4.2197, "mean_token_accuracy": 0.4973958481568843, "num_tokens": 6652142.0, "step": 9 }, { "entropy": 0.40997314453125, "epoch": 0.10869565217391304, "grad_norm": 86.7533080434819, "learning_rate": 3.2142857142857147e-06, "loss": 4.0085, "mean_token_accuracy": 0.49479168141260743, "num_tokens": 7393898.0, "step": 10 }, { "entropy": 0.4106292724609375, "epoch": 0.11956521739130435, "grad_norm": 59.58522339179672, "learning_rate": 3.5714285714285718e-06, "loss": 3.3894, "mean_token_accuracy": 0.505208348389715, "num_tokens": 8102676.0, "step": 11 }, { "entropy": 0.40155029296875, "epoch": 0.13043478260869565, "grad_norm": 58.92890419627513, "learning_rate": 3.928571428571429e-06, "loss": 3.3107, "mean_token_accuracy": 0.4843750144354999, "num_tokens": 8809290.0, "step": 12 }, { "entropy": 0.393829345703125, "epoch": 0.14130434782608695, "grad_norm": 57.409462446242955, "learning_rate": 4.2857142857142855e-06, "loss": 3.2212, "mean_token_accuracy": 0.5156250153668225, "num_tokens": 9557288.0, "step": 13 }, { "entropy": 0.3956146240234375, "epoch": 0.15217391304347827, "grad_norm": 56.488189115901584, "learning_rate": 4.642857142857144e-06, "loss": 3.1327, "mean_token_accuracy": 0.505208348389715, "num_tokens": 10319598.0, "step": 14 }, { "entropy": 0.4060211181640625, "epoch": 0.16304347826086957, "grad_norm": 60.282711315492946, "learning_rate": 5e-06, "loss": 3.0223, "mean_token_accuracy": 0.5026041816454381, "num_tokens": 11024225.0, "step": 15 }, { "entropy": 0.3878936767578125, "epoch": 0.17391304347826086, "grad_norm": 55.83706199973999, "learning_rate": 5.357142857142857e-06, "loss": 2.9584, "mean_token_accuracy": 0.52604168234393, "num_tokens": 11782202.0, "step": 16 }, { "entropy": 0.4119720458984375, "epoch": 0.18478260869565216, "grad_norm": 56.515615405805015, "learning_rate": 5.7142857142857145e-06, "loss": 2.9469, "mean_token_accuracy": 0.5026041816454381, "num_tokens": 12501745.0, "step": 17 }, { "entropy": 0.4099273681640625, "epoch": 0.1956521739130435, "grad_norm": 56.196543894293534, "learning_rate": 6.071428571428571e-06, "loss": 2.9031, "mean_token_accuracy": 0.52604168234393, "num_tokens": 13238411.0, "step": 18 }, { "entropy": 0.3933258056640625, "epoch": 0.20652173913043478, "grad_norm": 55.19469257080713, "learning_rate": 6.4285714285714295e-06, "loss": 2.8638, "mean_token_accuracy": 0.5546875165309757, "num_tokens": 13976737.0, "step": 19 }, { "entropy": 0.4046630859375, "epoch": 0.21739130434782608, "grad_norm": 55.62375832255656, "learning_rate": 6.785714285714287e-06, "loss": 2.8226, "mean_token_accuracy": 0.52604168234393, "num_tokens": 14713129.0, "step": 20 }, { "entropy": 0.4174346923828125, "epoch": 0.22826086956521738, "grad_norm": 56.0776537005582, "learning_rate": 7.1428571428571436e-06, "loss": 2.7738, "mean_token_accuracy": 0.5390625160653144, "num_tokens": 15414280.0, "step": 21 }, { "entropy": 0.3958892822265625, "epoch": 0.2391304347826087, "grad_norm": 55.84684848536741, "learning_rate": 7.500000000000001e-06, "loss": 2.7211, "mean_token_accuracy": 0.5598958500195295, "num_tokens": 16146409.0, "step": 22 }, { "entropy": 0.394927978515625, "epoch": 0.25, "grad_norm": 56.70304819091595, "learning_rate": 7.857142857142858e-06, "loss": 2.6884, "mean_token_accuracy": 0.5651041835080832, "num_tokens": 16876424.0, "step": 23 }, { "entropy": 0.3933563232421875, "epoch": 0.2608695652173913, "grad_norm": 59.405424994443024, "learning_rate": 8.214285714285714e-06, "loss": 2.686, "mean_token_accuracy": 0.5572916832752526, "num_tokens": 17607756.0, "step": 24 }, { "entropy": 0.391387939453125, "epoch": 0.2717391304347826, "grad_norm": 57.418991339188196, "learning_rate": 8.571428571428571e-06, "loss": 2.6324, "mean_token_accuracy": 0.5312500158324838, "num_tokens": 18339588.0, "step": 25 }, { "entropy": 0.4117889404296875, "epoch": 0.2826086956521739, "grad_norm": 57.04690525036641, "learning_rate": 8.92857142857143e-06, "loss": 2.5908, "mean_token_accuracy": 0.5781250172294676, "num_tokens": 19052228.0, "step": 26 }, { "entropy": 0.413299560546875, "epoch": 0.29347826086956524, "grad_norm": 57.87482177137187, "learning_rate": 9.285714285714288e-06, "loss": 2.5745, "mean_token_accuracy": 0.5546875165309757, "num_tokens": 19761480.0, "step": 27 }, { "entropy": 0.3918609619140625, "epoch": 0.30434782608695654, "grad_norm": 57.6709336727282, "learning_rate": 9.642857142857144e-06, "loss": 2.5131, "mean_token_accuracy": 0.5729166837409139, "num_tokens": 20518198.0, "step": 28 }, { "entropy": 0.3912506103515625, "epoch": 0.31521739130434784, "grad_norm": 57.59250346538701, "learning_rate": 1e-05, "loss": 2.4797, "mean_token_accuracy": 0.5390625160653144, "num_tokens": 21233376.0, "step": 29 }, { "entropy": 0.3871612548828125, "epoch": 0.32608695652173914, "grad_norm": 57.36161657267134, "learning_rate": 9.999910138041584e-06, "loss": 2.4115, "mean_token_accuracy": 0.5885416842065752, "num_tokens": 21965608.0, "step": 30 }, { "entropy": 0.3805999755859375, "epoch": 0.33695652173913043, "grad_norm": 57.50995417467593, "learning_rate": 9.999640555396404e-06, "loss": 2.3874, "mean_token_accuracy": 0.5598958500195295, "num_tokens": 22688324.0, "step": 31 }, { "entropy": 0.3845367431640625, "epoch": 0.34782608695652173, "grad_norm": 58.39720778593916, "learning_rate": 9.99919126175455e-06, "loss": 2.3259, "mean_token_accuracy": 0.5885416832752526, "num_tokens": 23409211.0, "step": 32 }, { "entropy": 0.387359619140625, "epoch": 0.358695652173913, "grad_norm": 58.26716191906752, "learning_rate": 9.998562273265786e-06, "loss": 2.2932, "mean_token_accuracy": 0.8880208395421505, "num_tokens": 24134105.0, "step": 33 }, { "entropy": 0.4016571044921875, "epoch": 0.3695652173913043, "grad_norm": 58.24491582430228, "learning_rate": 9.997753612538963e-06, "loss": 2.2414, "mean_token_accuracy": 0.901041672565043, "num_tokens": 24858194.0, "step": 34 }, { "entropy": 0.3777618408203125, "epoch": 0.3804347826086957, "grad_norm": 58.651315915936905, "learning_rate": 9.996765308641218e-06, "loss": 2.1791, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 25608809.0, "step": 35 }, { "entropy": 0.390777587890625, "epoch": 0.391304347826087, "grad_norm": 59.35487193293602, "learning_rate": 9.995597397096923e-06, "loss": 2.1431, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 26351472.0, "step": 36 }, { "entropy": 0.410919189453125, "epoch": 0.40217391304347827, "grad_norm": 59.07734759750127, "learning_rate": 9.994249919886402e-06, "loss": 2.1013, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 27081828.0, "step": 37 }, { "entropy": 0.38934326171875, "epoch": 0.41304347826086957, "grad_norm": 59.15541447484555, "learning_rate": 9.992722925444434e-06, "loss": 2.0633, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 27798222.0, "step": 38 }, { "entropy": 0.3936767578125, "epoch": 0.42391304347826086, "grad_norm": 58.8800906283516, "learning_rate": 9.9910164686585e-06, "loss": 2.0148, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 28537568.0, "step": 39 }, { "entropy": 0.3974609375, "epoch": 0.43478260869565216, "grad_norm": 59.1178475676971, "learning_rate": 9.989130610866822e-06, "loss": 1.9566, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 29282405.0, "step": 40 }, { "entropy": 0.39190673828125, "epoch": 0.44565217391304346, "grad_norm": 58.455690329387025, "learning_rate": 9.98706541985615e-06, "loss": 1.8993, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 30041949.0, "step": 41 }, { "entropy": 0.38946533203125, "epoch": 0.45652173913043476, "grad_norm": 58.21255505443314, "learning_rate": 9.984820969859326e-06, "loss": 1.8697, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 30784326.0, "step": 42 }, { "entropy": 0.38653564453125, "epoch": 0.4673913043478261, "grad_norm": 61.152558796905076, "learning_rate": 9.98239734155262e-06, "loss": 1.8516, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 31494883.0, "step": 43 }, { "entropy": 0.3789520263671875, "epoch": 0.4782608695652174, "grad_norm": 64.01566395627057, "learning_rate": 9.979794622052825e-06, "loss": 1.8419, "mean_token_accuracy": 0.8671875079162419, "num_tokens": 32249196.0, "step": 44 }, { "entropy": 0.4061431884765625, "epoch": 0.4891304347826087, "grad_norm": 58.01942955061165, "learning_rate": 9.977012904914133e-06, "loss": 1.7189, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 32943506.0, "step": 45 }, { "entropy": 0.3825225830078125, "epoch": 0.5, "grad_norm": 57.61228110390323, "learning_rate": 9.97405229012476e-06, "loss": 1.6684, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 33693820.0, "step": 46 }, { "entropy": 0.395050048828125, "epoch": 0.5108695652173914, "grad_norm": 57.71747140390215, "learning_rate": 9.970912884103365e-06, "loss": 1.6254, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 34423797.0, "step": 47 }, { "entropy": 0.406280517578125, "epoch": 0.5217391304347826, "grad_norm": 57.7963582112258, "learning_rate": 9.967594799695218e-06, "loss": 1.5949, "mean_token_accuracy": 0.890625006519258, "num_tokens": 35134310.0, "step": 48 }, { "entropy": 0.4052734375, "epoch": 0.532608695652174, "grad_norm": 56.73762320496077, "learning_rate": 9.964098156168143e-06, "loss": 1.5258, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 35851216.0, "step": 49 }, { "entropy": 0.382781982421875, "epoch": 0.5434782608695652, "grad_norm": 56.57460935743424, "learning_rate": 9.960423079208235e-06, "loss": 1.461, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 36622748.0, "step": 50 }, { "entropy": 0.4014434814453125, "epoch": 0.5543478260869565, "grad_norm": 56.449396812129784, "learning_rate": 9.956569700915338e-06, "loss": 1.4112, "mean_token_accuracy": 0.901041672565043, "num_tokens": 37341849.0, "step": 51 }, { "entropy": 0.38623046875, "epoch": 0.5652173913043478, "grad_norm": 57.34475610674306, "learning_rate": 9.9525381597983e-06, "loss": 1.3685, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 38078370.0, "step": 52 }, { "entropy": 0.403656005859375, "epoch": 0.5760869565217391, "grad_norm": 56.628062051209035, "learning_rate": 9.948328600769996e-06, "loss": 1.3272, "mean_token_accuracy": 0.901041672565043, "num_tokens": 38799653.0, "step": 53 }, { "entropy": 0.3992919921875, "epoch": 0.5869565217391305, "grad_norm": 56.43643883036958, "learning_rate": 9.943941175142109e-06, "loss": 1.2695, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 39508420.0, "step": 54 }, { "entropy": 0.3996429443359375, "epoch": 0.5978260869565217, "grad_norm": 56.337771571623705, "learning_rate": 9.939376040619707e-06, "loss": 1.2108, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 40235911.0, "step": 55 }, { "entropy": 0.3965911865234375, "epoch": 0.6086956521739131, "grad_norm": 56.128155563321904, "learning_rate": 9.934633361295558e-06, "loss": 1.1563, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 40975042.0, "step": 56 }, { "entropy": 0.400970458984375, "epoch": 0.6195652173913043, "grad_norm": 55.26315542185204, "learning_rate": 9.929713307644245e-06, "loss": 1.1246, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 41696396.0, "step": 57 }, { "entropy": 0.39263916015625, "epoch": 0.6304347826086957, "grad_norm": 54.86254923259469, "learning_rate": 9.924616056516027e-06, "loss": 1.076, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 42440574.0, "step": 58 }, { "entropy": 0.3855133056640625, "epoch": 0.6413043478260869, "grad_norm": 54.37535851726419, "learning_rate": 9.919341791130496e-06, "loss": 0.9996, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 43201491.0, "step": 59 }, { "entropy": 0.3922271728515625, "epoch": 0.6521739130434783, "grad_norm": 53.843542444920786, "learning_rate": 9.91389070106998e-06, "loss": 0.939, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 43941990.0, "step": 60 }, { "entropy": 0.392181396484375, "epoch": 0.6630434782608695, "grad_norm": 53.193793655115314, "learning_rate": 9.908262982272724e-06, "loss": 0.9157, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 44683606.0, "step": 61 }, { "entropy": 0.4019012451171875, "epoch": 0.6739130434782609, "grad_norm": 52.50158074780275, "learning_rate": 9.902458837025865e-06, "loss": 0.9166, "mean_token_accuracy": 0.8750000074505806, "num_tokens": 45432434.0, "step": 62 }, { "entropy": 0.3917388916015625, "epoch": 0.6847826086956522, "grad_norm": 51.20620506081717, "learning_rate": 9.896478473958147e-06, "loss": 0.8248, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 46172808.0, "step": 63 }, { "entropy": 0.42156982421875, "epoch": 0.6956521739130435, "grad_norm": 49.71377838953342, "learning_rate": 9.890322108032423e-06, "loss": 0.7796, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 46887344.0, "step": 64 }, { "entropy": 0.412841796875, "epoch": 0.7065217391304348, "grad_norm": 48.79946579114015, "learning_rate": 9.883989960537934e-06, "loss": 0.7169, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 47598804.0, "step": 65 }, { "entropy": 0.4025421142578125, "epoch": 0.717391304347826, "grad_norm": 47.47246229291085, "learning_rate": 9.87748225908235e-06, "loss": 0.6599, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 48328626.0, "step": 66 }, { "entropy": 0.3900604248046875, "epoch": 0.7282608695652174, "grad_norm": 45.6630505460685, "learning_rate": 9.870799237583586e-06, "loss": 0.6299, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 49050818.0, "step": 67 }, { "entropy": 0.3900146484375, "epoch": 0.7391304347826086, "grad_norm": 43.671137236728555, "learning_rate": 9.863941136261409e-06, "loss": 0.5987, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 49821603.0, "step": 68 }, { "entropy": 0.3853912353515625, "epoch": 0.75, "grad_norm": 41.75463992853865, "learning_rate": 9.85690820162878e-06, "loss": 0.5554, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 50583851.0, "step": 69 }, { "entropy": 0.39080810546875, "epoch": 0.7608695652173914, "grad_norm": 39.63626833273986, "learning_rate": 9.849700686483016e-06, "loss": 0.5429, "mean_token_accuracy": 0.901041672565043, "num_tokens": 51340765.0, "step": 70 }, { "entropy": 0.3896942138671875, "epoch": 0.7717391304347826, "grad_norm": 37.2850766781592, "learning_rate": 9.842318849896679e-06, "loss": 0.5126, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 52062797.0, "step": 71 }, { "entropy": 0.4081268310546875, "epoch": 0.782608695652174, "grad_norm": 34.8524617938976, "learning_rate": 9.834762957208293e-06, "loss": 0.4817, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 52763819.0, "step": 72 }, { "entropy": 0.4026336669921875, "epoch": 0.7934782608695652, "grad_norm": 32.73006841336539, "learning_rate": 9.827033280012783e-06, "loss": 0.4532, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 53476825.0, "step": 73 }, { "entropy": 0.38232421875, "epoch": 0.8043478260869565, "grad_norm": 30.598447911376223, "learning_rate": 9.819130096151718e-06, "loss": 0.4288, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 54230469.0, "step": 74 }, { "entropy": 0.3890380859375, "epoch": 0.8152173913043478, "grad_norm": 28.510481522648345, "learning_rate": 9.811053689703333e-06, "loss": 0.3822, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 54968247.0, "step": 75 }, { "entropy": 0.3891143798828125, "epoch": 0.8260869565217391, "grad_norm": 26.69143261415315, "learning_rate": 9.802804350972308e-06, "loss": 0.3698, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 55729266.0, "step": 76 }, { "entropy": 0.4080963134765625, "epoch": 0.8369565217391305, "grad_norm": 24.853889219833917, "learning_rate": 9.794382376479334e-06, "loss": 0.3282, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 56439606.0, "step": 77 }, { "entropy": 0.39483642578125, "epoch": 0.8478260869565217, "grad_norm": 23.879103930139745, "learning_rate": 9.785788068950463e-06, "loss": 0.3492, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 57174042.0, "step": 78 }, { "entropy": 0.393463134765625, "epoch": 0.8586956521739131, "grad_norm": 29.502826535200015, "learning_rate": 9.777021737306214e-06, "loss": 0.3805, "mean_token_accuracy": 0.890625006519258, "num_tokens": 57922090.0, "step": 79 }, { "entropy": 0.39227294921875, "epoch": 0.8695652173913043, "grad_norm": 25.186278274805534, "learning_rate": 9.768083696650481e-06, "loss": 0.3131, "mean_token_accuracy": 0.8828125069849193, "num_tokens": 58657068.0, "step": 80 }, { "entropy": 0.388702392578125, "epoch": 0.8804347826086957, "grad_norm": 20.39179919664989, "learning_rate": 9.7589742682592e-06, "loss": 0.3105, "mean_token_accuracy": 0.8541666753590107, "num_tokens": 59371848.0, "step": 81 }, { "entropy": 0.39642333984375, "epoch": 0.8913043478260869, "grad_norm": 16.48573667654869, "learning_rate": 9.749693779568799e-06, "loss": 0.2881, "mean_token_accuracy": 0.901041672565043, "num_tokens": 60138700.0, "step": 82 }, { "entropy": 0.418212890625, "epoch": 0.9021739130434783, "grad_norm": 16.736139521937194, "learning_rate": 9.740242564164433e-06, "loss": 0.2842, "mean_token_accuracy": 0.8880208400078118, "num_tokens": 60857518.0, "step": 83 }, { "entropy": 0.406829833984375, "epoch": 0.9130434782608695, "grad_norm": 11.573091220404486, "learning_rate": 9.730620961767996e-06, "loss": 0.2459, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 61574756.0, "step": 84 }, { "entropy": 0.3925933837890625, "epoch": 0.9239130434782609, "grad_norm": 15.167601713145734, "learning_rate": 9.720829318225897e-06, "loss": 0.2541, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 62301917.0, "step": 85 }, { "entropy": 0.4003753662109375, "epoch": 0.9347826086956522, "grad_norm": 10.3890102579825, "learning_rate": 9.710867985496644e-06, "loss": 0.2287, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 63032620.0, "step": 86 }, { "entropy": 0.390167236328125, "epoch": 0.9456521739130435, "grad_norm": 8.81818629332175, "learning_rate": 9.700737321638185e-06, "loss": 0.254, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 63766906.0, "step": 87 }, { "entropy": 0.4013671875, "epoch": 0.9565217391304348, "grad_norm": 9.910698551626139, "learning_rate": 9.690437690795038e-06, "loss": 0.2311, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 64494352.0, "step": 88 }, { "entropy": 0.395233154296875, "epoch": 0.967391304347826, "grad_norm": 6.574734122970261, "learning_rate": 9.6799694631852e-06, "loss": 0.2114, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 65236044.0, "step": 89 }, { "entropy": 0.40948486328125, "epoch": 0.9782608695652174, "grad_norm": 6.510590706786288, "learning_rate": 9.669333015086847e-06, "loss": 0.2034, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 65983515.0, "step": 90 }, { "entropy": 0.3949737548828125, "epoch": 0.9891304347826086, "grad_norm": 5.9428204712433415, "learning_rate": 9.658528728824799e-06, "loss": 0.2004, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 66708318.0, "step": 91 }, { "entropy": 0.395599365234375, "epoch": 1.0, "grad_norm": 7.805501519637745, "learning_rate": 9.647556992756789e-06, "loss": 0.2292, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 67446881.0, "step": 92 }, { "entropy": 0.3989410400390625, "epoch": 1.0108695652173914, "grad_norm": 6.762899007449279, "learning_rate": 9.63641820125949e-06, "loss": 0.1931, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 68189298.0, "step": 93 }, { "entropy": 0.383544921875, "epoch": 1.0217391304347827, "grad_norm": 26.558510253726908, "learning_rate": 9.62511275471435e-06, "loss": 0.3214, "mean_token_accuracy": 0.8854166734963655, "num_tokens": 68923012.0, "step": 94 }, { "entropy": 0.4062652587890625, "epoch": 1.0326086956521738, "grad_norm": 17.66752124642183, "learning_rate": 9.613641059493197e-06, "loss": 0.2273, "mean_token_accuracy": 0.8750000074505806, "num_tokens": 69634729.0, "step": 95 }, { "entropy": 0.42718505859375, "epoch": 1.0434782608695652, "grad_norm": 6.440308115084329, "learning_rate": 9.602003527943629e-06, "loss": 0.2182, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 70360145.0, "step": 96 }, { "entropy": 0.419647216796875, "epoch": 1.0543478260869565, "grad_norm": 3.313729334755876, "learning_rate": 9.590200578374198e-06, "loss": 0.1965, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 71094229.0, "step": 97 }, { "entropy": 0.4211273193359375, "epoch": 1.065217391304348, "grad_norm": 6.472092255757511, "learning_rate": 9.578232635039368e-06, "loss": 0.1891, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 71837508.0, "step": 98 }, { "entropy": 0.4221038818359375, "epoch": 1.0760869565217392, "grad_norm": 5.65079339360605, "learning_rate": 9.56610012812427e-06, "loss": 0.1976, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 72578472.0, "step": 99 }, { "entropy": 0.4143218994140625, "epoch": 1.0869565217391304, "grad_norm": 10.526588616167475, "learning_rate": 9.553803493729237e-06, "loss": 0.2276, "mean_token_accuracy": 0.901041672565043, "num_tokens": 73309250.0, "step": 100 }, { "entropy": 0.4214324951171875, "epoch": 1.0978260869565217, "grad_norm": 6.76038126286873, "learning_rate": 9.541343173854128e-06, "loss": 0.176, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 74039837.0, "step": 101 }, { "entropy": 0.426239013671875, "epoch": 1.108695652173913, "grad_norm": 2.550406593305114, "learning_rate": 9.528719616382443e-06, "loss": 0.194, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 74782288.0, "step": 102 }, { "entropy": 0.4285736083984375, "epoch": 1.1195652173913044, "grad_norm": 1.9861505083333468, "learning_rate": 9.515933275065218e-06, "loss": 0.1803, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 75514570.0, "step": 103 }, { "entropy": 0.4146728515625, "epoch": 1.1304347826086956, "grad_norm": 2.0937110928111204, "learning_rate": 9.502984609504724e-06, "loss": 0.1657, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 76263471.0, "step": 104 }, { "entropy": 0.409637451171875, "epoch": 1.141304347826087, "grad_norm": 4.0370749201612925, "learning_rate": 9.48987408513794e-06, "loss": 0.2, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 77032808.0, "step": 105 }, { "entropy": 0.43310546875, "epoch": 1.1521739130434783, "grad_norm": 6.171691874710987, "learning_rate": 9.476602173219822e-06, "loss": 0.1915, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 77755289.0, "step": 106 }, { "entropy": 0.421905517578125, "epoch": 1.1630434782608696, "grad_norm": 9.707071779843087, "learning_rate": 9.463169350806369e-06, "loss": 0.218, "mean_token_accuracy": 0.8854166734963655, "num_tokens": 78504728.0, "step": 107 }, { "entropy": 0.442291259765625, "epoch": 1.1739130434782608, "grad_norm": 4.492512568138839, "learning_rate": 9.449576100737474e-06, "loss": 0.1715, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 79234552.0, "step": 108 }, { "entropy": 0.42706298828125, "epoch": 1.184782608695652, "grad_norm": 7.98756060312019, "learning_rate": 9.435822911619564e-06, "loss": 0.2187, "mean_token_accuracy": 0.8645833414047956, "num_tokens": 79986236.0, "step": 109 }, { "entropy": 0.430419921875, "epoch": 1.1956521739130435, "grad_norm": 2.1129664499398926, "learning_rate": 9.421910277808044e-06, "loss": 0.1867, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 80716096.0, "step": 110 }, { "entropy": 0.4068450927734375, "epoch": 1.2065217391304348, "grad_norm": 16.963736777066014, "learning_rate": 9.407838699389525e-06, "loss": 0.3354, "mean_token_accuracy": 0.8723958409391344, "num_tokens": 81458849.0, "step": 111 }, { "entropy": 0.4076385498046875, "epoch": 1.2173913043478262, "grad_norm": 13.924252562663128, "learning_rate": 9.39360868216384e-06, "loss": 0.3034, "mean_token_accuracy": 0.8776041739620268, "num_tokens": 82211351.0, "step": 112 }, { "entropy": 0.4111328125, "epoch": 1.2282608695652173, "grad_norm": 1.7100595172279562, "learning_rate": 9.379220737625877e-06, "loss": 0.1878, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 82980102.0, "step": 113 }, { "entropy": 0.433319091796875, "epoch": 1.2391304347826086, "grad_norm": 5.846423629455087, "learning_rate": 9.364675382947185e-06, "loss": 0.2073, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 83711400.0, "step": 114 }, { "entropy": 0.445159912109375, "epoch": 1.25, "grad_norm": 6.451469321919068, "learning_rate": 9.349973140957392e-06, "loss": 0.2106, "mean_token_accuracy": 0.8854166734963655, "num_tokens": 84460019.0, "step": 115 }, { "entropy": 0.447662353515625, "epoch": 1.2608695652173914, "grad_norm": 2.7790448798410115, "learning_rate": 9.335114540125393e-06, "loss": 0.1996, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 85193928.0, "step": 116 }, { "entropy": 0.4636077880859375, "epoch": 1.2717391304347827, "grad_norm": 2.6482470281746333, "learning_rate": 9.320100114540382e-06, "loss": 0.1937, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 85934868.0, "step": 117 }, { "entropy": 0.4525604248046875, "epoch": 1.2826086956521738, "grad_norm": 3.315578614192242, "learning_rate": 9.304930403892633e-06, "loss": 0.1847, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 86646205.0, "step": 118 }, { "entropy": 0.4474029541015625, "epoch": 1.2934782608695652, "grad_norm": 3.26330878602713, "learning_rate": 9.289605953454108e-06, "loss": 0.1799, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 87395348.0, "step": 119 }, { "entropy": 0.451995849609375, "epoch": 1.3043478260869565, "grad_norm": 4.510835834017429, "learning_rate": 9.274127314058857e-06, "loss": 0.2158, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 88139881.0, "step": 120 }, { "entropy": 0.4410247802734375, "epoch": 1.315217391304348, "grad_norm": 3.053218547982469, "learning_rate": 9.258495042083222e-06, "loss": 0.1895, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 88852096.0, "step": 121 }, { "entropy": 0.434051513671875, "epoch": 1.3260869565217392, "grad_norm": 1.5033565251314938, "learning_rate": 9.242709699425833e-06, "loss": 0.1819, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 89626261.0, "step": 122 }, { "entropy": 0.4545745849609375, "epoch": 1.3369565217391304, "grad_norm": 3.0951856378919347, "learning_rate": 9.226771853487411e-06, "loss": 0.1665, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 90365741.0, "step": 123 }, { "entropy": 0.4754486083984375, "epoch": 1.3478260869565217, "grad_norm": 1.9859436895610123, "learning_rate": 9.210682077150375e-06, "loss": 0.1646, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 91057876.0, "step": 124 }, { "entropy": 0.487884521484375, "epoch": 1.358695652173913, "grad_norm": 8.27013486394484, "learning_rate": 9.19444094875825e-06, "loss": 0.1843, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 91761477.0, "step": 125 }, { "entropy": 0.4636993408203125, "epoch": 1.3695652173913042, "grad_norm": 2.2934540711997515, "learning_rate": 9.178049052094881e-06, "loss": 0.1618, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 92506148.0, "step": 126 }, { "entropy": 0.4479217529296875, "epoch": 1.3804347826086958, "grad_norm": 4.786209262389493, "learning_rate": 9.161506976363438e-06, "loss": 0.1983, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 93254912.0, "step": 127 }, { "entropy": 0.4602508544921875, "epoch": 1.391304347826087, "grad_norm": 5.7691216780542005, "learning_rate": 9.144815316165251e-06, "loss": 0.1935, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 93978264.0, "step": 128 }, { "entropy": 0.4593658447265625, "epoch": 1.4021739130434783, "grad_norm": 11.199636434276805, "learning_rate": 9.127974671478432e-06, "loss": 0.2107, "mean_token_accuracy": 0.8776041739620268, "num_tokens": 94710015.0, "step": 129 }, { "entropy": 0.446746826171875, "epoch": 1.4130434782608696, "grad_norm": 0.9923518223404385, "learning_rate": 9.110985647636303e-06, "loss": 0.1737, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 95446308.0, "step": 130 }, { "entropy": 0.476165771484375, "epoch": 1.4239130434782608, "grad_norm": 4.474677717637186, "learning_rate": 9.09384885530565e-06, "loss": 0.1851, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 96158637.0, "step": 131 }, { "entropy": 0.481658935546875, "epoch": 1.434782608695652, "grad_norm": 2.185059548670286, "learning_rate": 9.076564910464753e-06, "loss": 0.2109, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 96918874.0, "step": 132 }, { "entropy": 0.5084381103515625, "epoch": 1.4456521739130435, "grad_norm": 2.6152649554216567, "learning_rate": 9.059134434381274e-06, "loss": 0.1803, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 97644105.0, "step": 133 }, { "entropy": 0.5358123779296875, "epoch": 1.4565217391304348, "grad_norm": 3.5962048690843793, "learning_rate": 9.041558053589894e-06, "loss": 0.19, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 98355051.0, "step": 134 }, { "entropy": 0.5171661376953125, "epoch": 1.4673913043478262, "grad_norm": 1.2450335202012426, "learning_rate": 9.023836399869814e-06, "loss": 0.1829, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 99093771.0, "step": 135 }, { "entropy": 0.517242431640625, "epoch": 1.4782608695652173, "grad_norm": 2.8717902587276094, "learning_rate": 9.00597011022204e-06, "loss": 0.1513, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 99833460.0, "step": 136 }, { "entropy": 0.482940673828125, "epoch": 1.4891304347826086, "grad_norm": 1.364200004039475, "learning_rate": 8.987959826846479e-06, "loss": 0.2022, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 100568646.0, "step": 137 }, { "entropy": 0.4926300048828125, "epoch": 1.5, "grad_norm": 0.597811474604225, "learning_rate": 8.96980619711887e-06, "loss": 0.1535, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 101307254.0, "step": 138 }, { "entropy": 0.514373779296875, "epoch": 1.5108695652173914, "grad_norm": 0.7538518313390983, "learning_rate": 8.951509873567498e-06, "loss": 0.1703, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 102001272.0, "step": 139 }, { "entropy": 0.496429443359375, "epoch": 1.5217391304347827, "grad_norm": 6.358019373787709, "learning_rate": 8.93307151384975e-06, "loss": 0.2264, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 102695383.0, "step": 140 }, { "entropy": 0.46575927734375, "epoch": 1.5326086956521738, "grad_norm": 5.517487056748144, "learning_rate": 8.914491780728471e-06, "loss": 0.2207, "mean_token_accuracy": 0.8776041739620268, "num_tokens": 103429899.0, "step": 141 }, { "entropy": 0.4832611083984375, "epoch": 1.5434782608695652, "grad_norm": 2.4395442761334785, "learning_rate": 8.895771342048145e-06, "loss": 0.1596, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 104163455.0, "step": 142 }, { "entropy": 0.4860687255859375, "epoch": 1.5543478260869565, "grad_norm": 4.472158201631238, "learning_rate": 8.876910870710885e-06, "loss": 0.178, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 104930091.0, "step": 143 }, { "entropy": 0.5011749267578125, "epoch": 1.5652173913043477, "grad_norm": 1.8149497345123533, "learning_rate": 8.857911044652244e-06, "loss": 0.1807, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 105653123.0, "step": 144 }, { "entropy": 0.5025482177734375, "epoch": 1.5760869565217392, "grad_norm": 3.8636950718069123, "learning_rate": 8.838772546816857e-06, "loss": 0.1808, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 106368006.0, "step": 145 }, { "entropy": 0.5029754638671875, "epoch": 1.5869565217391304, "grad_norm": 5.002115277057554, "learning_rate": 8.819496065133879e-06, "loss": 0.1834, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 107097616.0, "step": 146 }, { "entropy": 0.521240234375, "epoch": 1.5978260869565217, "grad_norm": 1.015730869884965, "learning_rate": 8.800082292492274e-06, "loss": 0.1662, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 107812186.0, "step": 147 }, { "entropy": 0.4935760498046875, "epoch": 1.608695652173913, "grad_norm": 1.0635747380852985, "learning_rate": 8.780531926715888e-06, "loss": 0.1704, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 108547506.0, "step": 148 }, { "entropy": 0.534637451171875, "epoch": 1.6195652173913042, "grad_norm": 1.5449003097461054, "learning_rate": 8.760845670538387e-06, "loss": 0.1754, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 109266322.0, "step": 149 }, { "entropy": 0.5189208984375, "epoch": 1.6304347826086958, "grad_norm": 2.0268875856085344, "learning_rate": 8.741024231577983e-06, "loss": 0.1517, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 110008922.0, "step": 150 }, { "entropy": 0.5161285400390625, "epoch": 1.641304347826087, "grad_norm": 1.0672579311719492, "learning_rate": 8.721068322312007e-06, "loss": 0.1531, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 110727558.0, "step": 151 }, { "entropy": 0.521392822265625, "epoch": 1.6521739130434783, "grad_norm": 5.213780878437388, "learning_rate": 8.700978660051293e-06, "loss": 0.195, "mean_token_accuracy": 0.890625006519258, "num_tokens": 111471527.0, "step": 152 }, { "entropy": 0.5205535888671875, "epoch": 1.6630434782608696, "grad_norm": 1.9517069380322867, "learning_rate": 8.6807559669144e-06, "loss": 0.1596, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 112239963.0, "step": 153 }, { "entropy": 0.561065673828125, "epoch": 1.6739130434782608, "grad_norm": 2.7705962729348363, "learning_rate": 8.660400969801653e-06, "loss": 0.1738, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 112976654.0, "step": 154 }, { "entropy": 0.5628204345703125, "epoch": 1.6847826086956523, "grad_norm": 4.898163032031099, "learning_rate": 8.63991440036901e-06, "loss": 0.1707, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 113718848.0, "step": 155 }, { "entropy": 0.5825958251953125, "epoch": 1.6956521739130435, "grad_norm": 2.833385371279203, "learning_rate": 8.619296995001773e-06, "loss": 0.1731, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 114442519.0, "step": 156 }, { "entropy": 0.57159423828125, "epoch": 1.7065217391304348, "grad_norm": 4.117296477591056, "learning_rate": 8.598549494788111e-06, "loss": 0.1806, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 115181598.0, "step": 157 }, { "entropy": 0.5811614990234375, "epoch": 1.7173913043478262, "grad_norm": 4.092040299967927, "learning_rate": 8.577672645492426e-06, "loss": 0.1682, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 115915785.0, "step": 158 }, { "entropy": 0.5871734619140625, "epoch": 1.7282608695652173, "grad_norm": 2.8908222719784127, "learning_rate": 8.556667197528543e-06, "loss": 0.1816, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 116621451.0, "step": 159 }, { "entropy": 0.5701141357421875, "epoch": 1.7391304347826086, "grad_norm": 3.6224602544096918, "learning_rate": 8.535533905932739e-06, "loss": 0.1737, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 117310119.0, "step": 160 }, { "entropy": 0.56756591796875, "epoch": 1.75, "grad_norm": 1.4631757964659389, "learning_rate": 8.5142735303366e-06, "loss": 0.1736, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 118041095.0, "step": 161 }, { "entropy": 0.5458984375, "epoch": 1.7608695652173914, "grad_norm": 2.3531502542663385, "learning_rate": 8.492886834939722e-06, "loss": 0.1568, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 118753230.0, "step": 162 }, { "entropy": 0.5509033203125, "epoch": 1.7717391304347827, "grad_norm": 5.023926546982622, "learning_rate": 8.47137458848224e-06, "loss": 0.1753, "mean_token_accuracy": 0.901041672565043, "num_tokens": 119489643.0, "step": 163 }, { "entropy": 0.53118896484375, "epoch": 1.7826086956521738, "grad_norm": 1.721920120015578, "learning_rate": 8.44973756421719e-06, "loss": 0.15, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 120231301.0, "step": 164 }, { "entropy": 0.52777099609375, "epoch": 1.7934782608695652, "grad_norm": 3.418076490433067, "learning_rate": 8.427976539882725e-06, "loss": 0.1613, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 120962416.0, "step": 165 }, { "entropy": 0.51080322265625, "epoch": 1.8043478260869565, "grad_norm": 3.4170913157737797, "learning_rate": 8.406092297674146e-06, "loss": 0.1552, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 121726443.0, "step": 166 }, { "entropy": 0.5138702392578125, "epoch": 1.8152173913043477, "grad_norm": 4.469668539839008, "learning_rate": 8.384085624215801e-06, "loss": 0.1692, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 122456880.0, "step": 167 }, { "entropy": 0.5077056884765625, "epoch": 1.8260869565217392, "grad_norm": 3.398224636055471, "learning_rate": 8.3619573105328e-06, "loss": 0.1976, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 123192926.0, "step": 168 }, { "entropy": 0.5028533935546875, "epoch": 1.8369565217391304, "grad_norm": 1.1058962935015557, "learning_rate": 8.339708152022586e-06, "loss": 0.1682, "mean_token_accuracy": 0.901041672565043, "num_tokens": 123947455.0, "step": 169 }, { "entropy": 0.5254058837890625, "epoch": 1.8478260869565217, "grad_norm": 5.167659625792978, "learning_rate": 8.317338948426338e-06, "loss": 0.1801, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 124663961.0, "step": 170 }, { "entropy": 0.5162200927734375, "epoch": 1.858695652173913, "grad_norm": 3.2546914206847983, "learning_rate": 8.294850503800237e-06, "loss": 0.1575, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 125403597.0, "step": 171 }, { "entropy": 0.536865234375, "epoch": 1.8695652173913042, "grad_norm": 2.121124553980237, "learning_rate": 8.272243626486553e-06, "loss": 0.1715, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 126132729.0, "step": 172 }, { "entropy": 0.511260986328125, "epoch": 1.8804347826086958, "grad_norm": 3.615045249735466, "learning_rate": 8.24951912908459e-06, "loss": 0.1607, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 126885495.0, "step": 173 }, { "entropy": 0.5345001220703125, "epoch": 1.891304347826087, "grad_norm": 2.959072727194171, "learning_rate": 8.22667782842149e-06, "loss": 0.1873, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 127611736.0, "step": 174 }, { "entropy": 0.544708251953125, "epoch": 1.9021739130434783, "grad_norm": 0.8408245157759806, "learning_rate": 8.203720545522852e-06, "loss": 0.1548, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 128331508.0, "step": 175 }, { "entropy": 0.542083740234375, "epoch": 1.9130434782608696, "grad_norm": 3.1936191732342483, "learning_rate": 8.18064810558324e-06, "loss": 0.1804, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 129056107.0, "step": 176 }, { "entropy": 0.52423095703125, "epoch": 1.9239130434782608, "grad_norm": 1.6394513978982945, "learning_rate": 8.157461337936506e-06, "loss": 0.15, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 129807571.0, "step": 177 }, { "entropy": 0.5415802001953125, "epoch": 1.9347826086956523, "grad_norm": 0.7311318246016283, "learning_rate": 8.134161076025992e-06, "loss": 0.1555, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 130558970.0, "step": 178 }, { "entropy": 0.5243377685546875, "epoch": 1.9456521739130435, "grad_norm": 8.45397210689231, "learning_rate": 8.110748157374566e-06, "loss": 0.1935, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 131313276.0, "step": 179 }, { "entropy": 0.5270843505859375, "epoch": 1.9565217391304348, "grad_norm": 4.680053604300379, "learning_rate": 8.087223423554513e-06, "loss": 0.1442, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 132053009.0, "step": 180 }, { "entropy": 0.5287017822265625, "epoch": 1.9673913043478262, "grad_norm": 1.4362435578406598, "learning_rate": 8.063587720157298e-06, "loss": 0.1577, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 132815611.0, "step": 181 }, { "entropy": 0.5521240234375, "epoch": 1.9782608695652173, "grad_norm": 4.592426202753763, "learning_rate": 8.039841896763157e-06, "loss": 0.1737, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 133527514.0, "step": 182 }, { "entropy": 0.562744140625, "epoch": 1.9891304347826086, "grad_norm": 8.804668623486542, "learning_rate": 8.01598680691057e-06, "loss": 0.1974, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 134233532.0, "step": 183 }, { "entropy": 0.52972412109375, "epoch": 2.0, "grad_norm": 5.442557880753617, "learning_rate": 7.99202330806557e-06, "loss": 0.1489, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 134970186.0, "step": 184 }, { "entropy": 0.5426025390625, "epoch": 2.010869565217391, "grad_norm": 3.8135433451565692, "learning_rate": 7.967952261590936e-06, "loss": 0.1616, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 135686807.0, "step": 185 }, { "entropy": 0.532379150390625, "epoch": 2.0217391304347827, "grad_norm": 5.581932015960598, "learning_rate": 7.943774532715215e-06, "loss": 0.1678, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 136422808.0, "step": 186 }, { "entropy": 0.515899658203125, "epoch": 2.032608695652174, "grad_norm": 6.432043736528405, "learning_rate": 7.919490990501636e-06, "loss": 0.1646, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 137151038.0, "step": 187 }, { "entropy": 0.51885986328125, "epoch": 2.0434782608695654, "grad_norm": 4.974490425422215, "learning_rate": 7.895102507816866e-06, "loss": 0.1638, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 137880949.0, "step": 188 }, { "entropy": 0.52191162109375, "epoch": 2.0543478260869565, "grad_norm": 3.2102438591558133, "learning_rate": 7.870609961299627e-06, "loss": 0.152, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 138598332.0, "step": 189 }, { "entropy": 0.517822265625, "epoch": 2.0652173913043477, "grad_norm": 1.129167074131451, "learning_rate": 7.8460142313292e-06, "loss": 0.1986, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 139306369.0, "step": 190 }, { "entropy": 0.4926605224609375, "epoch": 2.0760869565217392, "grad_norm": 5.511214416637161, "learning_rate": 7.821316201993768e-06, "loss": 0.191, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 140050111.0, "step": 191 }, { "entropy": 0.4912261962890625, "epoch": 2.0869565217391304, "grad_norm": 4.589563951441601, "learning_rate": 7.796516761058649e-06, "loss": 0.1709, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 140770436.0, "step": 192 }, { "entropy": 0.517242431640625, "epoch": 2.097826086956522, "grad_norm": 3.307436574573834, "learning_rate": 7.771616799934372e-06, "loss": 0.1747, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 141460692.0, "step": 193 }, { "entropy": 0.495513916015625, "epoch": 2.108695652173913, "grad_norm": 1.2213195030130277, "learning_rate": 7.746617213644646e-06, "loss": 0.1651, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 142200106.0, "step": 194 }, { "entropy": 0.530914306640625, "epoch": 2.119565217391304, "grad_norm": 2.175857735434003, "learning_rate": 7.721518900794186e-06, "loss": 0.1586, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 142904935.0, "step": 195 }, { "entropy": 0.492340087890625, "epoch": 2.130434782608696, "grad_norm": 1.3407787633418933, "learning_rate": 7.696322763536408e-06, "loss": 0.1657, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 143621023.0, "step": 196 }, { "entropy": 0.5035247802734375, "epoch": 2.141304347826087, "grad_norm": 2.742789829325077, "learning_rate": 7.67102970754101e-06, "loss": 0.1473, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 144348296.0, "step": 197 }, { "entropy": 0.4730987548828125, "epoch": 2.1521739130434785, "grad_norm": 2.911801084282523, "learning_rate": 7.645640641961407e-06, "loss": 0.1464, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 145094605.0, "step": 198 }, { "entropy": 0.4618377685546875, "epoch": 2.1630434782608696, "grad_norm": 3.6081222338142305, "learning_rate": 7.620156479402066e-06, "loss": 0.1526, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 145847120.0, "step": 199 }, { "entropy": 0.478057861328125, "epoch": 2.1739130434782608, "grad_norm": 1.0514452021998937, "learning_rate": 7.594578135885684e-06, "loss": 0.1418, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 146577419.0, "step": 200 }, { "entropy": 0.466522216796875, "epoch": 2.1847826086956523, "grad_norm": 1.7411693991777, "learning_rate": 7.568906530820281e-06, "loss": 0.1391, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 147309360.0, "step": 201 }, { "entropy": 0.466461181640625, "epoch": 2.1956521739130435, "grad_norm": 2.17620767082738, "learning_rate": 7.543142586966139e-06, "loss": 0.1429, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 148048145.0, "step": 202 }, { "entropy": 0.460052490234375, "epoch": 2.2065217391304346, "grad_norm": 1.1004916008068764, "learning_rate": 7.517287230402639e-06, "loss": 0.1792, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 148793377.0, "step": 203 }, { "entropy": 0.4695587158203125, "epoch": 2.217391304347826, "grad_norm": 3.228916035303672, "learning_rate": 7.491341390494971e-06, "loss": 0.2017, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 149529344.0, "step": 204 }, { "entropy": 0.495758056640625, "epoch": 2.2282608695652173, "grad_norm": 2.0917537496816916, "learning_rate": 7.465305999860728e-06, "loss": 0.1602, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 150239023.0, "step": 205 }, { "entropy": 0.5149688720703125, "epoch": 2.239130434782609, "grad_norm": 1.7658146369992271, "learning_rate": 7.439181994336389e-06, "loss": 0.1559, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 150959182.0, "step": 206 }, { "entropy": 0.491668701171875, "epoch": 2.25, "grad_norm": 1.842592163481376, "learning_rate": 7.412970312943672e-06, "loss": 0.1593, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 151696790.0, "step": 207 }, { "entropy": 0.5184173583984375, "epoch": 2.260869565217391, "grad_norm": 1.5342919929733787, "learning_rate": 7.386671897855786e-06, "loss": 0.146, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 152399305.0, "step": 208 }, { "entropy": 0.5020751953125, "epoch": 2.2717391304347827, "grad_norm": 1.285400661348482, "learning_rate": 7.360287694363566e-06, "loss": 0.1389, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 153144631.0, "step": 209 }, { "entropy": 0.4999237060546875, "epoch": 2.282608695652174, "grad_norm": 1.089626772132905, "learning_rate": 7.333818650841489e-06, "loss": 0.1728, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 153863039.0, "step": 210 }, { "entropy": 0.4629669189453125, "epoch": 2.2934782608695654, "grad_norm": 0.7958565648055749, "learning_rate": 7.3072657187135895e-06, "loss": 0.1365, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 154627008.0, "step": 211 }, { "entropy": 0.4714813232421875, "epoch": 2.3043478260869565, "grad_norm": 2.1294805846667435, "learning_rate": 7.280629852419263e-06, "loss": 0.1185, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 155336075.0, "step": 212 }, { "entropy": 0.4590911865234375, "epoch": 2.3152173913043477, "grad_norm": 2.2936903239742428, "learning_rate": 7.253912009378953e-06, "loss": 0.1651, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 156069482.0, "step": 213 }, { "entropy": 0.4477081298828125, "epoch": 2.3260869565217392, "grad_norm": 1.399883348847463, "learning_rate": 7.227113149959738e-06, "loss": 0.165, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 156810168.0, "step": 214 }, { "entropy": 0.4658660888671875, "epoch": 2.3369565217391304, "grad_norm": 1.1309340794568266, "learning_rate": 7.200234237440815e-06, "loss": 0.1495, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 157525831.0, "step": 215 }, { "entropy": 0.452239990234375, "epoch": 2.3478260869565215, "grad_norm": 3.438678019243807, "learning_rate": 7.173276237978872e-06, "loss": 0.1612, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 158256452.0, "step": 216 }, { "entropy": 0.4614105224609375, "epoch": 2.358695652173913, "grad_norm": 0.7820925016217779, "learning_rate": 7.146240120573358e-06, "loss": 0.1356, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 158989349.0, "step": 217 }, { "entropy": 0.4483489990234375, "epoch": 2.369565217391304, "grad_norm": 0.9729093397557323, "learning_rate": 7.1191268570316575e-06, "loss": 0.1493, "mean_token_accuracy": 0.945312503259629, "num_tokens": 159726218.0, "step": 218 }, { "entropy": 0.457611083984375, "epoch": 2.380434782608696, "grad_norm": 2.9206076228482334, "learning_rate": 7.091937421934158e-06, "loss": 0.1624, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 160463988.0, "step": 219 }, { "entropy": 0.4862823486328125, "epoch": 2.391304347826087, "grad_norm": 3.479020953108577, "learning_rate": 7.064672792599208e-06, "loss": 0.1396, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 161181564.0, "step": 220 }, { "entropy": 0.4723968505859375, "epoch": 2.4021739130434785, "grad_norm": 0.8546526492754443, "learning_rate": 7.037333949048005e-06, "loss": 0.1331, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 161913838.0, "step": 221 }, { "entropy": 0.4722442626953125, "epoch": 2.4130434782608696, "grad_norm": 0.8101386153737262, "learning_rate": 7.009921873969359e-06, "loss": 0.149, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 162640954.0, "step": 222 }, { "entropy": 0.48663330078125, "epoch": 2.4239130434782608, "grad_norm": 2.1383118886350183, "learning_rate": 6.9824375526843705e-06, "loss": 0.1755, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 163373073.0, "step": 223 }, { "entropy": 0.512115478515625, "epoch": 2.4347826086956523, "grad_norm": 0.8385687350546065, "learning_rate": 6.954881973111013e-06, "loss": 0.1554, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 164096548.0, "step": 224 }, { "entropy": 0.529052734375, "epoch": 2.4456521739130435, "grad_norm": 1.0514628608699, "learning_rate": 6.927256125728624e-06, "loss": 0.1373, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 164820809.0, "step": 225 }, { "entropy": 0.5160675048828125, "epoch": 2.4565217391304346, "grad_norm": 2.698419287754823, "learning_rate": 6.8995610035423044e-06, "loss": 0.153, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 165550845.0, "step": 226 }, { "entropy": 0.5378570556640625, "epoch": 2.467391304347826, "grad_norm": 0.7993047970571002, "learning_rate": 6.871797602047221e-06, "loss": 0.1372, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 166277643.0, "step": 227 }, { "entropy": 0.576873779296875, "epoch": 2.4782608695652173, "grad_norm": 1.5400335597456314, "learning_rate": 6.843966919192827e-06, "loss": 0.138, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 166987602.0, "step": 228 }, { "entropy": 0.544677734375, "epoch": 2.489130434782609, "grad_norm": 1.3501457655398248, "learning_rate": 6.816069955346986e-06, "loss": 0.1527, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 167701559.0, "step": 229 }, { "entropy": 0.5312347412109375, "epoch": 2.5, "grad_norm": 1.346411855493066, "learning_rate": 6.788107713260023e-06, "loss": 0.1398, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 168423290.0, "step": 230 }, { "entropy": 0.5373382568359375, "epoch": 2.5108695652173916, "grad_norm": 1.5812163663322734, "learning_rate": 6.760081198028671e-06, "loss": 0.1524, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 169153964.0, "step": 231 }, { "entropy": 0.5088958740234375, "epoch": 2.5217391304347827, "grad_norm": 1.7413265225657986, "learning_rate": 6.731991417059947e-06, "loss": 0.1376, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 169921938.0, "step": 232 }, { "entropy": 0.5161895751953125, "epoch": 2.532608695652174, "grad_norm": 1.3981089558799913, "learning_rate": 6.703839380034945e-06, "loss": 0.1301, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 170682692.0, "step": 233 }, { "entropy": 0.5208587646484375, "epoch": 2.5434782608695654, "grad_norm": 2.8149627673456936, "learning_rate": 6.675626098872536e-06, "loss": 0.1636, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 171432994.0, "step": 234 }, { "entropy": 0.51824951171875, "epoch": 2.5543478260869565, "grad_norm": 1.4684058065548915, "learning_rate": 6.647352587693001e-06, "loss": 0.1624, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 172141981.0, "step": 235 }, { "entropy": 0.4888153076171875, "epoch": 2.5652173913043477, "grad_norm": 1.4406536148696736, "learning_rate": 6.619019862781571e-06, "loss": 0.1396, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 172897864.0, "step": 236 }, { "entropy": 0.50592041015625, "epoch": 2.5760869565217392, "grad_norm": 2.7792231969842742, "learning_rate": 6.590628942551909e-06, "loss": 0.1665, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 173612172.0, "step": 237 }, { "entropy": 0.479766845703125, "epoch": 2.5869565217391304, "grad_norm": 0.8612062254275914, "learning_rate": 6.5621808475094904e-06, "loss": 0.1472, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 174336959.0, "step": 238 }, { "entropy": 0.463836669921875, "epoch": 2.5978260869565215, "grad_norm": 2.1652743091901083, "learning_rate": 6.533676600214929e-06, "loss": 0.1323, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 175095005.0, "step": 239 }, { "entropy": 0.4647979736328125, "epoch": 2.608695652173913, "grad_norm": 1.0538425000460028, "learning_rate": 6.505117225247218e-06, "loss": 0.1559, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 175821704.0, "step": 240 }, { "entropy": 0.471099853515625, "epoch": 2.619565217391304, "grad_norm": 0.9819078153379016, "learning_rate": 6.476503749166903e-06, "loss": 0.1457, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 176557432.0, "step": 241 }, { "entropy": 0.4735260009765625, "epoch": 2.630434782608696, "grad_norm": 1.778816402900594, "learning_rate": 6.447837200479187e-06, "loss": 0.1823, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 177267268.0, "step": 242 }, { "entropy": 0.476226806640625, "epoch": 2.641304347826087, "grad_norm": 0.7823399457915889, "learning_rate": 6.419118609596948e-06, "loss": 0.146, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 177988371.0, "step": 243 }, { "entropy": 0.4737396240234375, "epoch": 2.6521739130434785, "grad_norm": 2.300505507236317, "learning_rate": 6.390349008803717e-06, "loss": 0.1497, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 178721738.0, "step": 244 }, { "entropy": 0.4729461669921875, "epoch": 2.6630434782608696, "grad_norm": 0.9400854604905068, "learning_rate": 6.36152943221656e-06, "loss": 0.1878, "mean_token_accuracy": 0.901041672565043, "num_tokens": 179467726.0, "step": 245 }, { "entropy": 0.4758758544921875, "epoch": 2.6739130434782608, "grad_norm": 2.605291347050962, "learning_rate": 6.332660915748915e-06, "loss": 0.1686, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 180215932.0, "step": 246 }, { "entropy": 0.48907470703125, "epoch": 2.6847826086956523, "grad_norm": 2.962520848096924, "learning_rate": 6.303744497073352e-06, "loss": 0.1508, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 180961953.0, "step": 247 }, { "entropy": 0.493072509765625, "epoch": 2.6956521739130435, "grad_norm": 1.0342817962395638, "learning_rate": 6.274781215584277e-06, "loss": 0.1489, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 181653249.0, "step": 248 }, { "entropy": 0.5084991455078125, "epoch": 2.7065217391304346, "grad_norm": 0.6822142118394301, "learning_rate": 6.245772112360568e-06, "loss": 0.1564, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 182374091.0, "step": 249 }, { "entropy": 0.4775848388671875, "epoch": 2.717391304347826, "grad_norm": 2.62505064720756, "learning_rate": 6.216718230128156e-06, "loss": 0.1694, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 183112589.0, "step": 250 }, { "entropy": 0.4820556640625, "epoch": 2.7282608695652173, "grad_norm": 1.4778946605073953, "learning_rate": 6.187620613222544e-06, "loss": 0.1516, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 183857175.0, "step": 251 }, { "entropy": 0.4744110107421875, "epoch": 2.7391304347826084, "grad_norm": 1.6896217372430502, "learning_rate": 6.158480307551269e-06, "loss": 0.1715, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 184586476.0, "step": 252 }, { "entropy": 0.492889404296875, "epoch": 2.75, "grad_norm": 0.7824494663876816, "learning_rate": 6.129298360556304e-06, "loss": 0.1207, "mean_token_accuracy": 0.945312503259629, "num_tokens": 185333896.0, "step": 253 }, { "entropy": 0.48492431640625, "epoch": 2.7608695652173916, "grad_norm": 0.885817818754658, "learning_rate": 6.100075821176412e-06, "loss": 0.1531, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 186054040.0, "step": 254 }, { "entropy": 0.5035552978515625, "epoch": 2.7717391304347827, "grad_norm": 3.4109040646320348, "learning_rate": 6.070813739809443e-06, "loss": 0.1741, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 186783234.0, "step": 255 }, { "entropy": 0.52508544921875, "epoch": 2.782608695652174, "grad_norm": 3.8891039102099123, "learning_rate": 6.041513168274568e-06, "loss": 0.1891, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 187490178.0, "step": 256 }, { "entropy": 0.5135955810546875, "epoch": 2.7934782608695654, "grad_norm": 2.3436153874765773, "learning_rate": 6.012175159774488e-06, "loss": 0.1298, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 188244843.0, "step": 257 }, { "entropy": 0.5222015380859375, "epoch": 2.8043478260869565, "grad_norm": 3.255145078571251, "learning_rate": 5.982800768857561e-06, "loss": 0.1579, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 188985421.0, "step": 258 }, { "entropy": 0.5480804443359375, "epoch": 2.8152173913043477, "grad_norm": 2.7396951574762514, "learning_rate": 5.953391051379904e-06, "loss": 0.145, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 189704280.0, "step": 259 }, { "entropy": 0.55145263671875, "epoch": 2.8260869565217392, "grad_norm": 3.533652625022049, "learning_rate": 5.9239470644674425e-06, "loss": 0.1493, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 190430021.0, "step": 260 }, { "entropy": 0.5504913330078125, "epoch": 2.8369565217391304, "grad_norm": 2.3908884569654347, "learning_rate": 5.894469866477905e-06, "loss": 0.1492, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 191164968.0, "step": 261 }, { "entropy": 0.553436279296875, "epoch": 2.8478260869565215, "grad_norm": 1.8403956420887753, "learning_rate": 5.864960516962791e-06, "loss": 0.157, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 191897977.0, "step": 262 }, { "entropy": 0.5384979248046875, "epoch": 2.858695652173913, "grad_norm": 2.840652908639724, "learning_rate": 5.835420076629273e-06, "loss": 0.1421, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 192644903.0, "step": 263 }, { "entropy": 0.5287322998046875, "epoch": 2.869565217391304, "grad_norm": 1.0019073525611903, "learning_rate": 5.805849607302081e-06, "loss": 0.1327, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 193387031.0, "step": 264 }, { "entropy": 0.5142059326171875, "epoch": 2.880434782608696, "grad_norm": 1.0267626138197101, "learning_rate": 5.776250171885329e-06, "loss": 0.1677, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 194117382.0, "step": 265 }, { "entropy": 0.5182952880859375, "epoch": 2.891304347826087, "grad_norm": 2.459122457455405, "learning_rate": 5.74662283432431e-06, "loss": 0.1402, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 194861082.0, "step": 266 }, { "entropy": 0.4894866943359375, "epoch": 2.9021739130434785, "grad_norm": 2.2801769816915796, "learning_rate": 5.716968659567256e-06, "loss": 0.1338, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 195621115.0, "step": 267 }, { "entropy": 0.4940338134765625, "epoch": 2.9130434782608696, "grad_norm": 0.76096101535856, "learning_rate": 5.687288713527051e-06, "loss": 0.1329, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 196360159.0, "step": 268 }, { "entropy": 0.4987640380859375, "epoch": 2.9239130434782608, "grad_norm": 1.1507883154034404, "learning_rate": 5.6575840630429295e-06, "loss": 0.161, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 197074701.0, "step": 269 }, { "entropy": 0.478546142578125, "epoch": 2.9347826086956523, "grad_norm": 4.258626023619959, "learning_rate": 5.627855775842116e-06, "loss": 0.1538, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 197809329.0, "step": 270 }, { "entropy": 0.5005645751953125, "epoch": 2.9456521739130435, "grad_norm": 3.415851630206049, "learning_rate": 5.598104920501455e-06, "loss": 0.1445, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 198531748.0, "step": 271 }, { "entropy": 0.483428955078125, "epoch": 2.9565217391304346, "grad_norm": 1.8451160224609997, "learning_rate": 5.568332566408995e-06, "loss": 0.1229, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 199273498.0, "step": 272 }, { "entropy": 0.4969635009765625, "epoch": 2.967391304347826, "grad_norm": 1.3096913208218943, "learning_rate": 5.538539783725556e-06, "loss": 0.124, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 200009778.0, "step": 273 }, { "entropy": 0.5011138916015625, "epoch": 2.9782608695652173, "grad_norm": 2.6192863885199102, "learning_rate": 5.508727643346257e-06, "loss": 0.1422, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 200752432.0, "step": 274 }, { "entropy": 0.5101165771484375, "epoch": 2.9891304347826084, "grad_norm": 1.9357570578321412, "learning_rate": 5.478897216862026e-06, "loss": 0.1396, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 201496295.0, "step": 275 }, { "entropy": 0.4974517822265625, "epoch": 3.0, "grad_norm": 2.658448706719779, "learning_rate": 5.4490495765210795e-06, "loss": 0.1312, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 202238152.0, "step": 276 }, { "entropy": 0.4904632568359375, "epoch": 3.010869565217391, "grad_norm": 0.7111278030242015, "learning_rate": 5.4191857951903825e-06, "loss": 0.1297, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 203018911.0, "step": 277 }, { "entropy": 0.4854278564453125, "epoch": 3.0217391304347827, "grad_norm": 3.685677125315361, "learning_rate": 5.389306946317089e-06, "loss": 0.1452, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 203769923.0, "step": 278 }, { "entropy": 0.5066680908203125, "epoch": 3.032608695652174, "grad_norm": 5.926720540282865, "learning_rate": 5.359414103889947e-06, "loss": 0.1639, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 204482884.0, "step": 279 }, { "entropy": 0.5011138916015625, "epoch": 3.0434782608695654, "grad_norm": 3.828517904681439, "learning_rate": 5.329508342400702e-06, "loss": 0.1307, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 205197104.0, "step": 280 }, { "entropy": 0.5212249755859375, "epoch": 3.0543478260869565, "grad_norm": 2.371018766229332, "learning_rate": 5.29959073680547e-06, "loss": 0.1074, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 205909556.0, "step": 281 }, { "entropy": 0.4830169677734375, "epoch": 3.0652173913043477, "grad_norm": 3.4453484681704536, "learning_rate": 5.2696623624861065e-06, "loss": 0.122, "mean_token_accuracy": 0.945312503259629, "num_tokens": 206648457.0, "step": 282 }, { "entropy": 0.4988861083984375, "epoch": 3.0760869565217392, "grad_norm": 4.521405572801564, "learning_rate": 5.239724295211541e-06, "loss": 0.1369, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 207369598.0, "step": 283 }, { "entropy": 0.4881439208984375, "epoch": 3.0869565217391304, "grad_norm": 5.674022612225397, "learning_rate": 5.209777611099117e-06, "loss": 0.1471, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 208096720.0, "step": 284 }, { "entropy": 0.470123291015625, "epoch": 3.097826086956522, "grad_norm": 4.813184131741794, "learning_rate": 5.179823386575908e-06, "loss": 0.1037, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 208844663.0, "step": 285 }, { "entropy": 0.4812469482421875, "epoch": 3.108695652173913, "grad_norm": 0.935293372415571, "learning_rate": 5.1498626983400215e-06, "loss": 0.1306, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 209580199.0, "step": 286 }, { "entropy": 0.487884521484375, "epoch": 3.119565217391304, "grad_norm": 1.853600268381173, "learning_rate": 5.11989662332191e-06, "loss": 0.1191, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 210310644.0, "step": 287 }, { "entropy": 0.478851318359375, "epoch": 3.130434782608696, "grad_norm": 2.6861430284164887, "learning_rate": 5.089926238645645e-06, "loss": 0.1241, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 211009831.0, "step": 288 }, { "entropy": 0.4602813720703125, "epoch": 3.141304347826087, "grad_norm": 0.9379319523834283, "learning_rate": 5.059952621590216e-06, "loss": 0.1145, "mean_token_accuracy": 0.955729169305414, "num_tokens": 211728283.0, "step": 289 }, { "entropy": 0.4534759521484375, "epoch": 3.1521739130434785, "grad_norm": 66.38606728912778, "learning_rate": 5.029976849550789e-06, "loss": 0.2548, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 212461065.0, "step": 290 }, { "entropy": 0.4560089111328125, "epoch": 3.1630434782608696, "grad_norm": 16.872257168957795, "learning_rate": 5e-06, "loss": 0.1922, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 213186327.0, "step": 291 }, { "entropy": 0.4451904296875, "epoch": 3.1739130434782608, "grad_norm": 1.5461776477660296, "learning_rate": 4.970023150449212e-06, "loss": 0.1435, "mean_token_accuracy": 0.945312503259629, "num_tokens": 213911814.0, "step": 292 }, { "entropy": 0.4291839599609375, "epoch": 3.1847826086956523, "grad_norm": 1.710426030936313, "learning_rate": 4.940047378409786e-06, "loss": 0.1205, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 214654874.0, "step": 293 }, { "entropy": 0.423095703125, "epoch": 3.1956521739130435, "grad_norm": 1.0827875337801205, "learning_rate": 4.910073761354354e-06, "loss": 0.1095, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 215389761.0, "step": 294 }, { "entropy": 0.4220123291015625, "epoch": 3.2065217391304346, "grad_norm": 5.1333586379045855, "learning_rate": 4.880103376678092e-06, "loss": 0.1316, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 216127217.0, "step": 295 }, { "entropy": 0.4204559326171875, "epoch": 3.217391304347826, "grad_norm": 1.9567851992549676, "learning_rate": 4.85013730165998e-06, "loss": 0.1523, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 216859096.0, "step": 296 }, { "entropy": 0.4506072998046875, "epoch": 3.2282608695652173, "grad_norm": 1.1177237092438557, "learning_rate": 4.820176613424095e-06, "loss": 0.1092, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 217571797.0, "step": 297 }, { "entropy": 0.4485321044921875, "epoch": 3.239130434782609, "grad_norm": 1.0790133746273967, "learning_rate": 4.790222388900884e-06, "loss": 0.1246, "mean_token_accuracy": 0.945312503259629, "num_tokens": 218283906.0, "step": 298 }, { "entropy": 0.434783935546875, "epoch": 3.25, "grad_norm": 1.023940043012299, "learning_rate": 4.76027570478846e-06, "loss": 0.1359, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 219029245.0, "step": 299 }, { "entropy": 0.424102783203125, "epoch": 3.260869565217391, "grad_norm": 1.4712175212615914, "learning_rate": 4.730337637513895e-06, "loss": 0.1248, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 219788983.0, "step": 300 }, { "entropy": 0.4358978271484375, "epoch": 3.2717391304347827, "grad_norm": 1.466288039664087, "learning_rate": 4.7004092631945315e-06, "loss": 0.1388, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 220519820.0, "step": 301 }, { "entropy": 0.465911865234375, "epoch": 3.282608695652174, "grad_norm": 1.3963153804305062, "learning_rate": 4.6704916575993005e-06, "loss": 0.118, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 221236275.0, "step": 302 }, { "entropy": 0.456451416015625, "epoch": 3.2934782608695654, "grad_norm": 1.2767429519721747, "learning_rate": 4.640585896110054e-06, "loss": 0.1136, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 221950844.0, "step": 303 }, { "entropy": 0.448486328125, "epoch": 3.3043478260869565, "grad_norm": 1.9283093557056667, "learning_rate": 4.610693053682912e-06, "loss": 0.1205, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 222698753.0, "step": 304 }, { "entropy": 0.460296630859375, "epoch": 3.3152173913043477, "grad_norm": 1.602506073665712, "learning_rate": 4.580814204809618e-06, "loss": 0.1194, "mean_token_accuracy": 0.945312503259629, "num_tokens": 223409864.0, "step": 305 }, { "entropy": 0.4370574951171875, "epoch": 3.3260869565217392, "grad_norm": 2.812987069937644, "learning_rate": 4.550950423478923e-06, "loss": 0.1207, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 224144482.0, "step": 306 }, { "entropy": 0.448516845703125, "epoch": 3.3369565217391304, "grad_norm": 2.7268302318992252, "learning_rate": 4.521102783137976e-06, "loss": 0.1197, "mean_token_accuracy": 0.945312503259629, "num_tokens": 224874129.0, "step": 307 }, { "entropy": 0.4384918212890625, "epoch": 3.3478260869565215, "grad_norm": 1.8209870494232845, "learning_rate": 4.491272356653744e-06, "loss": 0.1359, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 225607747.0, "step": 308 }, { "entropy": 0.4431610107421875, "epoch": 3.358695652173913, "grad_norm": 1.8893569417936162, "learning_rate": 4.4614602162744455e-06, "loss": 0.1397, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 226351883.0, "step": 309 }, { "entropy": 0.4349212646484375, "epoch": 3.369565217391304, "grad_norm": 2.7117060715938406, "learning_rate": 4.431667433591006e-06, "loss": 0.1505, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 227071902.0, "step": 310 }, { "entropy": 0.4327392578125, "epoch": 3.380434782608696, "grad_norm": 3.2489408268123703, "learning_rate": 4.401895079498547e-06, "loss": 0.1347, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 227818670.0, "step": 311 }, { "entropy": 0.4308319091796875, "epoch": 3.391304347826087, "grad_norm": 1.036775916385551, "learning_rate": 4.372144224157886e-06, "loss": 0.1275, "mean_token_accuracy": 0.945312503259629, "num_tokens": 228554871.0, "step": 312 }, { "entropy": 0.45361328125, "epoch": 3.4021739130434785, "grad_norm": 1.7332669275989192, "learning_rate": 4.342415936957073e-06, "loss": 0.0901, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 229248652.0, "step": 313 }, { "entropy": 0.4354248046875, "epoch": 3.4130434782608696, "grad_norm": 3.623367592607393, "learning_rate": 4.312711286472951e-06, "loss": 0.1053, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 230001926.0, "step": 314 }, { "entropy": 0.425079345703125, "epoch": 3.4239130434782608, "grad_norm": 1.241563582622002, "learning_rate": 4.2830313404327475e-06, "loss": 0.1394, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 230730268.0, "step": 315 }, { "entropy": 0.421630859375, "epoch": 3.4347826086956523, "grad_norm": 1.6961274867544411, "learning_rate": 4.253377165675691e-06, "loss": 0.1254, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 231470870.0, "step": 316 }, { "entropy": 0.449676513671875, "epoch": 3.4456521739130435, "grad_norm": 1.4182450270471312, "learning_rate": 4.223749828114672e-06, "loss": 0.1227, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 232205169.0, "step": 317 }, { "entropy": 0.42352294921875, "epoch": 3.4565217391304346, "grad_norm": 2.7039132599106854, "learning_rate": 4.19415039269792e-06, "loss": 0.1415, "mean_token_accuracy": 0.945312503259629, "num_tokens": 232956149.0, "step": 318 }, { "entropy": 0.4373626708984375, "epoch": 3.467391304347826, "grad_norm": 0.8829096146841485, "learning_rate": 4.1645799233707286e-06, "loss": 0.0968, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 233695610.0, "step": 319 }, { "entropy": 0.430999755859375, "epoch": 3.4782608695652173, "grad_norm": 2.6434624794405757, "learning_rate": 4.1350394830372106e-06, "loss": 0.1252, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 234423864.0, "step": 320 }, { "entropy": 0.4183349609375, "epoch": 3.489130434782609, "grad_norm": 1.4738342290133946, "learning_rate": 4.105530133522096e-06, "loss": 0.1179, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 235180826.0, "step": 321 }, { "entropy": 0.422515869140625, "epoch": 3.5, "grad_norm": 2.2621689843040866, "learning_rate": 4.076052935532559e-06, "loss": 0.1521, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 235940658.0, "step": 322 }, { "entropy": 0.422821044921875, "epoch": 3.5108695652173916, "grad_norm": 1.5419057612886151, "learning_rate": 4.046608948620098e-06, "loss": 0.1066, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 236655968.0, "step": 323 }, { "entropy": 0.4370269775390625, "epoch": 3.5217391304347827, "grad_norm": 1.9425609756123423, "learning_rate": 4.017199231142441e-06, "loss": 0.1161, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 237371740.0, "step": 324 }, { "entropy": 0.4402618408203125, "epoch": 3.532608695652174, "grad_norm": 2.2856146900763683, "learning_rate": 3.987824840225512e-06, "loss": 0.1788, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 238102988.0, "step": 325 }, { "entropy": 0.4551849365234375, "epoch": 3.5434782608695654, "grad_norm": 2.8367564342261473, "learning_rate": 3.9584868317254325e-06, "loss": 0.1046, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 238832212.0, "step": 326 }, { "entropy": 0.442779541015625, "epoch": 3.5543478260869565, "grad_norm": 1.041413390162045, "learning_rate": 3.92918626019056e-06, "loss": 0.0965, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 239560698.0, "step": 327 }, { "entropy": 0.441986083984375, "epoch": 3.5652173913043477, "grad_norm": 3.79920854717062, "learning_rate": 3.8999241788235896e-06, "loss": 0.1541, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 240284379.0, "step": 328 }, { "entropy": 0.430267333984375, "epoch": 3.5760869565217392, "grad_norm": 1.5276414681228787, "learning_rate": 3.8707016394436985e-06, "loss": 0.1275, "mean_token_accuracy": 0.945312503259629, "num_tokens": 241043511.0, "step": 329 }, { "entropy": 0.4509429931640625, "epoch": 3.5869565217391304, "grad_norm": 1.2736529719713083, "learning_rate": 3.841519692448732e-06, "loss": 0.1286, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 241759435.0, "step": 330 }, { "entropy": 0.45465087890625, "epoch": 3.5978260869565215, "grad_norm": 1.9141550160209704, "learning_rate": 3.8123793867774573e-06, "loss": 0.0899, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 242491369.0, "step": 331 }, { "entropy": 0.5021209716796875, "epoch": 3.608695652173913, "grad_norm": 1.232332216642473, "learning_rate": 3.7832817698718456e-06, "loss": 0.1205, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 243199357.0, "step": 332 }, { "entropy": 0.4455413818359375, "epoch": 3.619565217391304, "grad_norm": 2.1444573158673723, "learning_rate": 3.754227887639434e-06, "loss": 0.1158, "mean_token_accuracy": 0.945312503259629, "num_tokens": 243912435.0, "step": 333 }, { "entropy": 0.4324493408203125, "epoch": 3.630434782608696, "grad_norm": 1.617461677829012, "learning_rate": 3.725218784415723e-06, "loss": 0.1092, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 244654934.0, "step": 334 }, { "entropy": 0.444549560546875, "epoch": 3.641304347826087, "grad_norm": 1.377702218043193, "learning_rate": 3.6962555029266488e-06, "loss": 0.1241, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 245389036.0, "step": 335 }, { "entropy": 0.434906005859375, "epoch": 3.6521739130434785, "grad_norm": 1.727264760442294, "learning_rate": 3.667339084251087e-06, "loss": 0.1071, "mean_token_accuracy": 0.955729169305414, "num_tokens": 246134106.0, "step": 336 }, { "entropy": 0.43316650390625, "epoch": 3.6630434782608696, "grad_norm": 1.3954090035983964, "learning_rate": 3.638470567783442e-06, "loss": 0.1179, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 246849956.0, "step": 337 }, { "entropy": 0.453948974609375, "epoch": 3.6739130434782608, "grad_norm": 1.6447060709500603, "learning_rate": 3.609650991196285e-06, "loss": 0.0936, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 247577548.0, "step": 338 }, { "entropy": 0.4385833740234375, "epoch": 3.6847826086956523, "grad_norm": 1.887620262109647, "learning_rate": 3.5808813904030517e-06, "loss": 0.0854, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 248301732.0, "step": 339 }, { "entropy": 0.43109130859375, "epoch": 3.6956521739130435, "grad_norm": 1.5374227030971785, "learning_rate": 3.5521627995208146e-06, "loss": 0.088, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 249061110.0, "step": 340 }, { "entropy": 0.442901611328125, "epoch": 3.7065217391304346, "grad_norm": 3.639926746576837, "learning_rate": 3.523496250833098e-06, "loss": 0.0975, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 249771909.0, "step": 341 }, { "entropy": 0.437469482421875, "epoch": 3.717391304347826, "grad_norm": 2.4252733850331043, "learning_rate": 3.4948827747527846e-06, "loss": 0.1198, "mean_token_accuracy": 0.955729169305414, "num_tokens": 250508824.0, "step": 342 }, { "entropy": 0.43988037109375, "epoch": 3.7282608695652173, "grad_norm": 2.470431180963349, "learning_rate": 3.466323399785072e-06, "loss": 0.1032, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 251258370.0, "step": 343 }, { "entropy": 0.43865966796875, "epoch": 3.7391304347826084, "grad_norm": 3.615490506805586, "learning_rate": 3.4378191524905104e-06, "loss": 0.1176, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 251982193.0, "step": 344 }, { "entropy": 0.434356689453125, "epoch": 3.75, "grad_norm": 4.953188731098825, "learning_rate": 3.4093710574480926e-06, "loss": 0.0944, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 252725373.0, "step": 345 }, { "entropy": 0.441253662109375, "epoch": 3.7608695652173916, "grad_norm": 2.903524695818942, "learning_rate": 3.3809801372184305e-06, "loss": 0.1043, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 253444253.0, "step": 346 }, { "entropy": 0.440887451171875, "epoch": 3.7717391304347827, "grad_norm": 1.8484878167112617, "learning_rate": 3.352647412307002e-06, "loss": 0.0971, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 254196231.0, "step": 347 }, { "entropy": 0.4503631591796875, "epoch": 3.782608695652174, "grad_norm": 2.3324615390252603, "learning_rate": 3.3243739011274645e-06, "loss": 0.0915, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 254897104.0, "step": 348 }, { "entropy": 0.42547607421875, "epoch": 3.7934782608695654, "grad_norm": 1.8126624217835543, "learning_rate": 3.296160619965056e-06, "loss": 0.1015, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 255625847.0, "step": 349 }, { "entropy": 0.4334869384765625, "epoch": 3.8043478260869565, "grad_norm": 3.1451739862900654, "learning_rate": 3.2680085829400553e-06, "loss": 0.1178, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 256357804.0, "step": 350 }, { "entropy": 0.44500732421875, "epoch": 3.8152173913043477, "grad_norm": 1.366277116524626, "learning_rate": 3.2399188019713325e-06, "loss": 0.0686, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 257106678.0, "step": 351 }, { "entropy": 0.4403076171875, "epoch": 3.8260869565217392, "grad_norm": 1.8547202991721121, "learning_rate": 3.2118922867399776e-06, "loss": 0.0654, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 257830430.0, "step": 352 }, { "entropy": 0.4456939697265625, "epoch": 3.8369565217391304, "grad_norm": 2.5528798400662334, "learning_rate": 3.183930044653014e-06, "loss": 0.083, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 258564915.0, "step": 353 }, { "entropy": 0.4347991943359375, "epoch": 3.8478260869565215, "grad_norm": 2.7742470177999454, "learning_rate": 3.156033080807175e-06, "loss": 0.0773, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 259323546.0, "step": 354 }, { "entropy": 0.435638427734375, "epoch": 3.858695652173913, "grad_norm": 2.78613102328482, "learning_rate": 3.128202397952781e-06, "loss": 0.1138, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 260059087.0, "step": 355 }, { "entropy": 0.448394775390625, "epoch": 3.869565217391304, "grad_norm": 3.749640348606413, "learning_rate": 3.1004389964576976e-06, "loss": 0.1093, "mean_token_accuracy": 0.955729169305414, "num_tokens": 260766551.0, "step": 356 }, { "entropy": 0.4436798095703125, "epoch": 3.880434782608696, "grad_norm": 4.7477247915918195, "learning_rate": 3.0727438742713766e-06, "loss": 0.1368, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 261503811.0, "step": 357 }, { "entropy": 0.4672088623046875, "epoch": 3.891304347826087, "grad_norm": 3.833134489829662, "learning_rate": 3.045118026888988e-06, "loss": 0.0968, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 262230419.0, "step": 358 }, { "entropy": 0.464874267578125, "epoch": 3.9021739130434785, "grad_norm": 3.0516617638198302, "learning_rate": 3.0175624473156315e-06, "loss": 0.0861, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 262970363.0, "step": 359 }, { "entropy": 0.4435577392578125, "epoch": 3.9130434782608696, "grad_norm": 3.7678168274529997, "learning_rate": 2.9900781260306427e-06, "loss": 0.0866, "mean_token_accuracy": 0.955729169305414, "num_tokens": 263725373.0, "step": 360 }, { "entropy": 0.450714111328125, "epoch": 3.9239130434782608, "grad_norm": 2.043005160780546, "learning_rate": 2.962666050951997e-06, "loss": 0.0858, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 264460162.0, "step": 361 }, { "entropy": 0.4597625732421875, "epoch": 3.9347826086956523, "grad_norm": 2.2439286805770218, "learning_rate": 2.9353272074007933e-06, "loss": 0.0845, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 265193348.0, "step": 362 }, { "entropy": 0.4772186279296875, "epoch": 3.9456521739130435, "grad_norm": 3.008568678345949, "learning_rate": 2.9080625780658455e-06, "loss": 0.1165, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 265886823.0, "step": 363 }, { "entropy": 0.459228515625, "epoch": 3.9565217391304346, "grad_norm": 2.159391990479657, "learning_rate": 2.8808731429683433e-06, "loss": 0.0739, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 266623713.0, "step": 364 }, { "entropy": 0.466949462890625, "epoch": 3.967391304347826, "grad_norm": 2.8677224790187372, "learning_rate": 2.853759879426644e-06, "loss": 0.1018, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 267356085.0, "step": 365 }, { "entropy": 0.46905517578125, "epoch": 3.9782608695652173, "grad_norm": 3.763284574242391, "learning_rate": 2.8267237620211296e-06, "loss": 0.1038, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 268076004.0, "step": 366 }, { "entropy": 0.4678497314453125, "epoch": 3.9891304347826084, "grad_norm": 3.601468098277866, "learning_rate": 2.7997657625591866e-06, "loss": 0.087, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 268830382.0, "step": 367 }, { "entropy": 0.468475341796875, "epoch": 4.0, "grad_norm": 2.611708119130719, "learning_rate": 2.772886850040264e-06, "loss": 0.0904, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 269574137.0, "step": 368 }, { "entropy": 0.4866485595703125, "epoch": 4.010869565217392, "grad_norm": 2.7300781208155427, "learning_rate": 2.7460879906210485e-06, "loss": 0.064, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 270306277.0, "step": 369 }, { "entropy": 0.4772186279296875, "epoch": 4.021739130434782, "grad_norm": 1.4961978907102276, "learning_rate": 2.7193701475807376e-06, "loss": 0.063, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 271027387.0, "step": 370 }, { "entropy": 0.4515380859375, "epoch": 4.032608695652174, "grad_norm": 1.7987674036485675, "learning_rate": 2.6927342812864117e-06, "loss": 0.0798, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 271778153.0, "step": 371 }, { "entropy": 0.45458984375, "epoch": 4.043478260869565, "grad_norm": 2.9488112617854667, "learning_rate": 2.6661813491585133e-06, "loss": 0.052, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 272503209.0, "step": 372 }, { "entropy": 0.4464874267578125, "epoch": 4.054347826086956, "grad_norm": 2.625401389327965, "learning_rate": 2.6397123056364364e-06, "loss": 0.0665, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 273237290.0, "step": 373 }, { "entropy": 0.4439697265625, "epoch": 4.065217391304348, "grad_norm": 1.4718508783115323, "learning_rate": 2.613328102144216e-06, "loss": 0.0606, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 273964978.0, "step": 374 }, { "entropy": 0.4301605224609375, "epoch": 4.076086956521739, "grad_norm": 3.2460386674373254, "learning_rate": 2.5870296870563287e-06, "loss": 0.0876, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 274693952.0, "step": 375 }, { "entropy": 0.418304443359375, "epoch": 4.086956521739131, "grad_norm": 2.35569042854768, "learning_rate": 2.5608180056636123e-06, "loss": 0.0904, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 275447596.0, "step": 376 }, { "entropy": 0.4223175048828125, "epoch": 4.0978260869565215, "grad_norm": 2.4698945876545335, "learning_rate": 2.534694000139273e-06, "loss": 0.0559, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 276183120.0, "step": 377 }, { "entropy": 0.4175567626953125, "epoch": 4.108695652173913, "grad_norm": 1.928510597054212, "learning_rate": 2.5086586095050314e-06, "loss": 0.0494, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 276914364.0, "step": 378 }, { "entropy": 0.428558349609375, "epoch": 4.119565217391305, "grad_norm": 3.457540026725172, "learning_rate": 2.482712769597363e-06, "loss": 0.039, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 277625330.0, "step": 379 }, { "entropy": 0.41448974609375, "epoch": 4.130434782608695, "grad_norm": 2.085040589673449, "learning_rate": 2.4568574130338624e-06, "loss": 0.046, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 278372481.0, "step": 380 }, { "entropy": 0.415679931640625, "epoch": 4.141304347826087, "grad_norm": 3.983539704513102, "learning_rate": 2.4310934691797207e-06, "loss": 0.0509, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 279098982.0, "step": 381 }, { "entropy": 0.4116668701171875, "epoch": 4.1521739130434785, "grad_norm": 1.9912291616847635, "learning_rate": 2.405421864114318e-06, "loss": 0.0526, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 279812107.0, "step": 382 }, { "entropy": 0.42218017578125, "epoch": 4.163043478260869, "grad_norm": 2.704559272932083, "learning_rate": 2.379843520597937e-06, "loss": 0.0681, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 280527610.0, "step": 383 }, { "entropy": 0.4107208251953125, "epoch": 4.173913043478261, "grad_norm": 3.5111658042208567, "learning_rate": 2.3543593580385925e-06, "loss": 0.0492, "mean_token_accuracy": 0.989583333954215, "num_tokens": 281253019.0, "step": 384 }, { "entropy": 0.4045867919921875, "epoch": 4.184782608695652, "grad_norm": 5.079290167429306, "learning_rate": 2.3289702924589914e-06, "loss": 0.0847, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 281998234.0, "step": 385 }, { "entropy": 0.4412689208984375, "epoch": 4.195652173913044, "grad_norm": 5.139561224962418, "learning_rate": 2.303677236463593e-06, "loss": 0.0898, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 282738530.0, "step": 386 }, { "entropy": 0.4209747314453125, "epoch": 4.206521739130435, "grad_norm": 4.810884443758544, "learning_rate": 2.2784810992058155e-06, "loss": 0.0537, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 283445582.0, "step": 387 }, { "entropy": 0.406951904296875, "epoch": 4.217391304347826, "grad_norm": 2.7111175696393426, "learning_rate": 2.2533827863553552e-06, "loss": 0.0662, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 284183621.0, "step": 388 }, { "entropy": 0.397796630859375, "epoch": 4.228260869565218, "grad_norm": 4.9399851145714075, "learning_rate": 2.2283832000656304e-06, "loss": 0.0619, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 284935356.0, "step": 389 }, { "entropy": 0.411773681640625, "epoch": 4.239130434782608, "grad_norm": 4.9335772334323025, "learning_rate": 2.2034832389413536e-06, "loss": 0.0565, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 285665897.0, "step": 390 }, { "entropy": 0.4003753662109375, "epoch": 4.25, "grad_norm": 2.312671320421061, "learning_rate": 2.178683798006234e-06, "loss": 0.0493, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 286411607.0, "step": 391 }, { "entropy": 0.4046630859375, "epoch": 4.260869565217392, "grad_norm": 2.319802222347104, "learning_rate": 2.153985768670803e-06, "loss": 0.0513, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 287131523.0, "step": 392 }, { "entropy": 0.4002532958984375, "epoch": 4.271739130434782, "grad_norm": 2.0631506419054917, "learning_rate": 2.1293900387003742e-06, "loss": 0.0504, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 287870432.0, "step": 393 }, { "entropy": 0.4129486083984375, "epoch": 4.282608695652174, "grad_norm": 1.6882906349576583, "learning_rate": 2.104897492183135e-06, "loss": 0.0339, "mean_token_accuracy": 0.989583333954215, "num_tokens": 288590314.0, "step": 394 }, { "entropy": 0.399658203125, "epoch": 4.293478260869565, "grad_norm": 3.6116780855643635, "learning_rate": 2.080509009498364e-06, "loss": 0.0766, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 289342128.0, "step": 395 }, { "entropy": 0.4263458251953125, "epoch": 4.304347826086957, "grad_norm": 1.9675619499566213, "learning_rate": 2.056225467284786e-06, "loss": 0.0489, "mean_token_accuracy": 0.989583333954215, "num_tokens": 290061861.0, "step": 396 }, { "entropy": 0.407196044921875, "epoch": 4.315217391304348, "grad_norm": 1.993285317102883, "learning_rate": 2.0320477384090665e-06, "loss": 0.0544, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 290800081.0, "step": 397 }, { "entropy": 0.4045867919921875, "epoch": 4.326086956521739, "grad_norm": 4.819721946991274, "learning_rate": 2.007976691934432e-06, "loss": 0.0495, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 291522494.0, "step": 398 }, { "entropy": 0.395233154296875, "epoch": 4.336956521739131, "grad_norm": 4.76315710555539, "learning_rate": 1.9840131930894334e-06, "loss": 0.0724, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 292257524.0, "step": 399 }, { "entropy": 0.408905029296875, "epoch": 4.3478260869565215, "grad_norm": 2.8754184768577162, "learning_rate": 1.9601581032368457e-06, "loss": 0.0358, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 293018088.0, "step": 400 }, { "entropy": 0.414581298828125, "epoch": 4.358695652173913, "grad_norm": 2.4081677258047987, "learning_rate": 1.936412279842705e-06, "loss": 0.0491, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 293750057.0, "step": 401 }, { "entropy": 0.402923583984375, "epoch": 4.369565217391305, "grad_norm": 3.0372455968061867, "learning_rate": 1.912776576445488e-06, "loss": 0.0473, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 294478098.0, "step": 402 }, { "entropy": 0.434967041015625, "epoch": 4.380434782608695, "grad_norm": 3.812490747787835, "learning_rate": 1.8892518426254363e-06, "loss": 0.0786, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 295180680.0, "step": 403 }, { "entropy": 0.412994384765625, "epoch": 4.391304347826087, "grad_norm": 3.585209828081671, "learning_rate": 1.8658389239740094e-06, "loss": 0.0546, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 295903805.0, "step": 404 }, { "entropy": 0.43218994140625, "epoch": 4.4021739130434785, "grad_norm": 2.812768680407068, "learning_rate": 1.8425386620634961e-06, "loss": 0.0612, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 296641989.0, "step": 405 }, { "entropy": 0.4144134521484375, "epoch": 4.413043478260869, "grad_norm": 2.821636952059335, "learning_rate": 1.8193518944167625e-06, "loss": 0.0411, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 297380486.0, "step": 406 }, { "entropy": 0.416015625, "epoch": 4.423913043478261, "grad_norm": 2.6578372050676173, "learning_rate": 1.7962794544771477e-06, "loss": 0.0331, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 298106103.0, "step": 407 }, { "entropy": 0.4125518798828125, "epoch": 4.434782608695652, "grad_norm": 4.680570450169205, "learning_rate": 1.773322171578512e-06, "loss": 0.0548, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 298864936.0, "step": 408 }, { "entropy": 0.43536376953125, "epoch": 4.445652173913043, "grad_norm": 3.200460191935974, "learning_rate": 1.7504808709154104e-06, "loss": 0.0725, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 299589040.0, "step": 409 }, { "entropy": 0.42657470703125, "epoch": 4.456521739130435, "grad_norm": 2.771274243214472, "learning_rate": 1.727756373513449e-06, "loss": 0.0776, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 300295523.0, "step": 410 }, { "entropy": 0.4076995849609375, "epoch": 4.467391304347826, "grad_norm": 2.548796401378428, "learning_rate": 1.7051494961997623e-06, "loss": 0.0582, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 301005765.0, "step": 411 }, { "entropy": 0.4084625244140625, "epoch": 4.478260869565218, "grad_norm": 1.9994386544286782, "learning_rate": 1.6826610515736618e-06, "loss": 0.0292, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 301748290.0, "step": 412 }, { "entropy": 0.4356689453125, "epoch": 4.489130434782608, "grad_norm": 2.2197819076897787, "learning_rate": 1.660291847977415e-06, "loss": 0.041, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 302451329.0, "step": 413 }, { "entropy": 0.41204833984375, "epoch": 4.5, "grad_norm": 2.455349815343268, "learning_rate": 1.6380426894672003e-06, "loss": 0.0457, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 303175092.0, "step": 414 }, { "entropy": 0.4294891357421875, "epoch": 4.510869565217392, "grad_norm": 2.710943705952911, "learning_rate": 1.6159143757842005e-06, "loss": 0.0454, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 303896428.0, "step": 415 }, { "entropy": 0.4051971435546875, "epoch": 4.521739130434782, "grad_norm": 3.0032821856789065, "learning_rate": 1.5939077023258547e-06, "loss": 0.0547, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 304634834.0, "step": 416 }, { "entropy": 0.445343017578125, "epoch": 4.532608695652174, "grad_norm": 2.2530273539388146, "learning_rate": 1.5720234601172767e-06, "loss": 0.0293, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 305325860.0, "step": 417 }, { "entropy": 0.4124908447265625, "epoch": 4.543478260869565, "grad_norm": 3.1330667538082126, "learning_rate": 1.5502624357828118e-06, "loss": 0.0802, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 306089245.0, "step": 418 }, { "entropy": 0.41497802734375, "epoch": 4.554347826086957, "grad_norm": 2.2467240345711645, "learning_rate": 1.5286254115177623e-06, "loss": 0.0373, "mean_token_accuracy": 0.989583333954215, "num_tokens": 306822344.0, "step": 419 }, { "entropy": 0.4322509765625, "epoch": 4.565217391304348, "grad_norm": 2.2227469183901736, "learning_rate": 1.5071131650602782e-06, "loss": 0.0372, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 307541285.0, "step": 420 }, { "entropy": 0.413787841796875, "epoch": 4.576086956521739, "grad_norm": 2.0709109351711836, "learning_rate": 1.485726469663401e-06, "loss": 0.0316, "mean_token_accuracy": 0.989583333954215, "num_tokens": 308256892.0, "step": 421 }, { "entropy": 0.422515869140625, "epoch": 4.586956521739131, "grad_norm": 1.5331678246800737, "learning_rate": 1.4644660940672628e-06, "loss": 0.0215, "mean_token_accuracy": 0.989583333954215, "num_tokens": 308999745.0, "step": 422 }, { "entropy": 0.4016265869140625, "epoch": 4.5978260869565215, "grad_norm": 2.5471266140170443, "learning_rate": 1.4433328024714583e-06, "loss": 0.069, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 309740799.0, "step": 423 }, { "entropy": 0.420562744140625, "epoch": 4.608695652173913, "grad_norm": 3.167188925399225, "learning_rate": 1.422327354507575e-06, "loss": 0.0406, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 310458494.0, "step": 424 }, { "entropy": 0.409942626953125, "epoch": 4.619565217391305, "grad_norm": 2.3943435455869926, "learning_rate": 1.4014505052118893e-06, "loss": 0.054, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 311194917.0, "step": 425 }, { "entropy": 0.4096221923828125, "epoch": 4.630434782608695, "grad_norm": 2.212368683866918, "learning_rate": 1.3807030049982284e-06, "loss": 0.0315, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 311938164.0, "step": 426 }, { "entropy": 0.39923095703125, "epoch": 4.641304347826087, "grad_norm": 2.5878663864858718, "learning_rate": 1.3600855996309937e-06, "loss": 0.0391, "mean_token_accuracy": 0.989583333954215, "num_tokens": 312684433.0, "step": 427 }, { "entropy": 0.41021728515625, "epoch": 4.6521739130434785, "grad_norm": 2.560421888081383, "learning_rate": 1.339599030198351e-06, "loss": 0.0572, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 313420352.0, "step": 428 }, { "entropy": 0.4083251953125, "epoch": 4.663043478260869, "grad_norm": 3.054237583231214, "learning_rate": 1.3192440330856005e-06, "loss": 0.048, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 314169602.0, "step": 429 }, { "entropy": 0.4042510986328125, "epoch": 4.673913043478261, "grad_norm": 1.916804219563132, "learning_rate": 1.2990213399487078e-06, "loss": 0.0457, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 314913664.0, "step": 430 }, { "entropy": 0.4028472900390625, "epoch": 4.684782608695652, "grad_norm": 2.0422782684611116, "learning_rate": 1.278931677687994e-06, "loss": 0.0424, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 315646144.0, "step": 431 }, { "entropy": 0.4032135009765625, "epoch": 4.695652173913043, "grad_norm": 1.4351345675904197, "learning_rate": 1.2589757684220182e-06, "loss": 0.023, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 316380571.0, "step": 432 }, { "entropy": 0.4063262939453125, "epoch": 4.706521739130435, "grad_norm": 1.7418995711205454, "learning_rate": 1.239154329461615e-06, "loss": 0.0203, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 317129506.0, "step": 433 }, { "entropy": 0.4080352783203125, "epoch": 4.717391304347826, "grad_norm": 3.0546055267777277, "learning_rate": 1.2194680732841125e-06, "loss": 0.0284, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 317878755.0, "step": 434 }, { "entropy": 0.4080810546875, "epoch": 4.728260869565218, "grad_norm": 1.2821499432415668, "learning_rate": 1.1999177075077278e-06, "loss": 0.0173, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 318614683.0, "step": 435 }, { "entropy": 0.4033203125, "epoch": 4.739130434782608, "grad_norm": 2.7222945012776996, "learning_rate": 1.1805039348661213e-06, "loss": 0.0246, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 319344907.0, "step": 436 }, { "entropy": 0.408966064453125, "epoch": 4.75, "grad_norm": 3.1591784602515274, "learning_rate": 1.1612274531831463e-06, "loss": 0.0525, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 320081214.0, "step": 437 }, { "entropy": 0.4129791259765625, "epoch": 4.760869565217392, "grad_norm": 2.484789091986584, "learning_rate": 1.1420889553477577e-06, "loss": 0.0689, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 320808939.0, "step": 438 }, { "entropy": 0.4048614501953125, "epoch": 4.771739130434782, "grad_norm": 2.419811107661141, "learning_rate": 1.1230891292891173e-06, "loss": 0.0379, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 321535295.0, "step": 439 }, { "entropy": 0.414337158203125, "epoch": 4.782608695652174, "grad_norm": 2.064500040614814, "learning_rate": 1.1042286579518556e-06, "loss": 0.0403, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 322261465.0, "step": 440 }, { "entropy": 0.4001312255859375, "epoch": 4.793478260869565, "grad_norm": 2.297195906490076, "learning_rate": 1.0855082192715294e-06, "loss": 0.0453, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 323010771.0, "step": 441 }, { "entropy": 0.4105072021484375, "epoch": 4.804347826086957, "grad_norm": 2.2277803267926277, "learning_rate": 1.0669284861502517e-06, "loss": 0.0285, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 323747839.0, "step": 442 }, { "entropy": 0.39654541015625, "epoch": 4.815217391304348, "grad_norm": 2.2167121974335333, "learning_rate": 1.0484901264325026e-06, "loss": 0.0253, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 324512017.0, "step": 443 }, { "entropy": 0.4126739501953125, "epoch": 4.826086956521739, "grad_norm": 1.131288128180686, "learning_rate": 1.0301938028811303e-06, "loss": 0.0149, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 325231917.0, "step": 444 }, { "entropy": 0.4065704345703125, "epoch": 4.836956521739131, "grad_norm": 3.562991857009424, "learning_rate": 1.0120401731535213e-06, "loss": 0.0437, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 325965200.0, "step": 445 }, { "entropy": 0.3999481201171875, "epoch": 4.8478260869565215, "grad_norm": 3.485697515835889, "learning_rate": 9.940298897779615e-07, "loss": 0.0414, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 326710482.0, "step": 446 }, { "entropy": 0.412750244140625, "epoch": 4.858695652173913, "grad_norm": 1.4653396598394415, "learning_rate": 9.761636001301872e-07, "loss": 0.0185, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 327427331.0, "step": 447 }, { "entropy": 0.4132080078125, "epoch": 4.869565217391305, "grad_norm": 3.621100589978088, "learning_rate": 9.58441946410108e-07, "loss": 0.037, "mean_token_accuracy": 0.989583333954215, "num_tokens": 328147723.0, "step": 448 }, { "entropy": 0.4010467529296875, "epoch": 4.880434782608695, "grad_norm": 3.171039122195853, "learning_rate": 9.408655656187282e-07, "loss": 0.0256, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 328924086.0, "step": 449 }, { "entropy": 0.4055328369140625, "epoch": 4.891304347826087, "grad_norm": 2.523795123674481, "learning_rate": 9.234350895352479e-07, "loss": 0.0243, "mean_token_accuracy": 0.989583333954215, "num_tokens": 329658771.0, "step": 450 }, { "entropy": 0.4088897705078125, "epoch": 4.9021739130434785, "grad_norm": 3.42519876063796, "learning_rate": 9.061511446943533e-07, "loss": 0.023, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 330385752.0, "step": 451 }, { "entropy": 0.4098968505859375, "epoch": 4.913043478260869, "grad_norm": 2.01468212891926, "learning_rate": 8.890143523636968e-07, "loss": 0.0199, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 331104115.0, "step": 452 }, { "entropy": 0.4082794189453125, "epoch": 4.923913043478261, "grad_norm": 3.1125668067551993, "learning_rate": 8.720253285215685e-07, "loss": 0.0414, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 331824536.0, "step": 453 }, { "entropy": 0.396453857421875, "epoch": 4.934782608695652, "grad_norm": 2.7597760592043388, "learning_rate": 8.551846838347489e-07, "loss": 0.019, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 332563726.0, "step": 454 }, { "entropy": 0.420623779296875, "epoch": 4.945652173913043, "grad_norm": 3.494544234545317, "learning_rate": 8.384930236365629e-07, "loss": 0.0441, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 333288671.0, "step": 455 }, { "entropy": 0.391998291015625, "epoch": 4.956521739130435, "grad_norm": 2.2247266199545424, "learning_rate": 8.219509479051202e-07, "loss": 0.0278, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 334073387.0, "step": 456 }, { "entropy": 0.4291839599609375, "epoch": 4.967391304347826, "grad_norm": 2.3531506297864713, "learning_rate": 8.055590512417499e-07, "loss": 0.0297, "mean_token_accuracy": 0.989583333954215, "num_tokens": 334816728.0, "step": 457 }, { "entropy": 0.404052734375, "epoch": 4.978260869565218, "grad_norm": 1.5945315425002895, "learning_rate": 7.893179228496261e-07, "loss": 0.0242, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 335532419.0, "step": 458 }, { "entropy": 0.390106201171875, "epoch": 4.989130434782608, "grad_norm": 1.797015867033134, "learning_rate": 7.732281465125907e-07, "loss": 0.0203, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 336287282.0, "step": 459 }, { "entropy": 0.3868865966796875, "epoch": 5.0, "grad_norm": 4.054309515163909, "learning_rate": 7.572903005741689e-07, "loss": 0.038, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 337040768.0, "step": 460 }, { "entropy": 0.4239654541015625, "epoch": 5.010869565217392, "grad_norm": 2.037867510102136, "learning_rate": 7.415049579167783e-07, "loss": 0.0085, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 337766168.0, "step": 461 }, { "entropy": 0.3897705078125, "epoch": 5.021739130434782, "grad_norm": 4.130199703611077, "learning_rate": 7.258726859411435e-07, "loss": 0.0486, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 338509308.0, "step": 462 }, { "entropy": 0.4114532470703125, "epoch": 5.032608695652174, "grad_norm": 1.4084584784753773, "learning_rate": 7.103940465458936e-07, "loss": 0.0164, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 339241775.0, "step": 463 }, { "entropy": 0.3938446044921875, "epoch": 5.043478260869565, "grad_norm": 4.223867668677694, "learning_rate": 6.950695961073684e-07, "loss": 0.0266, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 339977899.0, "step": 464 }, { "entropy": 0.405853271484375, "epoch": 5.054347826086956, "grad_norm": 3.1513324473644087, "learning_rate": 6.79899885459619e-07, "loss": 0.0278, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 340699822.0, "step": 465 }, { "entropy": 0.402862548828125, "epoch": 5.065217391304348, "grad_norm": 1.9272349862009421, "learning_rate": 6.64885459874608e-07, "loss": 0.0237, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 341438940.0, "step": 466 }, { "entropy": 0.3831787109375, "epoch": 5.076086956521739, "grad_norm": 1.5225484641534506, "learning_rate": 6.500268590426107e-07, "loss": 0.0159, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 342184972.0, "step": 467 }, { "entropy": 0.3907012939453125, "epoch": 5.086956521739131, "grad_norm": 3.9148783887118763, "learning_rate": 6.353246170528149e-07, "loss": 0.0225, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 342940915.0, "step": 468 }, { "entropy": 0.40972900390625, "epoch": 5.0978260869565215, "grad_norm": 1.4281558107061891, "learning_rate": 6.207792623741249e-07, "loss": 0.0142, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 343675628.0, "step": 469 }, { "entropy": 0.40313720703125, "epoch": 5.108695652173913, "grad_norm": 1.206776637290113, "learning_rate": 6.063913178361614e-07, "loss": 0.0076, "mean_token_accuracy": 1.0, "num_tokens": 344397191.0, "step": 470 }, { "entropy": 0.385772705078125, "epoch": 5.119565217391305, "grad_norm": 2.2509076542953297, "learning_rate": 5.921613006104765e-07, "loss": 0.0214, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 345139527.0, "step": 471 }, { "entropy": 0.4058380126953125, "epoch": 5.130434782608695, "grad_norm": 2.681777682259471, "learning_rate": 5.780897221919551e-07, "loss": 0.0232, "mean_token_accuracy": 0.989583333954215, "num_tokens": 345859581.0, "step": 472 }, { "entropy": 0.3974609375, "epoch": 5.141304347826087, "grad_norm": 2.562368235939727, "learning_rate": 5.641770883804365e-07, "loss": 0.0277, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 346594728.0, "step": 473 }, { "entropy": 0.3978271484375, "epoch": 5.1521739130434785, "grad_norm": 1.260409714507234, "learning_rate": 5.504238992625277e-07, "loss": 0.0107, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 347345412.0, "step": 474 }, { "entropy": 0.394134521484375, "epoch": 5.163043478260869, "grad_norm": 2.8754774462772286, "learning_rate": 5.368306491936326e-07, "loss": 0.0158, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 348061065.0, "step": 475 }, { "entropy": 0.40869140625, "epoch": 5.173913043478261, "grad_norm": 1.023086461493808, "learning_rate": 5.233978267801798e-07, "loss": 0.0073, "mean_token_accuracy": 1.0, "num_tokens": 348783776.0, "step": 476 }, { "entropy": 0.401458740234375, "epoch": 5.184782608695652, "grad_norm": 1.4929118650933983, "learning_rate": 5.101259148620618e-07, "loss": 0.0099, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 349514350.0, "step": 477 }, { "entropy": 0.3917999267578125, "epoch": 5.195652173913044, "grad_norm": 1.294188438794479, "learning_rate": 4.970153904952768e-07, "loss": 0.0107, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 350248076.0, "step": 478 }, { "entropy": 0.3802642822265625, "epoch": 5.206521739130435, "grad_norm": 1.5868086446888314, "learning_rate": 4.840667249347824e-07, "loss": 0.0294, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 350991865.0, "step": 479 }, { "entropy": 0.38580322265625, "epoch": 5.217391304347826, "grad_norm": 1.7262483127382957, "learning_rate": 4.7128038361755836e-07, "loss": 0.0239, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 351745006.0, "step": 480 }, { "entropy": 0.4080352783203125, "epoch": 5.228260869565218, "grad_norm": 5.0340551829352, "learning_rate": 4.586568261458729e-07, "loss": 0.0455, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 352474084.0, "step": 481 }, { "entropy": 0.4060821533203125, "epoch": 5.239130434782608, "grad_norm": 2.8279201983140814, "learning_rate": 4.461965062707646e-07, "loss": 0.0234, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 353211715.0, "step": 482 }, { "entropy": 0.392120361328125, "epoch": 5.25, "grad_norm": 2.1056087405556725, "learning_rate": 4.338998718757315e-07, "loss": 0.0087, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 353950858.0, "step": 483 }, { "entropy": 0.4037322998046875, "epoch": 5.260869565217392, "grad_norm": 1.9071121793728274, "learning_rate": 4.2176736496063406e-07, "loss": 0.0047, "mean_token_accuracy": 1.0, "num_tokens": 354656306.0, "step": 484 }, { "entropy": 0.4117431640625, "epoch": 5.271739130434782, "grad_norm": 2.1832059782760695, "learning_rate": 4.0979942162580387e-07, "loss": 0.011, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 355373127.0, "step": 485 }, { "entropy": 0.387969970703125, "epoch": 5.282608695652174, "grad_norm": 0.9338603119206235, "learning_rate": 3.979964720563728e-07, "loss": 0.015, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 356134595.0, "step": 486 }, { "entropy": 0.3863067626953125, "epoch": 5.293478260869565, "grad_norm": 0.8035643508790813, "learning_rate": 3.863589405068047e-07, "loss": 0.0033, "mean_token_accuracy": 1.0, "num_tokens": 356874140.0, "step": 487 }, { "entropy": 0.3829498291015625, "epoch": 5.304347826086957, "grad_norm": 2.330137070929337, "learning_rate": 3.748872452856506e-07, "loss": 0.0091, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 357620724.0, "step": 488 }, { "entropy": 0.3927459716796875, "epoch": 5.315217391304348, "grad_norm": 1.2276302828645993, "learning_rate": 3.63581798740511e-07, "loss": 0.0049, "mean_token_accuracy": 1.0, "num_tokens": 358355003.0, "step": 489 }, { "entropy": 0.3846282958984375, "epoch": 5.326086956521739, "grad_norm": 3.1122288247660985, "learning_rate": 3.524430072432117e-07, "loss": 0.0311, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 359088947.0, "step": 490 }, { "entropy": 0.3803253173828125, "epoch": 5.336956521739131, "grad_norm": 1.031743535694978, "learning_rate": 3.414712711752011e-07, "loss": 0.0033, "mean_token_accuracy": 1.0, "num_tokens": 359839271.0, "step": 491 }, { "entropy": 0.392791748046875, "epoch": 5.3478260869565215, "grad_norm": 2.4003564062037217, "learning_rate": 3.306669849131544e-07, "loss": 0.0222, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 360555118.0, "step": 492 }, { "entropy": 0.4043121337890625, "epoch": 5.358695652173913, "grad_norm": 1.5916000916175594, "learning_rate": 3.20030536814801e-07, "loss": 0.0138, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 361282594.0, "step": 493 }, { "entropy": 0.3998870849609375, "epoch": 5.369565217391305, "grad_norm": 5.135801576812412, "learning_rate": 3.095623092049632e-07, "loss": 0.0457, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 362002556.0, "step": 494 }, { "entropy": 0.3865966796875, "epoch": 5.380434782608695, "grad_norm": 3.090721052570619, "learning_rate": 2.992626783618152e-07, "loss": 0.031, "mean_token_accuracy": 0.989583333954215, "num_tokens": 362743968.0, "step": 495 }, { "entropy": 0.38818359375, "epoch": 5.391304347826087, "grad_norm": 2.1833786614192396, "learning_rate": 2.891320145033566e-07, "loss": 0.0093, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 363477550.0, "step": 496 }, { "entropy": 0.3900604248046875, "epoch": 5.4021739130434785, "grad_norm": 0.5228962640445148, "learning_rate": 2.791706817741041e-07, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 364226587.0, "step": 497 }, { "entropy": 0.3830413818359375, "epoch": 5.413043478260869, "grad_norm": 2.177304130904656, "learning_rate": 2.693790382320055e-07, "loss": 0.0175, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 364959239.0, "step": 498 }, { "entropy": 0.3940887451171875, "epoch": 5.423913043478261, "grad_norm": 2.7692226509966775, "learning_rate": 2.59757435835567e-07, "loss": 0.0152, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 365696800.0, "step": 499 }, { "entropy": 0.3874969482421875, "epoch": 5.434782608695652, "grad_norm": 0.6622227905804676, "learning_rate": 2.5030622043120237e-07, "loss": 0.0033, "mean_token_accuracy": 1.0, "num_tokens": 366426471.0, "step": 500 }, { "entropy": 0.3917999267578125, "epoch": 5.445652173913043, "grad_norm": 3.4285563843356326, "learning_rate": 2.41025731740801e-07, "loss": 0.0333, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 367174609.0, "step": 501 }, { "entropy": 0.3955535888671875, "epoch": 5.456521739130435, "grad_norm": 3.7100370301508296, "learning_rate": 2.319163033495192e-07, "loss": 0.0424, "mean_token_accuracy": 0.989583333954215, "num_tokens": 367898714.0, "step": 502 }, { "entropy": 0.380218505859375, "epoch": 5.467391304347826, "grad_norm": 1.7830513614583579, "learning_rate": 2.2297826269378653e-07, "loss": 0.013, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 368639031.0, "step": 503 }, { "entropy": 0.3790130615234375, "epoch": 5.478260869565218, "grad_norm": 2.2310214846672896, "learning_rate": 2.142119310495383e-07, "loss": 0.01, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 369397263.0, "step": 504 }, { "entropy": 0.3843231201171875, "epoch": 5.489130434782608, "grad_norm": 3.113797314828611, "learning_rate": 2.0561762352066638e-07, "loss": 0.0121, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 370138352.0, "step": 505 }, { "entropy": 0.3926849365234375, "epoch": 5.5, "grad_norm": 7.027103058482383, "learning_rate": 1.9719564902769272e-07, "loss": 0.0268, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 370867304.0, "step": 506 }, { "entropy": 0.385528564453125, "epoch": 5.510869565217392, "grad_norm": 3.158975096906896, "learning_rate": 1.889463102966671e-07, "loss": 0.0136, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 371612613.0, "step": 507 }, { "entropy": 0.40679931640625, "epoch": 5.521739130434782, "grad_norm": 3.7256140843342846, "learning_rate": 1.8086990384828195e-07, "loss": 0.0322, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 372320930.0, "step": 508 }, { "entropy": 0.3902587890625, "epoch": 5.532608695652174, "grad_norm": 3.640770823761304, "learning_rate": 1.729667199872187e-07, "loss": 0.0172, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 373025821.0, "step": 509 }, { "entropy": 0.386322021484375, "epoch": 5.543478260869565, "grad_norm": 4.3600778017735875, "learning_rate": 1.6523704279170773e-07, "loss": 0.0302, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 373758445.0, "step": 510 }, { "entropy": 0.3842010498046875, "epoch": 5.554347826086957, "grad_norm": 2.8904903079482365, "learning_rate": 1.5768115010332207e-07, "loss": 0.0394, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 374507004.0, "step": 511 }, { "entropy": 0.4094390869140625, "epoch": 5.565217391304348, "grad_norm": 1.5280072777152858, "learning_rate": 1.5029931351698723e-07, "loss": 0.0062, "mean_token_accuracy": 1.0, "num_tokens": 375216818.0, "step": 512 }, { "entropy": 0.3795166015625, "epoch": 5.576086956521739, "grad_norm": 2.9083912204527858, "learning_rate": 1.4309179837122045e-07, "loss": 0.0095, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 375963533.0, "step": 513 }, { "entropy": 0.388153076171875, "epoch": 5.586956521739131, "grad_norm": 2.494676483675169, "learning_rate": 1.3605886373859234e-07, "loss": 0.0121, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 376695849.0, "step": 514 }, { "entropy": 0.4017486572265625, "epoch": 5.5978260869565215, "grad_norm": 5.605449787153541, "learning_rate": 1.2920076241641376e-07, "loss": 0.0333, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 377408950.0, "step": 515 }, { "entropy": 0.3864898681640625, "epoch": 5.608695652173913, "grad_norm": 2.8048999910075283, "learning_rate": 1.22517740917652e-07, "loss": 0.0137, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 378154501.0, "step": 516 }, { "entropy": 0.397857666015625, "epoch": 5.619565217391305, "grad_norm": 1.585255503223806, "learning_rate": 1.1601003946206723e-07, "loss": 0.0124, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 378878351.0, "step": 517 }, { "entropy": 0.390869140625, "epoch": 5.630434782608695, "grad_norm": 1.6591111399858995, "learning_rate": 1.0967789196757839e-07, "loss": 0.0057, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 379589042.0, "step": 518 }, { "entropy": 0.3965606689453125, "epoch": 5.641304347826087, "grad_norm": 3.8513221893355163, "learning_rate": 1.0352152604185429e-07, "loss": 0.013, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 380316238.0, "step": 519 }, { "entropy": 0.3861236572265625, "epoch": 5.6521739130434785, "grad_norm": 1.2903428759830748, "learning_rate": 9.754116297413574e-08, "loss": 0.0164, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 381060518.0, "step": 520 }, { "entropy": 0.3924102783203125, "epoch": 5.663043478260869, "grad_norm": 2.3661355025868915, "learning_rate": 9.17370177272775e-08, "loss": 0.0135, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 381794517.0, "step": 521 }, { "entropy": 0.41534423828125, "epoch": 5.673913043478261, "grad_norm": 1.7159919960698389, "learning_rate": 8.610929893002274e-08, "loss": 0.0116, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 382509591.0, "step": 522 }, { "entropy": 0.388824462890625, "epoch": 5.684782608695652, "grad_norm": 3.1200032663581574, "learning_rate": 8.065820886950404e-08, "loss": 0.0316, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 383254229.0, "step": 523 }, { "entropy": 0.3953857421875, "epoch": 5.695652173913043, "grad_norm": 4.159894595972317, "learning_rate": 7.538394348397316e-08, "loss": 0.0659, "mean_token_accuracy": 0.989583333954215, "num_tokens": 383988955.0, "step": 524 }, { "entropy": 0.386077880859375, "epoch": 5.706521739130435, "grad_norm": 1.368823358453224, "learning_rate": 7.028669235575714e-08, "loss": 0.0053, "mean_token_accuracy": 1.0, "num_tokens": 384708327.0, "step": 525 }, { "entropy": 0.41278076171875, "epoch": 5.717391304347826, "grad_norm": 2.818039902986355, "learning_rate": 6.536663870444382e-08, "loss": 0.0107, "mean_token_accuracy": 1.0, "num_tokens": 385418118.0, "step": 526 }, { "entropy": 0.3767852783203125, "epoch": 5.728260869565218, "grad_norm": 3.546014549466884, "learning_rate": 6.062395938029485e-08, "loss": 0.0315, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 386175413.0, "step": 527 }, { "entropy": 0.401458740234375, "epoch": 5.739130434782608, "grad_norm": 2.389814493856653, "learning_rate": 5.605882485789138e-08, "loss": 0.0318, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 386897197.0, "step": 528 }, { "entropy": 0.3771820068359375, "epoch": 5.75, "grad_norm": 1.0744355499348452, "learning_rate": 5.167139923000553e-08, "loss": 0.0056, "mean_token_accuracy": 1.0, "num_tokens": 387648661.0, "step": 529 }, { "entropy": 0.380645751953125, "epoch": 5.760869565217392, "grad_norm": 0.9597020947603775, "learning_rate": 4.746184020170019e-08, "loss": 0.0053, "mean_token_accuracy": 1.0, "num_tokens": 388401188.0, "step": 530 }, { "entropy": 0.386199951171875, "epoch": 5.771739130434782, "grad_norm": 1.0214090880188462, "learning_rate": 4.3430299084663006e-08, "loss": 0.0052, "mean_token_accuracy": 1.0, "num_tokens": 389132125.0, "step": 531 }, { "entropy": 0.416839599609375, "epoch": 5.782608695652174, "grad_norm": 1.0946483645943863, "learning_rate": 3.957692079176623e-08, "loss": 0.0081, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 389862785.0, "step": 532 }, { "entropy": 0.390167236328125, "epoch": 5.793478260869565, "grad_norm": 1.6459688782297026, "learning_rate": 3.590184383185758e-08, "loss": 0.0062, "mean_token_accuracy": 1.0, "num_tokens": 390597870.0, "step": 533 }, { "entropy": 0.3839874267578125, "epoch": 5.804347826086957, "grad_norm": 2.0698702706686567, "learning_rate": 3.240520030478256e-08, "loss": 0.0288, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 391319503.0, "step": 534 }, { "entropy": 0.3899383544921875, "epoch": 5.815217391304348, "grad_norm": 1.8622051935678263, "learning_rate": 2.9087115896635486e-08, "loss": 0.0103, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 392069003.0, "step": 535 }, { "entropy": 0.4185791015625, "epoch": 5.826086956521739, "grad_norm": 0.9838782015391406, "learning_rate": 2.5947709875240867e-08, "loss": 0.0161, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 392791197.0, "step": 536 }, { "entropy": 0.4089813232421875, "epoch": 5.836956521739131, "grad_norm": 0.5779206893200703, "learning_rate": 2.298709508586794e-08, "loss": 0.0033, "mean_token_accuracy": 1.0, "num_tokens": 393512216.0, "step": 537 }, { "entropy": 0.391387939453125, "epoch": 5.8478260869565215, "grad_norm": 1.5997986902738377, "learning_rate": 2.0205377947174475e-08, "loss": 0.0254, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 394224676.0, "step": 538 }, { "entropy": 0.40155029296875, "epoch": 5.858695652173913, "grad_norm": 0.9538716174458627, "learning_rate": 1.760265844738096e-08, "loss": 0.0052, "mean_token_accuracy": 1.0, "num_tokens": 394946689.0, "step": 539 }, { "entropy": 0.41473388671875, "epoch": 5.869565217391305, "grad_norm": 3.1692455070380507, "learning_rate": 1.5179030140675122e-08, "loss": 0.006, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 395674368.0, "step": 540 }, { "entropy": 0.3944549560546875, "epoch": 5.880434782608695, "grad_norm": 3.822951895963981, "learning_rate": 1.2934580143851294e-08, "loss": 0.0268, "mean_token_accuracy": 0.989583333954215, "num_tokens": 396397890.0, "step": 541 }, { "entropy": 0.3854827880859375, "epoch": 5.891304347826087, "grad_norm": 1.7269765059882336, "learning_rate": 1.0869389133178477e-08, "loss": 0.0132, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 397122155.0, "step": 542 }, { "entropy": 0.383026123046875, "epoch": 5.9021739130434785, "grad_norm": 2.3437834826407107, "learning_rate": 8.983531341500984e-09, "loss": 0.0227, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 397856299.0, "step": 543 }, { "entropy": 0.393585205078125, "epoch": 5.913043478260869, "grad_norm": 2.8396564206572372, "learning_rate": 7.277074555567809e-09, "loss": 0.0068, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 398575680.0, "step": 544 }, { "entropy": 0.4019622802734375, "epoch": 5.923913043478261, "grad_norm": 2.1229188120999614, "learning_rate": 5.750080113598455e-09, "loss": 0.023, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 399294327.0, "step": 545 }, { "entropy": 0.3828125, "epoch": 5.934782608695652, "grad_norm": 2.7811799795374825, "learning_rate": 4.40260290307748e-09, "loss": 0.0162, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 400028937.0, "step": 546 }, { "entropy": 0.39453125, "epoch": 5.945652173913043, "grad_norm": 1.2135571583249902, "learning_rate": 3.2346913587816275e-09, "loss": 0.005, "mean_token_accuracy": 1.0, "num_tokens": 400760386.0, "step": 547 }, { "entropy": 0.383087158203125, "epoch": 5.956521739130435, "grad_norm": 2.037257238818469, "learning_rate": 2.2463874610378912e-09, "loss": 0.0149, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 401508363.0, "step": 548 }, { "entropy": 0.395111083984375, "epoch": 5.967391304347826, "grad_norm": 0.9899828962837534, "learning_rate": 1.4377267342158274e-09, "loss": 0.0094, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 402255596.0, "step": 549 }, { "entropy": 0.391082763671875, "epoch": 5.978260869565218, "grad_norm": 3.1153076835245432, "learning_rate": 8.087382454502468e-10, "loss": 0.0421, "mean_token_accuracy": 0.989583333954215, "num_tokens": 402993150.0, "step": 550 }, { "entropy": 0.3865966796875, "epoch": 5.989130434782608, "grad_norm": 1.329602352306281, "learning_rate": 3.594446035964927e-10, "loss": 0.0157, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 403711677.0, "step": 551 }, { "entropy": 0.397247314453125, "epoch": 6.0, "grad_norm": 1.779545542710601, "learning_rate": 8.986195841609313e-11, "loss": 0.0062, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 404454221.0, "step": 552 }, { "epoch": 6.0, "step": 552, "total_flos": 475865375768576.0, "train_loss": 0.4369911788587384, "train_runtime": 50253.5416, "train_samples_per_second": 2.224, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 552, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 46, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 475865375768576.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }