{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 522, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5506134033203125, "epoch": 0.011494252873563218, "grad_norm": 384.0606248233174, "learning_rate": 0.0, "loss": 8.318, "mean_token_accuracy": 0.0, "num_tokens": 849672.0, "step": 1 }, { "entropy": 0.5534515380859375, "epoch": 0.022988505747126436, "grad_norm": 387.6767837142096, "learning_rate": 1.8518518518518518e-07, "loss": 8.3232, "mean_token_accuracy": 0.0, "num_tokens": 1671243.0, "step": 2 }, { "entropy": 0.5354843139648438, "epoch": 0.034482758620689655, "grad_norm": 383.45390757110886, "learning_rate": 3.7037037037037036e-07, "loss": 8.2998, "mean_token_accuracy": 0.0, "num_tokens": 2542121.0, "step": 3 }, { "entropy": 0.552337646484375, "epoch": 0.04597701149425287, "grad_norm": 390.2053728104676, "learning_rate": 5.555555555555555e-07, "loss": 8.2361, "mean_token_accuracy": 0.0, "num_tokens": 3370166.0, "step": 4 }, { "entropy": 0.5463790893554688, "epoch": 0.05747126436781609, "grad_norm": 396.6880182595508, "learning_rate": 7.407407407407407e-07, "loss": 8.0897, "mean_token_accuracy": 0.0, "num_tokens": 4194809.0, "step": 5 }, { "entropy": 0.5587310791015625, "epoch": 0.06896551724137931, "grad_norm": 394.428520935306, "learning_rate": 9.259259259259259e-07, "loss": 8.0395, "mean_token_accuracy": 0.0, "num_tokens": 5001673.0, "step": 6 }, { "entropy": 0.5570907592773438, "epoch": 0.08045977011494253, "grad_norm": 400.1266091194488, "learning_rate": 1.111111111111111e-06, "loss": 7.3825, "mean_token_accuracy": 0.0, "num_tokens": 5831972.0, "step": 7 }, { "entropy": 0.5578536987304688, "epoch": 0.09195402298850575, "grad_norm": 268.1695863511667, "learning_rate": 1.2962962962962962e-06, "loss": 5.8784, "mean_token_accuracy": 0.0013020833721384406, "num_tokens": 6655190.0, "step": 8 }, { "entropy": 0.5546417236328125, "epoch": 0.10344827586206896, "grad_norm": 228.00484704024294, "learning_rate": 1.4814814814814815e-06, "loss": 5.6001, "mean_token_accuracy": 0.0026041667442768812, "num_tokens": 7497377.0, "step": 9 }, { "entropy": 0.5494537353515625, "epoch": 0.11494252873563218, "grad_norm": 189.63353230439253, "learning_rate": 1.6666666666666667e-06, "loss": 5.2696, "mean_token_accuracy": 0.01822916720993817, "num_tokens": 8345673.0, "step": 10 }, { "entropy": 0.5466766357421875, "epoch": 0.12643678160919541, "grad_norm": 102.52439706164563, "learning_rate": 1.8518518518518519e-06, "loss": 4.1157, "mean_token_accuracy": 0.5065104317618534, "num_tokens": 9206475.0, "step": 11 }, { "entropy": 0.5690078735351562, "epoch": 0.13793103448275862, "grad_norm": 97.09612459227306, "learning_rate": 2.037037037037037e-06, "loss": 4.027, "mean_token_accuracy": 0.5195312654832378, "num_tokens": 10013439.0, "step": 12 }, { "entropy": 0.5670013427734375, "epoch": 0.14942528735632185, "grad_norm": 81.99594392583451, "learning_rate": 2.222222222222222e-06, "loss": 3.8264, "mean_token_accuracy": 0.5065104317618534, "num_tokens": 10837729.0, "step": 13 }, { "entropy": 0.5586013793945312, "epoch": 0.16091954022988506, "grad_norm": 74.57440962105724, "learning_rate": 2.4074074074074075e-06, "loss": 3.711, "mean_token_accuracy": 0.5039062650175765, "num_tokens": 11666285.0, "step": 14 }, { "entropy": 0.5576553344726562, "epoch": 0.1724137931034483, "grad_norm": 59.69190752157155, "learning_rate": 2.5925925925925925e-06, "loss": 3.2731, "mean_token_accuracy": 0.505208348389715, "num_tokens": 12488399.0, "step": 15 }, { "entropy": 0.5660324096679688, "epoch": 0.1839080459770115, "grad_norm": 58.640319989957504, "learning_rate": 2.7777777777777783e-06, "loss": 3.2084, "mean_token_accuracy": 0.5039062650175765, "num_tokens": 13285405.0, "step": 16 }, { "entropy": 0.5326614379882812, "epoch": 0.19540229885057472, "grad_norm": 58.02496291902121, "learning_rate": 2.962962962962963e-06, "loss": 3.1538, "mean_token_accuracy": 0.5312500158324838, "num_tokens": 14169920.0, "step": 17 }, { "entropy": 0.5579299926757812, "epoch": 0.20689655172413793, "grad_norm": 57.3541740444759, "learning_rate": 3.1481481481481483e-06, "loss": 3.0936, "mean_token_accuracy": 0.5325520992046222, "num_tokens": 14983623.0, "step": 18 }, { "entropy": 0.552978515625, "epoch": 0.21839080459770116, "grad_norm": 57.41173653229489, "learning_rate": 3.3333333333333333e-06, "loss": 3.0535, "mean_token_accuracy": 0.5338541825767606, "num_tokens": 15831705.0, "step": 19 }, { "entropy": 0.5468978881835938, "epoch": 0.22988505747126436, "grad_norm": 57.85464581418466, "learning_rate": 3.5185185185185187e-06, "loss": 2.9633, "mean_token_accuracy": 0.5455729329260066, "num_tokens": 16671559.0, "step": 20 }, { "entropy": 0.543548583984375, "epoch": 0.2413793103448276, "grad_norm": 57.14746959353303, "learning_rate": 3.7037037037037037e-06, "loss": 2.917, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 17493335.0, "step": 21 }, { "entropy": 0.5303497314453125, "epoch": 0.25287356321839083, "grad_norm": 61.39811744800574, "learning_rate": 3.88888888888889e-06, "loss": 2.9148, "mean_token_accuracy": 0.5299479324603453, "num_tokens": 18340307.0, "step": 22 }, { "entropy": 0.5362396240234375, "epoch": 0.26436781609195403, "grad_norm": 60.342413701495495, "learning_rate": 4.074074074074074e-06, "loss": 2.9023, "mean_token_accuracy": 0.5455729329260066, "num_tokens": 19185926.0, "step": 23 }, { "entropy": 0.5345382690429688, "epoch": 0.27586206896551724, "grad_norm": 58.48140818561053, "learning_rate": 4.2592592592592596e-06, "loss": 2.8588, "mean_token_accuracy": 0.558593766647391, "num_tokens": 20039684.0, "step": 24 }, { "entropy": 0.5341339111328125, "epoch": 0.28735632183908044, "grad_norm": 57.45969923517991, "learning_rate": 4.444444444444444e-06, "loss": 2.8357, "mean_token_accuracy": 0.5494791830424219, "num_tokens": 20867602.0, "step": 25 }, { "entropy": 0.543975830078125, "epoch": 0.2988505747126437, "grad_norm": 58.24286332124725, "learning_rate": 4.62962962962963e-06, "loss": 2.8104, "mean_token_accuracy": 0.5325520992046222, "num_tokens": 21691039.0, "step": 26 }, { "entropy": 0.5395126342773438, "epoch": 0.3103448275862069, "grad_norm": 57.74485187701231, "learning_rate": 4.814814814814815e-06, "loss": 2.7818, "mean_token_accuracy": 0.5468750162981451, "num_tokens": 22501019.0, "step": 27 }, { "entropy": 0.529571533203125, "epoch": 0.3218390804597701, "grad_norm": 57.27325469074225, "learning_rate": 5e-06, "loss": 2.7382, "mean_token_accuracy": 0.5598958500195295, "num_tokens": 23335788.0, "step": 28 }, { "entropy": 0.5239486694335938, "epoch": 0.3333333333333333, "grad_norm": 57.68762700547809, "learning_rate": 4.999949650182267e-06, "loss": 2.7075, "mean_token_accuracy": 0.5611979333916679, "num_tokens": 24179389.0, "step": 29 }, { "entropy": 0.526702880859375, "epoch": 0.3448275862068966, "grad_norm": 57.52243272097885, "learning_rate": 4.999798602757149e-06, "loss": 2.672, "mean_token_accuracy": 0.5716146003687754, "num_tokens": 25001444.0, "step": 30 }, { "entropy": 0.5177001953125, "epoch": 0.3563218390804598, "grad_norm": 58.79536537541134, "learning_rate": 4.999546863808815e-06, "loss": 2.6479, "mean_token_accuracy": 0.5533854331588373, "num_tokens": 25869769.0, "step": 31 }, { "entropy": 0.515228271484375, "epoch": 0.367816091954023, "grad_norm": 57.98864616043999, "learning_rate": 4.999194443477273e-06, "loss": 2.627, "mean_token_accuracy": 0.5546875165309757, "num_tokens": 26716768.0, "step": 32 }, { "entropy": 0.5211105346679688, "epoch": 0.3793103448275862, "grad_norm": 58.71523852821683, "learning_rate": 4.998741355957963e-06, "loss": 2.5927, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 27545869.0, "step": 33 }, { "entropy": 0.5121917724609375, "epoch": 0.39080459770114945, "grad_norm": 58.704187181529285, "learning_rate": 4.998187619501185e-06, "loss": 2.5905, "mean_token_accuracy": 0.5455729329260066, "num_tokens": 28399163.0, "step": 34 }, { "entropy": 0.5317153930664062, "epoch": 0.40229885057471265, "grad_norm": 58.90319706198557, "learning_rate": 4.99753325641136e-06, "loss": 2.5505, "mean_token_accuracy": 0.5742187671130523, "num_tokens": 29189974.0, "step": 35 }, { "entropy": 0.5278091430664062, "epoch": 0.41379310344827586, "grad_norm": 59.06761043984239, "learning_rate": 4.9967782930461405e-06, "loss": 2.5229, "mean_token_accuracy": 0.5690104336244985, "num_tokens": 30025939.0, "step": 36 }, { "entropy": 0.5297012329101562, "epoch": 0.42528735632183906, "grad_norm": 59.08737592365927, "learning_rate": 4.9959227598153395e-06, "loss": 2.4795, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 30858152.0, "step": 37 }, { "entropy": 0.5217742919921875, "epoch": 0.4367816091954023, "grad_norm": 59.20003475363042, "learning_rate": 4.994966691179712e-06, "loss": 2.4737, "mean_token_accuracy": 0.570312516996637, "num_tokens": 31702950.0, "step": 38 }, { "entropy": 0.53533935546875, "epoch": 0.4482758620689655, "grad_norm": 59.67146679766626, "learning_rate": 4.993910125649561e-06, "loss": 2.4354, "mean_token_accuracy": 0.5846354340901598, "num_tokens": 32533459.0, "step": 39 }, { "entropy": 0.5344924926757812, "epoch": 0.45977011494252873, "grad_norm": 59.396870663210215, "learning_rate": 4.992753105783194e-06, "loss": 2.4049, "mean_token_accuracy": 0.5872396008344367, "num_tokens": 33344480.0, "step": 40 }, { "entropy": 0.5267333984375, "epoch": 0.47126436781609193, "grad_norm": 60.097920183732995, "learning_rate": 4.991495678185202e-06, "loss": 2.3931, "mean_token_accuracy": 0.5729166837409139, "num_tokens": 34168826.0, "step": 41 }, { "entropy": 0.5363311767578125, "epoch": 0.4827586206896552, "grad_norm": 60.149361087445314, "learning_rate": 4.990137893504585e-06, "loss": 2.3507, "mean_token_accuracy": 0.5950521010672674, "num_tokens": 34985268.0, "step": 42 }, { "entropy": 0.5448989868164062, "epoch": 0.4942528735632184, "grad_norm": 60.10877790561801, "learning_rate": 4.988679806432712e-06, "loss": 2.3546, "mean_token_accuracy": 0.5768229338573292, "num_tokens": 35795324.0, "step": 43 }, { "entropy": 0.5362472534179688, "epoch": 0.5057471264367817, "grad_norm": 62.17298377637022, "learning_rate": 4.987121475701118e-06, "loss": 2.336, "mean_token_accuracy": 0.5807291838573292, "num_tokens": 36626952.0, "step": 44 }, { "entropy": 0.5361404418945312, "epoch": 0.5172413793103449, "grad_norm": 60.1281035244482, "learning_rate": 4.985462964079137e-06, "loss": 2.3124, "mean_token_accuracy": 0.6627604304812849, "num_tokens": 37460441.0, "step": 45 }, { "entropy": 0.5405349731445312, "epoch": 0.5287356321839081, "grad_norm": 61.52627035325762, "learning_rate": 4.983704338371375e-06, "loss": 2.304, "mean_token_accuracy": 0.8281250086147338, "num_tokens": 38327419.0, "step": 46 }, { "entropy": 0.5349044799804688, "epoch": 0.5402298850574713, "grad_norm": 60.7002499169142, "learning_rate": 4.981845669415022e-06, "loss": 2.2617, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 39178646.0, "step": 47 }, { "entropy": 0.5537567138671875, "epoch": 0.5517241379310345, "grad_norm": 60.72677769538659, "learning_rate": 4.9798870320769884e-06, "loss": 2.2418, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 39996115.0, "step": 48 }, { "entropy": 0.543853759765625, "epoch": 0.5632183908045977, "grad_norm": 60.797281222128134, "learning_rate": 4.977828505250903e-06, "loss": 2.221, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 40816594.0, "step": 49 }, { "entropy": 0.549407958984375, "epoch": 0.5747126436781609, "grad_norm": 60.30979043148616, "learning_rate": 4.975670171853926e-06, "loss": 2.1833, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 41637390.0, "step": 50 }, { "entropy": 0.5470504760742188, "epoch": 0.5862068965517241, "grad_norm": 60.72587739612534, "learning_rate": 4.9734121188234115e-06, "loss": 2.1529, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 42461263.0, "step": 51 }, { "entropy": 0.5331497192382812, "epoch": 0.5977011494252874, "grad_norm": 60.23446589036639, "learning_rate": 4.971054437113406e-06, "loss": 2.1406, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 43324395.0, "step": 52 }, { "entropy": 0.5502243041992188, "epoch": 0.6091954022988506, "grad_norm": 59.73550494985957, "learning_rate": 4.968597221690986e-06, "loss": 2.0925, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 44144015.0, "step": 53 }, { "entropy": 0.55364990234375, "epoch": 0.6206896551724138, "grad_norm": 59.90243880461605, "learning_rate": 4.96604057153243e-06, "loss": 2.0862, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 44968457.0, "step": 54 }, { "entropy": 0.5594482421875, "epoch": 0.632183908045977, "grad_norm": 59.85087788955637, "learning_rate": 4.963384589619233e-06, "loss": 2.0512, "mean_token_accuracy": 0.923177087912336, "num_tokens": 45771639.0, "step": 55 }, { "entropy": 0.5338211059570312, "epoch": 0.6436781609195402, "grad_norm": 59.685128376437696, "learning_rate": 4.960629382933959e-06, "loss": 2.0163, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 46631555.0, "step": 56 }, { "entropy": 0.55718994140625, "epoch": 0.6551724137931034, "grad_norm": 59.46218131066281, "learning_rate": 4.957775062455933e-06, "loss": 1.9855, "mean_token_accuracy": 0.923177087912336, "num_tokens": 47425135.0, "step": 57 }, { "entropy": 0.53204345703125, "epoch": 0.6666666666666666, "grad_norm": 59.66591139674002, "learning_rate": 4.9548217431567665e-06, "loss": 1.9881, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 48282128.0, "step": 58 }, { "entropy": 0.5321273803710938, "epoch": 0.6781609195402298, "grad_norm": 59.413094982042885, "learning_rate": 4.951769543995731e-06, "loss": 1.951, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 49150035.0, "step": 59 }, { "entropy": 0.552520751953125, "epoch": 0.6896551724137931, "grad_norm": 58.95179931513381, "learning_rate": 4.948618587914963e-06, "loss": 1.9159, "mean_token_accuracy": 0.923177087912336, "num_tokens": 49967970.0, "step": 60 }, { "entropy": 0.5478515625, "epoch": 0.7011494252873564, "grad_norm": 58.78293117045981, "learning_rate": 4.9453690018345144e-06, "loss": 1.8829, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 50796138.0, "step": 61 }, { "entropy": 0.55072021484375, "epoch": 0.7126436781609196, "grad_norm": 58.7211077235095, "learning_rate": 4.9420209166472386e-06, "loss": 1.8573, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 51611865.0, "step": 62 }, { "entropy": 0.5400009155273438, "epoch": 0.7241379310344828, "grad_norm": 61.677454113756916, "learning_rate": 4.938574467213519e-06, "loss": 1.8451, "mean_token_accuracy": 0.9101562553551048, "num_tokens": 52450130.0, "step": 63 }, { "entropy": 0.551239013671875, "epoch": 0.735632183908046, "grad_norm": 58.55481924234654, "learning_rate": 4.935029792355834e-06, "loss": 1.827, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 53249566.0, "step": 64 }, { "entropy": 0.545135498046875, "epoch": 0.7471264367816092, "grad_norm": 58.16487922973258, "learning_rate": 4.931387034853173e-06, "loss": 1.8032, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 54077184.0, "step": 65 }, { "entropy": 0.5316085815429688, "epoch": 0.7586206896551724, "grad_norm": 58.576343100426335, "learning_rate": 4.927646341435276e-06, "loss": 1.7661, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 54933670.0, "step": 66 }, { "entropy": 0.5523452758789062, "epoch": 0.7701149425287356, "grad_norm": 58.08683764205428, "learning_rate": 4.9238078627767285e-06, "loss": 1.7396, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 55733802.0, "step": 67 }, { "entropy": 0.551300048828125, "epoch": 0.7816091954022989, "grad_norm": 58.08491660003921, "learning_rate": 4.919871753490892e-06, "loss": 1.6895, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 56526451.0, "step": 68 }, { "entropy": 0.5546722412109375, "epoch": 0.7931034482758621, "grad_norm": 58.51853320871066, "learning_rate": 4.9158381721236715e-06, "loss": 1.6761, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 57305070.0, "step": 69 }, { "entropy": 0.5334014892578125, "epoch": 0.8045977011494253, "grad_norm": 58.170621114524614, "learning_rate": 4.91170728114714e-06, "loss": 1.6546, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 58169562.0, "step": 70 }, { "entropy": 0.5419540405273438, "epoch": 0.8160919540229885, "grad_norm": 58.528006132883995, "learning_rate": 4.907479246952981e-06, "loss": 1.6312, "mean_token_accuracy": 0.923177087912336, "num_tokens": 58988176.0, "step": 71 }, { "entropy": 0.5484161376953125, "epoch": 0.8275862068965517, "grad_norm": 58.636271269409825, "learning_rate": 4.903154239845798e-06, "loss": 1.5899, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 59799834.0, "step": 72 }, { "entropy": 0.5280075073242188, "epoch": 0.8390804597701149, "grad_norm": 58.5835216129002, "learning_rate": 4.8987324340362445e-06, "loss": 1.5841, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 60676006.0, "step": 73 }, { "entropy": 0.5616531372070312, "epoch": 0.8505747126436781, "grad_norm": 60.89085729912909, "learning_rate": 4.894214007634014e-06, "loss": 1.5472, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 61467427.0, "step": 74 }, { "entropy": 0.5347137451171875, "epoch": 0.8620689655172413, "grad_norm": 59.747068058722576, "learning_rate": 4.889599142640663e-06, "loss": 1.5215, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 62295438.0, "step": 75 }, { "entropy": 0.5299224853515625, "epoch": 0.8735632183908046, "grad_norm": 58.493866078976886, "learning_rate": 4.884888024942282e-06, "loss": 1.4759, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 63166965.0, "step": 76 }, { "entropy": 0.54058837890625, "epoch": 0.8850574712643678, "grad_norm": 60.16738350731097, "learning_rate": 4.880080844302004e-06, "loss": 1.4711, "mean_token_accuracy": 0.923177087912336, "num_tokens": 64006639.0, "step": 77 }, { "entropy": 0.5329971313476562, "epoch": 0.896551724137931, "grad_norm": 59.15598057345835, "learning_rate": 4.875177794352364e-06, "loss": 1.4256, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 64880312.0, "step": 78 }, { "entropy": 0.5373687744140625, "epoch": 0.9080459770114943, "grad_norm": 59.64780911126226, "learning_rate": 4.870179072587499e-06, "loss": 1.4136, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 65737156.0, "step": 79 }, { "entropy": 0.5340957641601562, "epoch": 0.9195402298850575, "grad_norm": 59.31089224424462, "learning_rate": 4.865084880355193e-06, "loss": 1.366, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 66600246.0, "step": 80 }, { "entropy": 0.5457611083984375, "epoch": 0.9310344827586207, "grad_norm": 58.67365335737598, "learning_rate": 4.859895422848767e-06, "loss": 1.3352, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 67409974.0, "step": 81 }, { "entropy": 0.5508956909179688, "epoch": 0.9425287356321839, "grad_norm": 59.941253952471996, "learning_rate": 4.854610909098813e-06, "loss": 1.3101, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 68213017.0, "step": 82 }, { "entropy": 0.545867919921875, "epoch": 0.9540229885057471, "grad_norm": 58.22197118807192, "learning_rate": 4.849231551964771e-06, "loss": 1.2806, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 69047530.0, "step": 83 }, { "entropy": 0.5390625, "epoch": 0.9655172413793104, "grad_norm": 58.73986491110616, "learning_rate": 4.843757568126366e-06, "loss": 1.2614, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 69871145.0, "step": 84 }, { "entropy": 0.57440185546875, "epoch": 0.9770114942528736, "grad_norm": 58.0306658042546, "learning_rate": 4.838189178074867e-06, "loss": 1.2314, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 70637600.0, "step": 85 }, { "entropy": 0.5460357666015625, "epoch": 0.9885057471264368, "grad_norm": 57.84110468380361, "learning_rate": 4.832526606104213e-06, "loss": 1.1956, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 71459220.0, "step": 86 }, { "entropy": 0.5465316772460938, "epoch": 1.0, "grad_norm": 57.45695249217127, "learning_rate": 4.826770080301978e-06, "loss": 1.1805, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 72277431.0, "step": 87 }, { "entropy": 0.5504074096679688, "epoch": 1.0114942528735633, "grad_norm": 57.11647405728569, "learning_rate": 4.8209198325401815e-06, "loss": 1.1402, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 73118155.0, "step": 88 }, { "entropy": 0.5403900146484375, "epoch": 1.0229885057471264, "grad_norm": 57.58423055463277, "learning_rate": 4.814976098465951e-06, "loss": 1.1167, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 73960483.0, "step": 89 }, { "entropy": 0.5394744873046875, "epoch": 1.0344827586206897, "grad_norm": 56.91664155850013, "learning_rate": 4.808939117492028e-06, "loss": 1.0679, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 74806371.0, "step": 90 }, { "entropy": 0.5373764038085938, "epoch": 1.0459770114942528, "grad_norm": 57.550759721577286, "learning_rate": 4.802809132787125e-06, "loss": 1.061, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 75649917.0, "step": 91 }, { "entropy": 0.542877197265625, "epoch": 1.0574712643678161, "grad_norm": 56.581488938244036, "learning_rate": 4.796586391266135e-06, "loss": 1.0517, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 76503812.0, "step": 92 }, { "entropy": 0.551666259765625, "epoch": 1.0689655172413792, "grad_norm": 57.7727620529194, "learning_rate": 4.790271143580174e-06, "loss": 1.0115, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 77300193.0, "step": 93 }, { "entropy": 0.5273056030273438, "epoch": 1.0804597701149425, "grad_norm": 56.45974821885547, "learning_rate": 4.783863644106502e-06, "loss": 0.9868, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 78158080.0, "step": 94 }, { "entropy": 0.545013427734375, "epoch": 1.0919540229885056, "grad_norm": 57.95646226031294, "learning_rate": 4.777364150938263e-06, "loss": 0.967, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 78970891.0, "step": 95 }, { "entropy": 0.5445480346679688, "epoch": 1.103448275862069, "grad_norm": 56.3826184106477, "learning_rate": 4.770772925874093e-06, "loss": 0.9342, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 79797185.0, "step": 96 }, { "entropy": 0.5520553588867188, "epoch": 1.1149425287356323, "grad_norm": 57.57229556412407, "learning_rate": 4.764090234407578e-06, "loss": 0.9378, "mean_token_accuracy": 0.923177087912336, "num_tokens": 80609961.0, "step": 97 }, { "entropy": 0.5395355224609375, "epoch": 1.1264367816091954, "grad_norm": 55.28494434882409, "learning_rate": 4.757316345716554e-06, "loss": 0.8895, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 81431397.0, "step": 98 }, { "entropy": 0.5417556762695312, "epoch": 1.1379310344827587, "grad_norm": 56.62889719836715, "learning_rate": 4.75045153265227e-06, "loss": 0.8857, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 82257632.0, "step": 99 }, { "entropy": 0.5480728149414062, "epoch": 1.1494252873563218, "grad_norm": 54.66966542092686, "learning_rate": 4.743496071728396e-06, "loss": 0.8359, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 83059920.0, "step": 100 }, { "entropy": 0.5300827026367188, "epoch": 1.160919540229885, "grad_norm": 55.3701092949065, "learning_rate": 4.736450243109885e-06, "loss": 0.8529, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 83910320.0, "step": 101 }, { "entropy": 0.5412673950195312, "epoch": 1.1724137931034484, "grad_norm": 54.925211873446585, "learning_rate": 4.729314330601684e-06, "loss": 0.8188, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 84728079.0, "step": 102 }, { "entropy": 0.5335769653320312, "epoch": 1.1839080459770115, "grad_norm": 53.599510414280125, "learning_rate": 4.7220886216373095e-06, "loss": 0.7627, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 85588279.0, "step": 103 }, { "entropy": 0.5458831787109375, "epoch": 1.1954022988505748, "grad_norm": 54.85644042986777, "learning_rate": 4.714773407267264e-06, "loss": 0.7858, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 86423379.0, "step": 104 }, { "entropy": 0.5371475219726562, "epoch": 1.206896551724138, "grad_norm": 52.70508106481705, "learning_rate": 4.707368982147318e-06, "loss": 0.731, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 87283283.0, "step": 105 }, { "entropy": 0.539794921875, "epoch": 1.2183908045977012, "grad_norm": 52.85664100248831, "learning_rate": 4.699875644526633e-06, "loss": 0.7239, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 88114338.0, "step": 106 }, { "entropy": 0.5485763549804688, "epoch": 1.2298850574712643, "grad_norm": 51.94175773034053, "learning_rate": 4.692293696235758e-06, "loss": 0.6903, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 88926461.0, "step": 107 }, { "entropy": 0.531646728515625, "epoch": 1.2413793103448276, "grad_norm": 50.87427138533599, "learning_rate": 4.684623442674463e-06, "loss": 0.6617, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 89769524.0, "step": 108 }, { "entropy": 0.54351806640625, "epoch": 1.2528735632183907, "grad_norm": 51.75745904673131, "learning_rate": 4.676865192799443e-06, "loss": 0.6903, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 90602609.0, "step": 109 }, { "entropy": 0.5421981811523438, "epoch": 1.264367816091954, "grad_norm": 49.25842347613154, "learning_rate": 4.669019259111873e-06, "loss": 0.6236, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 91423615.0, "step": 110 }, { "entropy": 0.5615921020507812, "epoch": 1.2758620689655173, "grad_norm": 47.655979296452784, "learning_rate": 4.661085957644817e-06, "loss": 0.5967, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 92204901.0, "step": 111 }, { "entropy": 0.5518264770507812, "epoch": 1.2873563218390804, "grad_norm": 47.714755290736136, "learning_rate": 4.653065607950502e-06, "loss": 0.5973, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 93005537.0, "step": 112 }, { "entropy": 0.5359039306640625, "epoch": 1.2988505747126438, "grad_norm": 45.89988870201635, "learning_rate": 4.644958533087443e-06, "loss": 0.5801, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 93854099.0, "step": 113 }, { "entropy": 0.5466156005859375, "epoch": 1.3103448275862069, "grad_norm": 45.66002812793437, "learning_rate": 4.636765059607434e-06, "loss": 0.5638, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 94673565.0, "step": 114 }, { "entropy": 0.5435638427734375, "epoch": 1.3218390804597702, "grad_norm": 45.14215656166137, "learning_rate": 4.628485517542393e-06, "loss": 0.5556, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 95476803.0, "step": 115 }, { "entropy": 0.541839599609375, "epoch": 1.3333333333333333, "grad_norm": 42.636378872173985, "learning_rate": 4.620120240391065e-06, "loss": 0.5019, "mean_token_accuracy": 0.945312503259629, "num_tokens": 96327452.0, "step": 116 }, { "entropy": 0.5332412719726562, "epoch": 1.3448275862068966, "grad_norm": 43.81341541141291, "learning_rate": 4.611669565105597e-06, "loss": 0.5114, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 97185071.0, "step": 117 }, { "entropy": 0.5449066162109375, "epoch": 1.3563218390804597, "grad_norm": 41.21199218337275, "learning_rate": 4.603133832077953e-06, "loss": 0.4932, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 98001207.0, "step": 118 }, { "entropy": 0.5452804565429688, "epoch": 1.367816091954023, "grad_norm": 40.05351715592126, "learning_rate": 4.5945133851262185e-06, "loss": 0.4633, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 98827697.0, "step": 119 }, { "entropy": 0.5493011474609375, "epoch": 1.3793103448275863, "grad_norm": 38.1413690912593, "learning_rate": 4.585808571480739e-06, "loss": 0.4567, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 99658235.0, "step": 120 }, { "entropy": 0.5312347412109375, "epoch": 1.3908045977011494, "grad_norm": 35.77272010447715, "learning_rate": 4.577019741770137e-06, "loss": 0.4394, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 100521415.0, "step": 121 }, { "entropy": 0.5474624633789062, "epoch": 1.4022988505747127, "grad_norm": 35.44749167453788, "learning_rate": 4.5681472500071935e-06, "loss": 0.4051, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 101350872.0, "step": 122 }, { "entropy": 0.5480728149414062, "epoch": 1.4137931034482758, "grad_norm": 33.3855582923729, "learning_rate": 4.559191453574582e-06, "loss": 0.3849, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 102172977.0, "step": 123 }, { "entropy": 0.538360595703125, "epoch": 1.4252873563218391, "grad_norm": 33.1834131236442, "learning_rate": 4.550152713210478e-06, "loss": 0.3944, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 102998176.0, "step": 124 }, { "entropy": 0.5465621948242188, "epoch": 1.4367816091954024, "grad_norm": 31.2542834783487, "learning_rate": 4.541031392994025e-06, "loss": 0.3635, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 103812200.0, "step": 125 }, { "entropy": 0.5382003784179688, "epoch": 1.4482758620689655, "grad_norm": 32.40479508190425, "learning_rate": 4.53182786033067e-06, "loss": 0.3612, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 104634429.0, "step": 126 }, { "entropy": 0.5458450317382812, "epoch": 1.4597701149425286, "grad_norm": 30.72340713652481, "learning_rate": 4.522542485937369e-06, "loss": 0.3399, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 105461037.0, "step": 127 }, { "entropy": 0.552581787109375, "epoch": 1.471264367816092, "grad_norm": 29.132170258942477, "learning_rate": 4.513175643827647e-06, "loss": 0.313, "mean_token_accuracy": 0.955729169305414, "num_tokens": 106274400.0, "step": 128 }, { "entropy": 0.5401077270507812, "epoch": 1.4827586206896552, "grad_norm": 28.683201688609554, "learning_rate": 4.503727711296539e-06, "loss": 0.3137, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 107103293.0, "step": 129 }, { "entropy": 0.5371475219726562, "epoch": 1.4942528735632183, "grad_norm": 26.82838757384805, "learning_rate": 4.494199068905389e-06, "loss": 0.2902, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 107941406.0, "step": 130 }, { "entropy": 0.5441970825195312, "epoch": 1.5057471264367817, "grad_norm": 25.971681456288458, "learning_rate": 4.484590100466524e-06, "loss": 0.2772, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 108767533.0, "step": 131 }, { "entropy": 0.52886962890625, "epoch": 1.5172413793103448, "grad_norm": 24.8536339487485, "learning_rate": 4.474901193027791e-06, "loss": 0.2905, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 109628216.0, "step": 132 }, { "entropy": 0.524261474609375, "epoch": 1.528735632183908, "grad_norm": 23.69318911646761, "learning_rate": 4.4651327368569695e-06, "loss": 0.2648, "mean_token_accuracy": 0.955729169305414, "num_tokens": 110503540.0, "step": 133 }, { "entropy": 0.5345916748046875, "epoch": 1.5402298850574714, "grad_norm": 22.723236503876162, "learning_rate": 4.455285125426049e-06, "loss": 0.2575, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 111345385.0, "step": 134 }, { "entropy": 0.5496292114257812, "epoch": 1.5517241379310345, "grad_norm": 22.92712147978677, "learning_rate": 4.445358755395382e-06, "loss": 0.2472, "mean_token_accuracy": 0.945312503259629, "num_tokens": 112152579.0, "step": 135 }, { "entropy": 0.5406265258789062, "epoch": 1.5632183908045976, "grad_norm": 21.357350310348682, "learning_rate": 4.435354026597707e-06, "loss": 0.2491, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 112991802.0, "step": 136 }, { "entropy": 0.5425033569335938, "epoch": 1.5747126436781609, "grad_norm": 18.884927838452082, "learning_rate": 4.425271342022039e-06, "loss": 0.2242, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 113818986.0, "step": 137 }, { "entropy": 0.545745849609375, "epoch": 1.5862068965517242, "grad_norm": 19.69653684409779, "learning_rate": 4.415111107797445e-06, "loss": 0.2174, "mean_token_accuracy": 0.945312503259629, "num_tokens": 114642387.0, "step": 138 }, { "entropy": 0.5489501953125, "epoch": 1.5977011494252875, "grad_norm": 17.837394227991627, "learning_rate": 4.404873733176678e-06, "loss": 0.1848, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 115451222.0, "step": 139 }, { "entropy": 0.52032470703125, "epoch": 1.6091954022988506, "grad_norm": 17.24235084071355, "learning_rate": 4.3945596305196925e-06, "loss": 0.2063, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 116312542.0, "step": 140 }, { "entropy": 0.52130126953125, "epoch": 1.6206896551724137, "grad_norm": 19.23462423448226, "learning_rate": 4.384169215277042e-06, "loss": 0.2174, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 117191343.0, "step": 141 }, { "entropy": 0.5353240966796875, "epoch": 1.632183908045977, "grad_norm": 19.832558193797162, "learning_rate": 4.373702905973136e-06, "loss": 0.2222, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 118019469.0, "step": 142 }, { "entropy": 0.537261962890625, "epoch": 1.6436781609195403, "grad_norm": 18.073863888632665, "learning_rate": 4.363161124189387e-06, "loss": 0.213, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 118848731.0, "step": 143 }, { "entropy": 0.5355072021484375, "epoch": 1.6551724137931034, "grad_norm": 13.643534993364158, "learning_rate": 4.352544294547229e-06, "loss": 0.2255, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 119699438.0, "step": 144 }, { "entropy": 0.53302001953125, "epoch": 1.6666666666666665, "grad_norm": 13.32804577717328, "learning_rate": 4.341852844691012e-06, "loss": 0.1919, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 120543167.0, "step": 145 }, { "entropy": 0.5304183959960938, "epoch": 1.6781609195402298, "grad_norm": 12.1831913402612, "learning_rate": 4.331087205270778e-06, "loss": 0.1573, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 121413453.0, "step": 146 }, { "entropy": 0.5264511108398438, "epoch": 1.6896551724137931, "grad_norm": 14.866153648828307, "learning_rate": 4.320247809924911e-06, "loss": 0.1717, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 122277203.0, "step": 147 }, { "entropy": 0.5309906005859375, "epoch": 1.7011494252873565, "grad_norm": 36.174423890876305, "learning_rate": 4.309335095262675e-06, "loss": 0.3047, "mean_token_accuracy": 0.923177087912336, "num_tokens": 123098358.0, "step": 148 }, { "entropy": 0.54193115234375, "epoch": 1.7126436781609196, "grad_norm": 19.683188431556264, "learning_rate": 4.2983495008466285e-06, "loss": 0.1893, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 123918082.0, "step": 149 }, { "entropy": 0.5468902587890625, "epoch": 1.7241379310344827, "grad_norm": 11.029033694493663, "learning_rate": 4.287291469174909e-06, "loss": 0.1802, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 124727934.0, "step": 150 }, { "entropy": 0.5524139404296875, "epoch": 1.735632183908046, "grad_norm": 12.741964950820162, "learning_rate": 4.276161445663423e-06, "loss": 0.1935, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 125521364.0, "step": 151 }, { "entropy": 0.5598068237304688, "epoch": 1.7471264367816093, "grad_norm": 10.684223439753536, "learning_rate": 4.264959878627891e-06, "loss": 0.1764, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 126310482.0, "step": 152 }, { "entropy": 0.539764404296875, "epoch": 1.7586206896551724, "grad_norm": 7.967845652668114, "learning_rate": 4.253687219265803e-06, "loss": 0.1465, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 127127924.0, "step": 153 }, { "entropy": 0.5313034057617188, "epoch": 1.7701149425287355, "grad_norm": 8.78172779941521, "learning_rate": 4.242343921638235e-06, "loss": 0.1423, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 127963998.0, "step": 154 }, { "entropy": 0.5254440307617188, "epoch": 1.7816091954022988, "grad_norm": 8.679498363997155, "learning_rate": 4.230930442651558e-06, "loss": 0.1402, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 128815367.0, "step": 155 }, { "entropy": 0.5338897705078125, "epoch": 1.793103448275862, "grad_norm": 12.414584279230683, "learning_rate": 4.219447242039043e-06, "loss": 0.1571, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 129649673.0, "step": 156 }, { "entropy": 0.5256805419921875, "epoch": 1.8045977011494254, "grad_norm": 11.900284697954019, "learning_rate": 4.207894782342337e-06, "loss": 0.1883, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 130490556.0, "step": 157 }, { "entropy": 0.5339889526367188, "epoch": 1.8160919540229885, "grad_norm": 7.737654770007032, "learning_rate": 4.196273528892831e-06, "loss": 0.161, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 131336884.0, "step": 158 }, { "entropy": 0.5406951904296875, "epoch": 1.8275862068965516, "grad_norm": 14.436453211359808, "learning_rate": 4.18458394979292e-06, "loss": 0.1516, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 132170053.0, "step": 159 }, { "entropy": 0.5588531494140625, "epoch": 1.839080459770115, "grad_norm": 6.146992863163749, "learning_rate": 4.172826515897146e-06, "loss": 0.1441, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 132962150.0, "step": 160 }, { "entropy": 0.5524063110351562, "epoch": 1.8505747126436782, "grad_norm": 17.04399335228688, "learning_rate": 4.161001700793231e-06, "loss": 0.1792, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 133791084.0, "step": 161 }, { "entropy": 0.55145263671875, "epoch": 1.8620689655172413, "grad_norm": 7.828687166002703, "learning_rate": 4.149109980783004e-06, "loss": 0.1377, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 134601699.0, "step": 162 }, { "entropy": 0.541046142578125, "epoch": 1.8735632183908046, "grad_norm": 15.274961569812856, "learning_rate": 4.137151834863213e-06, "loss": 0.165, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 135435563.0, "step": 163 }, { "entropy": 0.5604324340820312, "epoch": 1.8850574712643677, "grad_norm": 10.201587419105804, "learning_rate": 4.125127744706232e-06, "loss": 0.1241, "mean_token_accuracy": 0.955729169305414, "num_tokens": 136227698.0, "step": 164 }, { "entropy": 0.5586700439453125, "epoch": 1.896551724137931, "grad_norm": 11.218767501992696, "learning_rate": 4.113038194640658e-06, "loss": 0.1361, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 137028072.0, "step": 165 }, { "entropy": 0.5378341674804688, "epoch": 1.9080459770114944, "grad_norm": 10.759553109195826, "learning_rate": 4.100883671631806e-06, "loss": 0.1467, "mean_token_accuracy": 0.945312503259629, "num_tokens": 137879487.0, "step": 166 }, { "entropy": 0.566436767578125, "epoch": 1.9195402298850575, "grad_norm": 12.342573902869242, "learning_rate": 4.088664665262091e-06, "loss": 0.1545, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 138645762.0, "step": 167 }, { "entropy": 0.5373306274414062, "epoch": 1.9310344827586206, "grad_norm": 14.908844134109147, "learning_rate": 4.076381667711306e-06, "loss": 0.1839, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 139495511.0, "step": 168 }, { "entropy": 0.5425796508789062, "epoch": 1.9425287356321839, "grad_norm": 4.4442828301660064, "learning_rate": 4.064035173736804e-06, "loss": 0.1113, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 140341321.0, "step": 169 }, { "entropy": 0.5580520629882812, "epoch": 1.9540229885057472, "grad_norm": 8.345686227442089, "learning_rate": 4.05162568065356e-06, "loss": 0.1313, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 141135938.0, "step": 170 }, { "entropy": 0.5560073852539062, "epoch": 1.9655172413793105, "grad_norm": 5.335070929960651, "learning_rate": 4.039153688314146e-06, "loss": 0.1195, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 141960068.0, "step": 171 }, { "entropy": 0.5646591186523438, "epoch": 1.9770114942528736, "grad_norm": 8.240841304443089, "learning_rate": 4.0266196990885955e-06, "loss": 0.1065, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 142742990.0, "step": 172 }, { "entropy": 0.5467605590820312, "epoch": 1.9885057471264367, "grad_norm": 10.082258712427155, "learning_rate": 4.014024217844167e-06, "loss": 0.1196, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 143557918.0, "step": 173 }, { "entropy": 0.5526123046875, "epoch": 2.0, "grad_norm": 7.158314474332936, "learning_rate": 4.001367751925008e-06, "loss": 0.1259, "mean_token_accuracy": 0.955729169305414, "num_tokens": 144380626.0, "step": 174 }, { "entropy": 0.5583114624023438, "epoch": 2.0114942528735633, "grad_norm": 8.124921778124246, "learning_rate": 3.98865081113172e-06, "loss": 0.1174, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 145190047.0, "step": 175 }, { "entropy": 0.5553131103515625, "epoch": 2.0229885057471266, "grad_norm": 5.03898978737638, "learning_rate": 3.9758739077008256e-06, "loss": 0.0908, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 146000722.0, "step": 176 }, { "entropy": 0.5393218994140625, "epoch": 2.0344827586206895, "grad_norm": 5.644079988912752, "learning_rate": 3.96303755628413e-06, "loss": 0.0908, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 146826351.0, "step": 177 }, { "entropy": 0.5538177490234375, "epoch": 2.045977011494253, "grad_norm": 7.201807862926888, "learning_rate": 3.950142273927996e-06, "loss": 0.0837, "mean_token_accuracy": 0.977864584652707, "num_tokens": 147603914.0, "step": 178 }, { "entropy": 0.5235366821289062, "epoch": 2.057471264367816, "grad_norm": 4.912652652723082, "learning_rate": 3.937188580052518e-06, "loss": 0.08, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 148447329.0, "step": 179 }, { "entropy": 0.5220870971679688, "epoch": 2.0689655172413794, "grad_norm": 10.473323494894288, "learning_rate": 3.924176996430597e-06, "loss": 0.1432, "mean_token_accuracy": 0.955729169305414, "num_tokens": 149295756.0, "step": 180 }, { "entropy": 0.526580810546875, "epoch": 2.0804597701149423, "grad_norm": 8.04200314568136, "learning_rate": 3.911108047166924e-06, "loss": 0.0891, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 150134016.0, "step": 181 }, { "entropy": 0.5185546875, "epoch": 2.0919540229885056, "grad_norm": 4.355747854192094, "learning_rate": 3.897982258676867e-06, "loss": 0.0844, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 150973641.0, "step": 182 }, { "entropy": 0.533843994140625, "epoch": 2.103448275862069, "grad_norm": 7.793915871437659, "learning_rate": 3.8848001596652765e-06, "loss": 0.0865, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 151784661.0, "step": 183 }, { "entropy": 0.5393905639648438, "epoch": 2.1149425287356323, "grad_norm": 4.469170022546009, "learning_rate": 3.8715622811051754e-06, "loss": 0.0927, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 152598583.0, "step": 184 }, { "entropy": 0.527130126953125, "epoch": 2.1264367816091956, "grad_norm": 3.9563194078415385, "learning_rate": 3.858269156216383e-06, "loss": 0.082, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 153415586.0, "step": 185 }, { "entropy": 0.524139404296875, "epoch": 2.1379310344827585, "grad_norm": 4.965128765266531, "learning_rate": 3.844921320444031e-06, "loss": 0.0964, "mean_token_accuracy": 0.967447918606922, "num_tokens": 154257216.0, "step": 186 }, { "entropy": 0.5194931030273438, "epoch": 2.1494252873563218, "grad_norm": 12.18058735895402, "learning_rate": 3.8315193114369995e-06, "loss": 0.0965, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 155099957.0, "step": 187 }, { "entropy": 0.50604248046875, "epoch": 2.160919540229885, "grad_norm": 5.512495898345173, "learning_rate": 3.8180636690262565e-06, "loss": 0.0988, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 155957358.0, "step": 188 }, { "entropy": 0.5256881713867188, "epoch": 2.1724137931034484, "grad_norm": 11.30593507768243, "learning_rate": 3.804554935203115e-06, "loss": 0.1068, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 156762306.0, "step": 189 }, { "entropy": 0.5245895385742188, "epoch": 2.1839080459770113, "grad_norm": 4.544672386500914, "learning_rate": 3.7909936540974052e-06, "loss": 0.0734, "mean_token_accuracy": 0.977864584652707, "num_tokens": 157592554.0, "step": 190 }, { "entropy": 0.5061416625976562, "epoch": 2.1954022988505746, "grad_norm": 3.941196817466825, "learning_rate": 3.777380371955552e-06, "loss": 0.0772, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 158445907.0, "step": 191 }, { "entropy": 0.515838623046875, "epoch": 2.206896551724138, "grad_norm": 3.232650955550973, "learning_rate": 3.7637156371185744e-06, "loss": 0.0628, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 159250314.0, "step": 192 }, { "entropy": 0.5301513671875, "epoch": 2.218390804597701, "grad_norm": 5.776572070005293, "learning_rate": 3.7500000000000005e-06, "loss": 0.0821, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 160039518.0, "step": 193 }, { "entropy": 0.5020217895507812, "epoch": 2.2298850574712645, "grad_norm": 8.011873369364945, "learning_rate": 3.7362340130636926e-06, "loss": 0.0858, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 160893468.0, "step": 194 }, { "entropy": 0.5221786499023438, "epoch": 2.2413793103448274, "grad_norm": 4.253581057609115, "learning_rate": 3.7224182308015977e-06, "loss": 0.078, "mean_token_accuracy": 0.977864584652707, "num_tokens": 161694335.0, "step": 195 }, { "entropy": 0.5163421630859375, "epoch": 2.2528735632183907, "grad_norm": 11.977620750971552, "learning_rate": 3.7085532097114098e-06, "loss": 0.1257, "mean_token_accuracy": 0.955729169305414, "num_tokens": 162532877.0, "step": 196 }, { "entropy": 0.5193557739257812, "epoch": 2.264367816091954, "grad_norm": 4.100816778244049, "learning_rate": 3.6946395082741582e-06, "loss": 0.0741, "mean_token_accuracy": 0.977864584652707, "num_tokens": 163353544.0, "step": 197 }, { "entropy": 0.5131072998046875, "epoch": 2.2758620689655173, "grad_norm": 14.336855059492699, "learning_rate": 3.6806776869317074e-06, "loss": 0.1028, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 164189466.0, "step": 198 }, { "entropy": 0.522857666015625, "epoch": 2.2873563218390807, "grad_norm": 12.863083017991736, "learning_rate": 3.6666683080641846e-06, "loss": 0.1047, "mean_token_accuracy": 0.945312503259629, "num_tokens": 165024121.0, "step": 199 }, { "entropy": 0.5307998657226562, "epoch": 2.2988505747126435, "grad_norm": 5.384315613913392, "learning_rate": 3.6526119359673283e-06, "loss": 0.0825, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 165858531.0, "step": 200 }, { "entropy": 0.5401611328125, "epoch": 2.310344827586207, "grad_norm": 6.978944080668443, "learning_rate": 3.6385091368297582e-06, "loss": 0.0949, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 166672880.0, "step": 201 }, { "entropy": 0.5244674682617188, "epoch": 2.32183908045977, "grad_norm": 8.586521251755975, "learning_rate": 3.624360478710165e-06, "loss": 0.0979, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 167513008.0, "step": 202 }, { "entropy": 0.5154495239257812, "epoch": 2.3333333333333335, "grad_norm": 10.059252916770694, "learning_rate": 3.6101665315144357e-06, "loss": 0.1087, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 168386249.0, "step": 203 }, { "entropy": 0.5272903442382812, "epoch": 2.344827586206897, "grad_norm": 3.7220569133269326, "learning_rate": 3.595927866972694e-06, "loss": 0.0893, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 169217721.0, "step": 204 }, { "entropy": 0.5361862182617188, "epoch": 2.3563218390804597, "grad_norm": 8.14296764792559, "learning_rate": 3.581645058616271e-06, "loss": 0.0914, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 170033814.0, "step": 205 }, { "entropy": 0.5298843383789062, "epoch": 2.367816091954023, "grad_norm": 2.896667570609037, "learning_rate": 3.5673186817546047e-06, "loss": 0.0564, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 170852345.0, "step": 206 }, { "entropy": 0.5321426391601562, "epoch": 2.3793103448275863, "grad_norm": 6.510425360047998, "learning_rate": 3.552949313452067e-06, "loss": 0.0669, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 171671019.0, "step": 207 }, { "entropy": 0.517181396484375, "epoch": 2.3908045977011496, "grad_norm": 4.429647054928265, "learning_rate": 3.5385375325047167e-06, "loss": 0.0555, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 172531805.0, "step": 208 }, { "entropy": 0.5280838012695312, "epoch": 2.4022988505747125, "grad_norm": 3.9428408925354477, "learning_rate": 3.5240839194169885e-06, "loss": 0.0676, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 173363059.0, "step": 209 }, { "entropy": 0.5230560302734375, "epoch": 2.413793103448276, "grad_norm": 6.031126783984352, "learning_rate": 3.5095890563783124e-06, "loss": 0.0431, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 174211868.0, "step": 210 }, { "entropy": 0.52056884765625, "epoch": 2.425287356321839, "grad_norm": 4.468991885868761, "learning_rate": 3.4950535272396564e-06, "loss": 0.0619, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 175030480.0, "step": 211 }, { "entropy": 0.529876708984375, "epoch": 2.4367816091954024, "grad_norm": 9.239483704756537, "learning_rate": 3.480477917490014e-06, "loss": 0.0716, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 175843572.0, "step": 212 }, { "entropy": 0.5277099609375, "epoch": 2.4482758620689653, "grad_norm": 5.300163295816919, "learning_rate": 3.4658628142328215e-06, "loss": 0.0756, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 176656377.0, "step": 213 }, { "entropy": 0.523345947265625, "epoch": 2.4597701149425286, "grad_norm": 4.579054112244679, "learning_rate": 3.4512088061623077e-06, "loss": 0.0509, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 177483746.0, "step": 214 }, { "entropy": 0.523223876953125, "epoch": 2.471264367816092, "grad_norm": 5.040930541242033, "learning_rate": 3.436516483539781e-06, "loss": 0.0551, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 178330666.0, "step": 215 }, { "entropy": 0.531158447265625, "epoch": 2.4827586206896552, "grad_norm": 8.739883447359308, "learning_rate": 3.4217864381698523e-06, "loss": 0.0719, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 179149396.0, "step": 216 }, { "entropy": 0.5261764526367188, "epoch": 2.4942528735632186, "grad_norm": 11.494350665933359, "learning_rate": 3.4070192633766025e-06, "loss": 0.0428, "mean_token_accuracy": 0.989583333954215, "num_tokens": 179974795.0, "step": 217 }, { "entropy": 0.5234909057617188, "epoch": 2.5057471264367814, "grad_norm": 6.838876203026226, "learning_rate": 3.39221555397968e-06, "loss": 0.0796, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 180802086.0, "step": 218 }, { "entropy": 0.5234756469726562, "epoch": 2.5172413793103448, "grad_norm": 4.455755339288859, "learning_rate": 3.37737590627034e-06, "loss": 0.0805, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 181658273.0, "step": 219 }, { "entropy": 0.5229568481445312, "epoch": 2.528735632183908, "grad_norm": 3.4578598266072866, "learning_rate": 3.362500917987427e-06, "loss": 0.0415, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 182474385.0, "step": 220 }, { "entropy": 0.5162353515625, "epoch": 2.5402298850574714, "grad_norm": 3.3923382783923386, "learning_rate": 3.3475911882933014e-06, "loss": 0.045, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 183315758.0, "step": 221 }, { "entropy": 0.5235671997070312, "epoch": 2.5517241379310347, "grad_norm": 4.903114742385986, "learning_rate": 3.332647317749702e-06, "loss": 0.0466, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 184143585.0, "step": 222 }, { "entropy": 0.5148849487304688, "epoch": 2.5632183908045976, "grad_norm": 4.864354866386424, "learning_rate": 3.3176699082935546e-06, "loss": 0.0461, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 184985732.0, "step": 223 }, { "entropy": 0.5059356689453125, "epoch": 2.574712643678161, "grad_norm": 9.984487721834693, "learning_rate": 3.3026595632127274e-06, "loss": 0.0708, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 185828814.0, "step": 224 }, { "entropy": 0.5138702392578125, "epoch": 2.586206896551724, "grad_norm": 6.845004434710824, "learning_rate": 3.2876168871217322e-06, "loss": 0.0655, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 186649376.0, "step": 225 }, { "entropy": 0.493011474609375, "epoch": 2.5977011494252875, "grad_norm": 22.95951100464713, "learning_rate": 3.272542485937369e-06, "loss": 0.1239, "mean_token_accuracy": 0.955729169305414, "num_tokens": 187517356.0, "step": 226 }, { "entropy": 0.5219573974609375, "epoch": 2.609195402298851, "grad_norm": 14.8549297549626, "learning_rate": 3.2574369668543187e-06, "loss": 0.1119, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 188347541.0, "step": 227 }, { "entropy": 0.5112228393554688, "epoch": 2.6206896551724137, "grad_norm": 4.479017898039443, "learning_rate": 3.2423009383206876e-06, "loss": 0.0627, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 189198859.0, "step": 228 }, { "entropy": 0.511871337890625, "epoch": 2.632183908045977, "grad_norm": 7.490721295049219, "learning_rate": 3.227135010013498e-06, "loss": 0.0705, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 190033894.0, "step": 229 }, { "entropy": 0.514923095703125, "epoch": 2.6436781609195403, "grad_norm": 5.050493531281534, "learning_rate": 3.211939792814131e-06, "loss": 0.0602, "mean_token_accuracy": 0.977864584652707, "num_tokens": 190869827.0, "step": 230 }, { "entropy": 0.5310745239257812, "epoch": 2.655172413793103, "grad_norm": 4.861042871342369, "learning_rate": 3.19671589878372e-06, "loss": 0.0393, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 191679736.0, "step": 231 }, { "entropy": 0.5336990356445312, "epoch": 2.6666666666666665, "grad_norm": 6.075779640917305, "learning_rate": 3.1814639411384953e-06, "loss": 0.0594, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 192490031.0, "step": 232 }, { "entropy": 0.5216522216796875, "epoch": 2.67816091954023, "grad_norm": 3.9095981775216457, "learning_rate": 3.1661845342250874e-06, "loss": 0.0538, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 193317563.0, "step": 233 }, { "entropy": 0.5286178588867188, "epoch": 2.689655172413793, "grad_norm": 4.097740161891809, "learning_rate": 3.1508782934957804e-06, "loss": 0.0689, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 194121291.0, "step": 234 }, { "entropy": 0.5184707641601562, "epoch": 2.7011494252873565, "grad_norm": 2.965792573695505, "learning_rate": 3.1355458354837183e-06, "loss": 0.0435, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 194990367.0, "step": 235 }, { "entropy": 0.5118179321289062, "epoch": 2.7126436781609193, "grad_norm": 3.9101911113146133, "learning_rate": 3.1201877777780724e-06, "loss": 0.0577, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 195834396.0, "step": 236 }, { "entropy": 0.518096923828125, "epoch": 2.7241379310344827, "grad_norm": 4.9106691954268715, "learning_rate": 3.1048047389991693e-06, "loss": 0.0442, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 196680761.0, "step": 237 }, { "entropy": 0.5065536499023438, "epoch": 2.735632183908046, "grad_norm": 3.9425883682767138, "learning_rate": 3.089397338773569e-06, "loss": 0.0354, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 197540695.0, "step": 238 }, { "entropy": 0.5074462890625, "epoch": 2.7471264367816093, "grad_norm": 3.8639716797812973, "learning_rate": 3.0739661977091027e-06, "loss": 0.03, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 198381995.0, "step": 239 }, { "entropy": 0.5099868774414062, "epoch": 2.7586206896551726, "grad_norm": 5.774630120152915, "learning_rate": 3.0585119373698858e-06, "loss": 0.034, "mean_token_accuracy": 0.989583333954215, "num_tokens": 199222309.0, "step": 240 }, { "entropy": 0.5069580078125, "epoch": 2.7701149425287355, "grad_norm": 3.1153325710771202, "learning_rate": 3.04303518025127e-06, "loss": 0.032, "mean_token_accuracy": 0.989583333954215, "num_tokens": 200069860.0, "step": 241 }, { "entropy": 0.5121688842773438, "epoch": 2.781609195402299, "grad_norm": 4.199338548501109, "learning_rate": 3.0275365497547747e-06, "loss": 0.0341, "mean_token_accuracy": 0.989583333954215, "num_tokens": 200911603.0, "step": 242 }, { "entropy": 0.5169677734375, "epoch": 2.793103448275862, "grad_norm": 5.348906239774751, "learning_rate": 3.012016670162977e-06, "loss": 0.044, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 201716379.0, "step": 243 }, { "entropy": 0.5079116821289062, "epoch": 2.8045977011494254, "grad_norm": 8.098700249722082, "learning_rate": 2.9964761666143638e-06, "loss": 0.0352, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 202537011.0, "step": 244 }, { "entropy": 0.5113906860351562, "epoch": 2.8160919540229887, "grad_norm": 6.415054459447643, "learning_rate": 2.980915665078153e-06, "loss": 0.0494, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 203368643.0, "step": 245 }, { "entropy": 0.52069091796875, "epoch": 2.8275862068965516, "grad_norm": 4.0837268974382885, "learning_rate": 2.9653357923290753e-06, "loss": 0.0264, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 204173970.0, "step": 246 }, { "entropy": 0.49139404296875, "epoch": 2.839080459770115, "grad_norm": 11.87980766761474, "learning_rate": 2.949737175922135e-06, "loss": 0.0641, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 205069359.0, "step": 247 }, { "entropy": 0.5181503295898438, "epoch": 2.8505747126436782, "grad_norm": 5.55092789641183, "learning_rate": 2.9341204441673267e-06, "loss": 0.0569, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 205863015.0, "step": 248 }, { "entropy": 0.5153961181640625, "epoch": 2.862068965517241, "grad_norm": 11.249207750194575, "learning_rate": 2.9184862261043272e-06, "loss": 0.0652, "mean_token_accuracy": 0.977864584652707, "num_tokens": 206693116.0, "step": 249 }, { "entropy": 0.5162200927734375, "epoch": 2.873563218390805, "grad_norm": 11.262000025655086, "learning_rate": 2.902835151477161e-06, "loss": 0.0772, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 207535238.0, "step": 250 }, { "entropy": 0.49965667724609375, "epoch": 2.8850574712643677, "grad_norm": 5.331206524136488, "learning_rate": 2.887167850708831e-06, "loss": 0.0595, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 208414225.0, "step": 251 }, { "entropy": 0.509613037109375, "epoch": 2.896551724137931, "grad_norm": 10.395838999195405, "learning_rate": 2.8714849548759293e-06, "loss": 0.0749, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 209267716.0, "step": 252 }, { "entropy": 0.5212478637695312, "epoch": 2.9080459770114944, "grad_norm": 3.4925992718532903, "learning_rate": 2.8557870956832135e-06, "loss": 0.0441, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 210079116.0, "step": 253 }, { "entropy": 0.52960205078125, "epoch": 2.9195402298850572, "grad_norm": 3.0763445569145795, "learning_rate": 2.840074905438161e-06, "loss": 0.0356, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 210868372.0, "step": 254 }, { "entropy": 0.509857177734375, "epoch": 2.9310344827586206, "grad_norm": 6.085180472974132, "learning_rate": 2.8243490170255046e-06, "loss": 0.0374, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 211719378.0, "step": 255 }, { "entropy": 0.52435302734375, "epoch": 2.942528735632184, "grad_norm": 4.342033965891391, "learning_rate": 2.808610063881737e-06, "loss": 0.0367, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 212504744.0, "step": 256 }, { "entropy": 0.5114364624023438, "epoch": 2.954022988505747, "grad_norm": 4.53595707345555, "learning_rate": 2.792858679969596e-06, "loss": 0.041, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 213336715.0, "step": 257 }, { "entropy": 0.5150146484375, "epoch": 2.9655172413793105, "grad_norm": 3.532952348106941, "learning_rate": 2.7770954997525277e-06, "loss": 0.0301, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 214157360.0, "step": 258 }, { "entropy": 0.5124893188476562, "epoch": 2.9770114942528734, "grad_norm": 5.374052711219377, "learning_rate": 2.761321158169134e-06, "loss": 0.0333, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 214981518.0, "step": 259 }, { "entropy": 0.5047225952148438, "epoch": 2.9885057471264367, "grad_norm": 6.297842060515772, "learning_rate": 2.745536290607593e-06, "loss": 0.0416, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 215835954.0, "step": 260 }, { "entropy": 0.5225372314453125, "epoch": 3.0, "grad_norm": 5.830015018464227, "learning_rate": 2.729741532880069e-06, "loss": 0.0578, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 216642134.0, "step": 261 }, { "entropy": 0.5021514892578125, "epoch": 3.0114942528735633, "grad_norm": 2.477209280475651, "learning_rate": 2.7139375211971e-06, "loss": 0.0143, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 217503099.0, "step": 262 }, { "entropy": 0.5126724243164062, "epoch": 3.0229885057471266, "grad_norm": 3.8701219743611466, "learning_rate": 2.6981248921419713e-06, "loss": 0.0228, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 218333703.0, "step": 263 }, { "entropy": 0.5178909301757812, "epoch": 3.0344827586206895, "grad_norm": 2.7967744739077642, "learning_rate": 2.682304282645077e-06, "loss": 0.0163, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 219141954.0, "step": 264 }, { "entropy": 0.5064697265625, "epoch": 3.045977011494253, "grad_norm": 2.8493437564285675, "learning_rate": 2.66647632995826e-06, "loss": 0.0153, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 219981675.0, "step": 265 }, { "entropy": 0.5040740966796875, "epoch": 3.057471264367816, "grad_norm": 3.39106083775523, "learning_rate": 2.6506416716291466e-06, "loss": 0.0182, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 220814651.0, "step": 266 }, { "entropy": 0.5218353271484375, "epoch": 3.0689655172413794, "grad_norm": 3.245992716985913, "learning_rate": 2.634800945475465e-06, "loss": 0.0359, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 221625766.0, "step": 267 }, { "entropy": 0.515411376953125, "epoch": 3.0804597701149423, "grad_norm": 3.4758561756981003, "learning_rate": 2.6189547895593565e-06, "loss": 0.0272, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 222429577.0, "step": 268 }, { "entropy": 0.5009841918945312, "epoch": 3.0919540229885056, "grad_norm": 5.9657298113817445, "learning_rate": 2.6031038421616684e-06, "loss": 0.0216, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 223278965.0, "step": 269 }, { "entropy": 0.5102462768554688, "epoch": 3.103448275862069, "grad_norm": 4.077413121693344, "learning_rate": 2.587248741756253e-06, "loss": 0.021, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 224122111.0, "step": 270 }, { "entropy": 0.5262985229492188, "epoch": 3.1149425287356323, "grad_norm": 4.99945914752149, "learning_rate": 2.5713901269842405e-06, "loss": 0.0286, "mean_token_accuracy": 0.989583333954215, "num_tokens": 224930403.0, "step": 271 }, { "entropy": 0.4971160888671875, "epoch": 3.1264367816091956, "grad_norm": 4.210562762990907, "learning_rate": 2.555528636628324e-06, "loss": 0.0334, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 225800782.0, "step": 272 }, { "entropy": 0.5353775024414062, "epoch": 3.1379310344827585, "grad_norm": 3.3827078358399234, "learning_rate": 2.53966490958702e-06, "loss": 0.0173, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 226565726.0, "step": 273 }, { "entropy": 0.5124588012695312, "epoch": 3.1494252873563218, "grad_norm": 3.6515337585771617, "learning_rate": 2.5237995848489422e-06, "loss": 0.0204, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 227384084.0, "step": 274 }, { "entropy": 0.5161056518554688, "epoch": 3.160919540229885, "grad_norm": 2.5107176292809523, "learning_rate": 2.507933301467056e-06, "loss": 0.0118, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 228221106.0, "step": 275 }, { "entropy": 0.5079498291015625, "epoch": 3.1724137931034484, "grad_norm": 4.072490672739514, "learning_rate": 2.4920666985329446e-06, "loss": 0.0231, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 229063105.0, "step": 276 }, { "entropy": 0.5137939453125, "epoch": 3.1839080459770113, "grad_norm": 4.772256728134127, "learning_rate": 2.4762004151510586e-06, "loss": 0.0272, "mean_token_accuracy": 0.989583333954215, "num_tokens": 229887163.0, "step": 277 }, { "entropy": 0.5074691772460938, "epoch": 3.1954022988505746, "grad_norm": 2.5146705491674632, "learning_rate": 2.4603350904129802e-06, "loss": 0.0165, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 230714460.0, "step": 278 }, { "entropy": 0.5012588500976562, "epoch": 3.206896551724138, "grad_norm": 3.4523116738901187, "learning_rate": 2.4444713633716764e-06, "loss": 0.0214, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 231546214.0, "step": 279 }, { "entropy": 0.5137557983398438, "epoch": 3.218390804597701, "grad_norm": 4.103312747798813, "learning_rate": 2.42860987301576e-06, "loss": 0.0147, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 232367417.0, "step": 280 }, { "entropy": 0.5189132690429688, "epoch": 3.2298850574712645, "grad_norm": 3.0711187257517323, "learning_rate": 2.4127512582437486e-06, "loss": 0.0197, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 233171390.0, "step": 281 }, { "entropy": 0.4828948974609375, "epoch": 3.2413793103448274, "grad_norm": 4.593361566496384, "learning_rate": 2.3968961578383324e-06, "loss": 0.0242, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 234048614.0, "step": 282 }, { "entropy": 0.49707794189453125, "epoch": 3.2528735632183907, "grad_norm": 4.369720789995865, "learning_rate": 2.3810452104406444e-06, "loss": 0.0181, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 234880964.0, "step": 283 }, { "entropy": 0.49500274658203125, "epoch": 3.264367816091954, "grad_norm": 4.068902757291551, "learning_rate": 2.3651990545245357e-06, "loss": 0.0188, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 235742677.0, "step": 284 }, { "entropy": 0.5010452270507812, "epoch": 3.2758620689655173, "grad_norm": 4.533741620441784, "learning_rate": 2.3493583283708542e-06, "loss": 0.0197, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 236562341.0, "step": 285 }, { "entropy": 0.5067520141601562, "epoch": 3.2873563218390807, "grad_norm": 7.309205026590191, "learning_rate": 2.3335236700417404e-06, "loss": 0.0294, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 237376240.0, "step": 286 }, { "entropy": 0.49813079833984375, "epoch": 3.2988505747126435, "grad_norm": 6.265890191493295, "learning_rate": 2.3176957173549236e-06, "loss": 0.0179, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 238205161.0, "step": 287 }, { "entropy": 0.5123062133789062, "epoch": 3.310344827586207, "grad_norm": 5.357876728974506, "learning_rate": 2.3018751078580287e-06, "loss": 0.0301, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 239018120.0, "step": 288 }, { "entropy": 0.50250244140625, "epoch": 3.32183908045977, "grad_norm": 5.842965328064792, "learning_rate": 2.2860624788029013e-06, "loss": 0.0192, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 239830755.0, "step": 289 }, { "entropy": 0.48967742919921875, "epoch": 3.3333333333333335, "grad_norm": 6.615246255894702, "learning_rate": 2.2702584671199317e-06, "loss": 0.0152, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 240669859.0, "step": 290 }, { "entropy": 0.49662017822265625, "epoch": 3.344827586206897, "grad_norm": 2.6840632671541362, "learning_rate": 2.2544637093924072e-06, "loss": 0.0181, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 241488468.0, "step": 291 }, { "entropy": 0.49695587158203125, "epoch": 3.3563218390804597, "grad_norm": 6.090789224137067, "learning_rate": 2.238678841830867e-06, "loss": 0.0184, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 242306679.0, "step": 292 }, { "entropy": 0.5002517700195312, "epoch": 3.367816091954023, "grad_norm": 9.067054425714629, "learning_rate": 2.2229045002474727e-06, "loss": 0.028, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 243130959.0, "step": 293 }, { "entropy": 0.4901580810546875, "epoch": 3.3793103448275863, "grad_norm": 6.6957071542612425, "learning_rate": 2.2071413200304046e-06, "loss": 0.021, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 243963959.0, "step": 294 }, { "entropy": 0.49200439453125, "epoch": 3.3908045977011496, "grad_norm": 5.790677099390474, "learning_rate": 2.1913899361182634e-06, "loss": 0.0217, "mean_token_accuracy": 0.989583333954215, "num_tokens": 244789315.0, "step": 295 }, { "entropy": 0.48865509033203125, "epoch": 3.4022988505747125, "grad_norm": 6.64954441446596, "learning_rate": 2.1756509829744958e-06, "loss": 0.0309, "mean_token_accuracy": 0.989583333954215, "num_tokens": 245647468.0, "step": 296 }, { "entropy": 0.49068450927734375, "epoch": 3.413793103448276, "grad_norm": 5.000268942128174, "learning_rate": 2.1599250945618404e-06, "loss": 0.031, "mean_token_accuracy": 0.989583333954215, "num_tokens": 246506392.0, "step": 297 }, { "entropy": 0.5043487548828125, "epoch": 3.425287356321839, "grad_norm": 6.690882034959598, "learning_rate": 2.1442129043167877e-06, "loss": 0.0247, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 247317246.0, "step": 298 }, { "entropy": 0.48810577392578125, "epoch": 3.4367816091954024, "grad_norm": 7.4751141517356166, "learning_rate": 2.128515045124071e-06, "loss": 0.0303, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 248176152.0, "step": 299 }, { "entropy": 0.48333740234375, "epoch": 3.4482758620689653, "grad_norm": 2.8897721353550296, "learning_rate": 2.1128321492911697e-06, "loss": 0.0092, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 249019061.0, "step": 300 }, { "entropy": 0.48685455322265625, "epoch": 3.4597701149425286, "grad_norm": 3.6616455523506777, "learning_rate": 2.0971648485228404e-06, "loss": 0.0231, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 249879936.0, "step": 301 }, { "entropy": 0.4935455322265625, "epoch": 3.471264367816092, "grad_norm": 2.0624568242402432, "learning_rate": 2.0815137738956736e-06, "loss": 0.015, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 250720828.0, "step": 302 }, { "entropy": 0.485931396484375, "epoch": 3.4827586206896552, "grad_norm": 3.7427616919674827, "learning_rate": 2.0658795558326745e-06, "loss": 0.0199, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 251591480.0, "step": 303 }, { "entropy": 0.49802398681640625, "epoch": 3.4942528735632186, "grad_norm": 2.669949059595247, "learning_rate": 2.0502628240778655e-06, "loss": 0.013, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 252425658.0, "step": 304 }, { "entropy": 0.4919586181640625, "epoch": 3.5057471264367814, "grad_norm": 4.948406162990609, "learning_rate": 2.034664207670925e-06, "loss": 0.0166, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 253260881.0, "step": 305 }, { "entropy": 0.49547576904296875, "epoch": 3.5172413793103448, "grad_norm": 5.26095619649171, "learning_rate": 2.019084334921849e-06, "loss": 0.0165, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 254103428.0, "step": 306 }, { "entropy": 0.49622344970703125, "epoch": 3.528735632183908, "grad_norm": 2.0304772167522755, "learning_rate": 2.003523833385637e-06, "loss": 0.0094, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 254934799.0, "step": 307 }, { "entropy": 0.49666595458984375, "epoch": 3.5402298850574714, "grad_norm": 3.2058148127523083, "learning_rate": 1.987983329837024e-06, "loss": 0.0091, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 255740355.0, "step": 308 }, { "entropy": 0.484710693359375, "epoch": 3.5517241379310347, "grad_norm": 5.056038254220789, "learning_rate": 1.972463450245226e-06, "loss": 0.0201, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 256592207.0, "step": 309 }, { "entropy": 0.49091339111328125, "epoch": 3.5632183908045976, "grad_norm": 4.295846823386714, "learning_rate": 1.956964819748731e-06, "loss": 0.0186, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 257394928.0, "step": 310 }, { "entropy": 0.484405517578125, "epoch": 3.574712643678161, "grad_norm": 5.136383467949302, "learning_rate": 1.9414880626301147e-06, "loss": 0.0114, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 258218560.0, "step": 311 }, { "entropy": 0.47745513916015625, "epoch": 3.586206896551724, "grad_norm": 1.5987930235387136, "learning_rate": 1.9260338022908972e-06, "loss": 0.0051, "mean_token_accuracy": 1.0, "num_tokens": 259076886.0, "step": 312 }, { "entropy": 0.490386962890625, "epoch": 3.5977011494252875, "grad_norm": 5.630174223879903, "learning_rate": 1.9106026612264316e-06, "loss": 0.0157, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 259903431.0, "step": 313 }, { "entropy": 0.4911651611328125, "epoch": 3.609195402298851, "grad_norm": 3.594997086155672, "learning_rate": 1.895195261000831e-06, "loss": 0.0148, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 260724089.0, "step": 314 }, { "entropy": 0.4875335693359375, "epoch": 3.6206896551724137, "grad_norm": 7.1001251451875955, "learning_rate": 1.8798122222219288e-06, "loss": 0.0182, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 261570168.0, "step": 315 }, { "entropy": 0.49361419677734375, "epoch": 3.632183908045977, "grad_norm": 1.8488403064040235, "learning_rate": 1.8644541645162834e-06, "loss": 0.005, "mean_token_accuracy": 1.0, "num_tokens": 262377615.0, "step": 316 }, { "entropy": 0.47705841064453125, "epoch": 3.6436781609195403, "grad_norm": 8.419399542330668, "learning_rate": 1.84912170650422e-06, "loss": 0.017, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 263233842.0, "step": 317 }, { "entropy": 0.4798583984375, "epoch": 3.655172413793103, "grad_norm": 6.948749207718704, "learning_rate": 1.833815465774913e-06, "loss": 0.0214, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 264054307.0, "step": 318 }, { "entropy": 0.48322296142578125, "epoch": 3.6666666666666665, "grad_norm": 4.4143868380602544, "learning_rate": 1.818536058861506e-06, "loss": 0.0143, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 264889320.0, "step": 319 }, { "entropy": 0.48094940185546875, "epoch": 3.67816091954023, "grad_norm": 7.51629286227342, "learning_rate": 1.803284101216281e-06, "loss": 0.0205, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 265727293.0, "step": 320 }, { "entropy": 0.48848724365234375, "epoch": 3.689655172413793, "grad_norm": 4.390187366525572, "learning_rate": 1.7880602071858694e-06, "loss": 0.0248, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 266554947.0, "step": 321 }, { "entropy": 0.5014801025390625, "epoch": 3.7011494252873565, "grad_norm": 4.765097706705711, "learning_rate": 1.7728649899865024e-06, "loss": 0.0076, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 267364160.0, "step": 322 }, { "entropy": 0.47812652587890625, "epoch": 3.7126436781609193, "grad_norm": 4.601028573147119, "learning_rate": 1.7576990616793139e-06, "loss": 0.0124, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 268207518.0, "step": 323 }, { "entropy": 0.5039901733398438, "epoch": 3.7241379310344827, "grad_norm": 9.345319142946543, "learning_rate": 1.7425630331456821e-06, "loss": 0.0297, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 268994732.0, "step": 324 }, { "entropy": 0.5049362182617188, "epoch": 3.735632183908046, "grad_norm": 10.813891797521325, "learning_rate": 1.7274575140626318e-06, "loss": 0.0305, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 269789989.0, "step": 325 }, { "entropy": 0.4865875244140625, "epoch": 3.7471264367816093, "grad_norm": 3.732890733936591, "learning_rate": 1.7123831128782686e-06, "loss": 0.0166, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 270633429.0, "step": 326 }, { "entropy": 0.487823486328125, "epoch": 3.7586206896551726, "grad_norm": 3.952992845746983, "learning_rate": 1.697340436787273e-06, "loss": 0.0193, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 271480889.0, "step": 327 }, { "entropy": 0.49646759033203125, "epoch": 3.7701149425287355, "grad_norm": 6.939199060751946, "learning_rate": 1.6823300917064462e-06, "loss": 0.0143, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 272305320.0, "step": 328 }, { "entropy": 0.4934539794921875, "epoch": 3.781609195402299, "grad_norm": 5.40884515086912, "learning_rate": 1.6673526822502982e-06, "loss": 0.0149, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 273117988.0, "step": 329 }, { "entropy": 0.48311614990234375, "epoch": 3.793103448275862, "grad_norm": 3.129709097075508, "learning_rate": 1.6524088117066984e-06, "loss": 0.0094, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 273990464.0, "step": 330 }, { "entropy": 0.491790771484375, "epoch": 3.8045977011494254, "grad_norm": 3.8874739362962645, "learning_rate": 1.637499082012574e-06, "loss": 0.0139, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 274810040.0, "step": 331 }, { "entropy": 0.5032882690429688, "epoch": 3.8160919540229887, "grad_norm": 4.149015091463172, "learning_rate": 1.6226240937296617e-06, "loss": 0.0231, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 275603319.0, "step": 332 }, { "entropy": 0.49298858642578125, "epoch": 3.8275862068965516, "grad_norm": 3.6377764732920284, "learning_rate": 1.6077844460203207e-06, "loss": 0.0098, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 276446052.0, "step": 333 }, { "entropy": 0.5046768188476562, "epoch": 3.839080459770115, "grad_norm": 2.0960585306443353, "learning_rate": 1.5929807366233979e-06, "loss": 0.0077, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 277257667.0, "step": 334 }, { "entropy": 0.5001907348632812, "epoch": 3.8505747126436782, "grad_norm": 1.1308830838111201, "learning_rate": 1.5782135618301486e-06, "loss": 0.0102, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 278073776.0, "step": 335 }, { "entropy": 0.49394989013671875, "epoch": 3.862068965517241, "grad_norm": 3.0083262718142056, "learning_rate": 1.56348351646022e-06, "loss": 0.007, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 278928254.0, "step": 336 }, { "entropy": 0.49828338623046875, "epoch": 3.873563218390805, "grad_norm": 5.123947951886666, "learning_rate": 1.5487911938376925e-06, "loss": 0.0146, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 279751330.0, "step": 337 }, { "entropy": 0.489593505859375, "epoch": 3.8850574712643677, "grad_norm": 4.035251892279188, "learning_rate": 1.5341371857671782e-06, "loss": 0.0109, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 280570344.0, "step": 338 }, { "entropy": 0.48815155029296875, "epoch": 3.896551724137931, "grad_norm": 5.476176656071363, "learning_rate": 1.5195220825099863e-06, "loss": 0.0167, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 281410149.0, "step": 339 }, { "entropy": 0.48846435546875, "epoch": 3.9080459770114944, "grad_norm": 6.167355254742196, "learning_rate": 1.5049464727603453e-06, "loss": 0.0162, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 282246542.0, "step": 340 }, { "entropy": 0.4861907958984375, "epoch": 3.9195402298850572, "grad_norm": 3.2444266602755376, "learning_rate": 1.4904109436216885e-06, "loss": 0.0111, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 283099687.0, "step": 341 }, { "entropy": 0.4877471923828125, "epoch": 3.9310344827586206, "grad_norm": 2.976257301458973, "learning_rate": 1.475916080583012e-06, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 283924877.0, "step": 342 }, { "entropy": 0.4852752685546875, "epoch": 3.942528735632184, "grad_norm": 5.111111810610919, "learning_rate": 1.4614624674952843e-06, "loss": 0.0069, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 284789322.0, "step": 343 }, { "entropy": 0.490936279296875, "epoch": 3.954022988505747, "grad_norm": 8.373925102066526, "learning_rate": 1.4470506865479337e-06, "loss": 0.0114, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 285618482.0, "step": 344 }, { "entropy": 0.49163818359375, "epoch": 3.9655172413793105, "grad_norm": 1.6045847963496473, "learning_rate": 1.4326813182453959e-06, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 286442938.0, "step": 345 }, { "entropy": 0.4775238037109375, "epoch": 3.9770114942528734, "grad_norm": 6.809758589293284, "learning_rate": 1.4183549413837288e-06, "loss": 0.0166, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 287300227.0, "step": 346 }, { "entropy": 0.49134063720703125, "epoch": 3.9885057471264367, "grad_norm": 2.7969846655833166, "learning_rate": 1.4040721330273063e-06, "loss": 0.0179, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 288119497.0, "step": 347 }, { "entropy": 0.478973388671875, "epoch": 4.0, "grad_norm": 4.820339118038163, "learning_rate": 1.3898334684855647e-06, "loss": 0.0223, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 288974058.0, "step": 348 }, { "entropy": 0.486541748046875, "epoch": 4.011494252873563, "grad_norm": 2.0004363087793915, "learning_rate": 1.375639521289836e-06, "loss": 0.0036, "mean_token_accuracy": 1.0, "num_tokens": 289813648.0, "step": 349 }, { "entropy": 0.49737548828125, "epoch": 4.022988505747127, "grad_norm": 5.139419062447743, "learning_rate": 1.3614908631702435e-06, "loss": 0.0061, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 290636567.0, "step": 350 }, { "entropy": 0.49163818359375, "epoch": 4.0344827586206895, "grad_norm": 0.788318794526319, "learning_rate": 1.3473880640326725e-06, "loss": 0.0026, "mean_token_accuracy": 1.0, "num_tokens": 291479050.0, "step": 351 }, { "entropy": 0.48863983154296875, "epoch": 4.045977011494253, "grad_norm": 2.0230698337679174, "learning_rate": 1.3333316919358159e-06, "loss": 0.0037, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 292300777.0, "step": 352 }, { "entropy": 0.49753570556640625, "epoch": 4.057471264367816, "grad_norm": 0.6796429821645044, "learning_rate": 1.3193223130682937e-06, "loss": 0.0098, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 293090581.0, "step": 353 }, { "entropy": 0.47222900390625, "epoch": 4.068965517241379, "grad_norm": 6.014376626856276, "learning_rate": 1.3053604917258428e-06, "loss": 0.0207, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 293972611.0, "step": 354 }, { "entropy": 0.4964752197265625, "epoch": 4.080459770114943, "grad_norm": 0.6912483480930615, "learning_rate": 1.2914467902885902e-06, "loss": 0.0111, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 294794906.0, "step": 355 }, { "entropy": 0.4761962890625, "epoch": 4.091954022988506, "grad_norm": 3.135946056259954, "learning_rate": 1.2775817691984032e-06, "loss": 0.0133, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 295656117.0, "step": 356 }, { "entropy": 0.4886016845703125, "epoch": 4.103448275862069, "grad_norm": 1.7539444927331345, "learning_rate": 1.2637659869363085e-06, "loss": 0.0117, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 296488812.0, "step": 357 }, { "entropy": 0.486907958984375, "epoch": 4.114942528735632, "grad_norm": 1.3961250048157883, "learning_rate": 1.2500000000000007e-06, "loss": 0.0043, "mean_token_accuracy": 1.0, "num_tokens": 297344919.0, "step": 358 }, { "entropy": 0.47713470458984375, "epoch": 4.126436781609195, "grad_norm": 1.0425696217158198, "learning_rate": 1.2362843628814267e-06, "loss": 0.0031, "mean_token_accuracy": 1.0, "num_tokens": 298218199.0, "step": 359 }, { "entropy": 0.4909515380859375, "epoch": 4.137931034482759, "grad_norm": 1.5089572088275034, "learning_rate": 1.222619628044449e-06, "loss": 0.0064, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 299059349.0, "step": 360 }, { "entropy": 0.5058059692382812, "epoch": 4.149425287356322, "grad_norm": 3.939531795615367, "learning_rate": 1.2090063459025956e-06, "loss": 0.0114, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 299836899.0, "step": 361 }, { "entropy": 0.5086669921875, "epoch": 4.160919540229885, "grad_norm": 5.465372506142496, "learning_rate": 1.1954450647968856e-06, "loss": 0.0159, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 300629307.0, "step": 362 }, { "entropy": 0.49275970458984375, "epoch": 4.172413793103448, "grad_norm": 1.2454373437193944, "learning_rate": 1.181936330973744e-06, "loss": 0.0089, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 301434572.0, "step": 363 }, { "entropy": 0.48398590087890625, "epoch": 4.183908045977011, "grad_norm": 0.8322721421543683, "learning_rate": 1.1684806885630003e-06, "loss": 0.0108, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 302271254.0, "step": 364 }, { "entropy": 0.49010467529296875, "epoch": 4.195402298850575, "grad_norm": 0.7220174084022981, "learning_rate": 1.155078679555969e-06, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 303121365.0, "step": 365 }, { "entropy": 0.4936981201171875, "epoch": 4.206896551724138, "grad_norm": 1.6057246636473406, "learning_rate": 1.1417308437836181e-06, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 303940576.0, "step": 366 }, { "entropy": 0.48703765869140625, "epoch": 4.218390804597701, "grad_norm": 3.260885681236724, "learning_rate": 1.1284377188948258e-06, "loss": 0.0039, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 304779295.0, "step": 367 }, { "entropy": 0.486724853515625, "epoch": 4.2298850574712645, "grad_norm": 1.637009339887056, "learning_rate": 1.1151998403347245e-06, "loss": 0.0098, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 305615268.0, "step": 368 }, { "entropy": 0.48380279541015625, "epoch": 4.241379310344827, "grad_norm": 3.577848153648759, "learning_rate": 1.1020177413231334e-06, "loss": 0.0078, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 306462446.0, "step": 369 }, { "entropy": 0.48940277099609375, "epoch": 4.252873563218391, "grad_norm": 2.527345402012226, "learning_rate": 1.0888919528330778e-06, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 307285095.0, "step": 370 }, { "entropy": 0.498809814453125, "epoch": 4.264367816091954, "grad_norm": 0.7591305134467865, "learning_rate": 1.0758230035694031e-06, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 308089063.0, "step": 371 }, { "entropy": 0.47843170166015625, "epoch": 4.275862068965517, "grad_norm": 1.3490864229676036, "learning_rate": 1.062811419947482e-06, "loss": 0.0113, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 308928800.0, "step": 372 }, { "entropy": 0.5036239624023438, "epoch": 4.287356321839081, "grad_norm": 1.4383283661296111, "learning_rate": 1.049857726072005e-06, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 309730211.0, "step": 373 }, { "entropy": 0.4854278564453125, "epoch": 4.2988505747126435, "grad_norm": 6.081241592780342, "learning_rate": 1.036962443715872e-06, "loss": 0.0038, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 310582388.0, "step": 374 }, { "entropy": 0.47245025634765625, "epoch": 4.310344827586207, "grad_norm": 4.195343639160105, "learning_rate": 1.0241260922991761e-06, "loss": 0.0109, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 311440338.0, "step": 375 }, { "entropy": 0.48504638671875, "epoch": 4.32183908045977, "grad_norm": 1.4487989469262141, "learning_rate": 1.0113491888682802e-06, "loss": 0.0036, "mean_token_accuracy": 1.0, "num_tokens": 312258986.0, "step": 376 }, { "entropy": 0.492950439453125, "epoch": 4.333333333333333, "grad_norm": 4.518532952463173, "learning_rate": 9.986322480749926e-07, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 313058956.0, "step": 377 }, { "entropy": 0.4766845703125, "epoch": 4.344827586206897, "grad_norm": 6.1354639633433115, "learning_rate": 9.85975782155834e-07, "loss": 0.0233, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 313917084.0, "step": 378 }, { "entropy": 0.48448944091796875, "epoch": 4.35632183908046, "grad_norm": 6.820421188466488, "learning_rate": 9.733803009114045e-07, "loss": 0.0025, "mean_token_accuracy": 1.0, "num_tokens": 314751178.0, "step": 379 }, { "entropy": 0.5024642944335938, "epoch": 4.3678160919540225, "grad_norm": 3.825664202457435, "learning_rate": 9.608463116858544e-07, "loss": 0.006, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 315533124.0, "step": 380 }, { "entropy": 0.489990234375, "epoch": 4.379310344827586, "grad_norm": 0.3606406189949594, "learning_rate": 9.483743193464409e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 316351070.0, "step": 381 }, { "entropy": 0.4849853515625, "epoch": 4.390804597701149, "grad_norm": 4.022953332605156, "learning_rate": 9.359648262631962e-07, "loss": 0.0076, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 317193504.0, "step": 382 }, { "entropy": 0.4813385009765625, "epoch": 4.402298850574713, "grad_norm": 0.3387422920732581, "learning_rate": 9.236183322886946e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 318022047.0, "step": 383 }, { "entropy": 0.48401641845703125, "epoch": 4.413793103448276, "grad_norm": 2.361175377691583, "learning_rate": 9.113353347379097e-07, "loss": 0.0038, "mean_token_accuracy": 1.0, "num_tokens": 318833334.0, "step": 384 }, { "entropy": 0.48177337646484375, "epoch": 4.425287356321839, "grad_norm": 3.210148443515568, "learning_rate": 8.991163283681945e-07, "loss": 0.0031, "mean_token_accuracy": 1.0, "num_tokens": 319670458.0, "step": 385 }, { "entropy": 0.4922943115234375, "epoch": 4.436781609195402, "grad_norm": 0.8534452783712402, "learning_rate": 8.869618053593429e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 320463804.0, "step": 386 }, { "entropy": 0.4735107421875, "epoch": 4.448275862068965, "grad_norm": 1.2810358599901657, "learning_rate": 8.748722552937688e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 321305315.0, "step": 387 }, { "entropy": 0.49753570556640625, "epoch": 4.459770114942529, "grad_norm": 0.3022749679325425, "learning_rate": 8.628481651367876e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 322115329.0, "step": 388 }, { "entropy": 0.478118896484375, "epoch": 4.471264367816092, "grad_norm": 3.9075388394359165, "learning_rate": 8.508900192169964e-07, "loss": 0.0155, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 322943394.0, "step": 389 }, { "entropy": 0.48065948486328125, "epoch": 4.482758620689655, "grad_norm": 2.5853524966903825, "learning_rate": 8.389982992067688e-07, "loss": 0.0091, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 323762317.0, "step": 390 }, { "entropy": 0.47959136962890625, "epoch": 4.494252873563219, "grad_norm": 0.31803299256569667, "learning_rate": 8.271734841028553e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 324614783.0, "step": 391 }, { "entropy": 0.46380615234375, "epoch": 4.505747126436781, "grad_norm": 1.4702478415777527, "learning_rate": 8.154160502070804e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 325487590.0, "step": 392 }, { "entropy": 0.4808197021484375, "epoch": 4.517241379310345, "grad_norm": 0.3626143146323085, "learning_rate": 8.037264711071699e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 326309233.0, "step": 393 }, { "entropy": 0.48209381103515625, "epoch": 4.528735632183908, "grad_norm": 4.1348271243110215, "learning_rate": 7.921052176576643e-07, "loss": 0.0108, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 327109254.0, "step": 394 }, { "entropy": 0.4883575439453125, "epoch": 4.540229885057471, "grad_norm": 1.7488698968120222, "learning_rate": 7.805527579609575e-07, "loss": 0.0102, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 327899654.0, "step": 395 }, { "entropy": 0.4583587646484375, "epoch": 4.551724137931035, "grad_norm": 1.0455651427697132, "learning_rate": 7.690695573484433e-07, "loss": 0.0103, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 328801476.0, "step": 396 }, { "entropy": 0.481842041015625, "epoch": 4.563218390804598, "grad_norm": 0.7387887077265088, "learning_rate": 7.576560783617667e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 329628649.0, "step": 397 }, { "entropy": 0.4767913818359375, "epoch": 4.574712643678161, "grad_norm": 7.742789771491273, "learning_rate": 7.463127807341966e-07, "loss": 0.0095, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 330484216.0, "step": 398 }, { "entropy": 0.47850799560546875, "epoch": 4.586206896551724, "grad_norm": 8.42931358748137, "learning_rate": 7.35040121372109e-07, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 331339275.0, "step": 399 }, { "entropy": 0.4886627197265625, "epoch": 4.597701149425287, "grad_norm": 4.673599280994888, "learning_rate": 7.238385543365783e-07, "loss": 0.0076, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 332156313.0, "step": 400 }, { "entropy": 0.4760284423828125, "epoch": 4.609195402298851, "grad_norm": 2.668169707423765, "learning_rate": 7.127085308250914e-07, "loss": 0.0026, "mean_token_accuracy": 1.0, "num_tokens": 332983166.0, "step": 401 }, { "entropy": 0.49083709716796875, "epoch": 4.620689655172414, "grad_norm": 0.26151093544975906, "learning_rate": 7.016504991533727e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 333809756.0, "step": 402 }, { "entropy": 0.4631195068359375, "epoch": 4.6321839080459775, "grad_norm": 5.601919909459552, "learning_rate": 6.906649047373246e-07, "loss": 0.0081, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 334706440.0, "step": 403 }, { "entropy": 0.48708343505859375, "epoch": 4.64367816091954, "grad_norm": 1.1889732572236726, "learning_rate": 6.797521900750897e-07, "loss": 0.0198, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 335534859.0, "step": 404 }, { "entropy": 0.48416900634765625, "epoch": 4.655172413793103, "grad_norm": 5.14978044907013, "learning_rate": 6.689127947292232e-07, "loss": 0.0109, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 336338209.0, "step": 405 }, { "entropy": 0.485504150390625, "epoch": 4.666666666666667, "grad_norm": 0.2866668178929426, "learning_rate": 6.581471553089874e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 337177750.0, "step": 406 }, { "entropy": 0.48833465576171875, "epoch": 4.67816091954023, "grad_norm": 2.47127991193972, "learning_rate": 6.474557054527709e-07, "loss": 0.0036, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 337984644.0, "step": 407 }, { "entropy": 0.4751434326171875, "epoch": 4.689655172413794, "grad_norm": 0.8103649633355084, "learning_rate": 6.368388758106134e-07, "loss": 0.009, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 338842954.0, "step": 408 }, { "entropy": 0.478729248046875, "epoch": 4.7011494252873565, "grad_norm": 0.6095178762388435, "learning_rate": 6.262970940268653e-07, "loss": 0.0091, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 339691867.0, "step": 409 }, { "entropy": 0.490264892578125, "epoch": 4.712643678160919, "grad_norm": 0.29562950287042467, "learning_rate": 6.158307847229594e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 340497554.0, "step": 410 }, { "entropy": 0.496917724609375, "epoch": 4.724137931034483, "grad_norm": 1.389667795031155, "learning_rate": 6.05440369480308e-07, "loss": 0.0061, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 341293415.0, "step": 411 }, { "entropy": 0.48715972900390625, "epoch": 4.735632183908046, "grad_norm": 1.2423943514661475, "learning_rate": 5.951262668233232e-07, "loss": 0.0087, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 342132670.0, "step": 412 }, { "entropy": 0.49158477783203125, "epoch": 4.747126436781609, "grad_norm": 0.4338513235610179, "learning_rate": 5.848888922025553e-07, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 342954761.0, "step": 413 }, { "entropy": 0.4922332763671875, "epoch": 4.758620689655173, "grad_norm": 0.37787068708743277, "learning_rate": 5.747286579779607e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 343768247.0, "step": 414 }, { "entropy": 0.48549652099609375, "epoch": 4.7701149425287355, "grad_norm": 2.3481180551859806, "learning_rate": 5.646459734022938e-07, "loss": 0.0038, "mean_token_accuracy": 1.0, "num_tokens": 344601208.0, "step": 415 }, { "entropy": 0.49423980712890625, "epoch": 4.781609195402299, "grad_norm": 2.1462858882504197, "learning_rate": 5.546412446046187e-07, "loss": 0.0034, "mean_token_accuracy": 1.0, "num_tokens": 345394502.0, "step": 416 }, { "entropy": 0.4767913818359375, "epoch": 4.793103448275862, "grad_norm": 0.34717798081468426, "learning_rate": 5.447148745739522e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 346254704.0, "step": 417 }, { "entropy": 0.48309326171875, "epoch": 4.804597701149425, "grad_norm": 2.574711249295488, "learning_rate": 5.348672631430319e-07, "loss": 0.0107, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 347076198.0, "step": 418 }, { "entropy": 0.4826202392578125, "epoch": 4.816091954022989, "grad_norm": 0.29624777403809666, "learning_rate": 5.250988069722096e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 347895786.0, "step": 419 }, { "entropy": 0.48499298095703125, "epoch": 4.827586206896552, "grad_norm": 3.836909556315625, "learning_rate": 5.154098995334769e-07, "loss": 0.0127, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 348706168.0, "step": 420 }, { "entropy": 0.49582672119140625, "epoch": 4.8390804597701145, "grad_norm": 3.2563250713457106, "learning_rate": 5.058009310946119e-07, "loss": 0.0085, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 349502269.0, "step": 421 }, { "entropy": 0.476806640625, "epoch": 4.850574712643678, "grad_norm": 0.304724756838409, "learning_rate": 4.962722887034616e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 350325332.0, "step": 422 }, { "entropy": 0.47916412353515625, "epoch": 4.862068965517241, "grad_norm": 5.162633488189246, "learning_rate": 4.868243561723535e-07, "loss": 0.013, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 351159607.0, "step": 423 }, { "entropy": 0.47540283203125, "epoch": 4.873563218390805, "grad_norm": 0.5206824035136977, "learning_rate": 4.774575140626317e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 352006157.0, "step": 424 }, { "entropy": 0.4759368896484375, "epoch": 4.885057471264368, "grad_norm": 0.3162820177895493, "learning_rate": 4.681721396693303e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 352835790.0, "step": 425 }, { "entropy": 0.48349761962890625, "epoch": 4.896551724137931, "grad_norm": 3.5611052036772124, "learning_rate": 4.589686070059762e-07, "loss": 0.0027, "mean_token_accuracy": 1.0, "num_tokens": 353654373.0, "step": 426 }, { "entropy": 0.4834136962890625, "epoch": 4.908045977011494, "grad_norm": 0.6544510464811919, "learning_rate": 4.4984728678952234e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 354477439.0, "step": 427 }, { "entropy": 0.477691650390625, "epoch": 4.919540229885057, "grad_norm": 0.3747410441822833, "learning_rate": 4.4080854642541833e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 355302452.0, "step": 428 }, { "entropy": 0.4850006103515625, "epoch": 4.931034482758621, "grad_norm": 0.5252478732331773, "learning_rate": 4.318527499928074e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 356125422.0, "step": 429 }, { "entropy": 0.47662353515625, "epoch": 4.942528735632184, "grad_norm": 0.29643930127778295, "learning_rate": 4.229802582298634e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 356978310.0, "step": 430 }, { "entropy": 0.5031585693359375, "epoch": 4.954022988505747, "grad_norm": 0.6671572700465241, "learning_rate": 4.141914285192619e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 357743388.0, "step": 431 }, { "entropy": 0.46810150146484375, "epoch": 4.9655172413793105, "grad_norm": 0.24179886566109368, "learning_rate": 4.0548661487378184e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 358578324.0, "step": 432 }, { "entropy": 0.4846649169921875, "epoch": 4.977011494252873, "grad_norm": 0.9015919155281225, "learning_rate": 3.9686616792204677e-07, "loss": 0.0048, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 359402028.0, "step": 433 }, { "entropy": 0.47179412841796875, "epoch": 4.988505747126437, "grad_norm": 1.9609784789544666, "learning_rate": 3.8833043489440477e-07, "loss": 0.0057, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 360259014.0, "step": 434 }, { "entropy": 0.46057891845703125, "epoch": 5.0, "grad_norm": 3.135548029845806, "learning_rate": 3.798797596089351e-07, "loss": 0.0044, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 361140653.0, "step": 435 }, { "entropy": 0.4911651611328125, "epoch": 5.011494252873563, "grad_norm": 4.784507319668607, "learning_rate": 3.715144824576078e-07, "loss": 0.0031, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 361910005.0, "step": 436 }, { "entropy": 0.45987701416015625, "epoch": 5.022988505747127, "grad_norm": 3.53439712252648, "learning_rate": 3.632349403925664e-07, "loss": 0.0084, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 362789389.0, "step": 437 }, { "entropy": 0.48267364501953125, "epoch": 5.0344827586206895, "grad_norm": 0.23679774220430147, "learning_rate": 3.5504146691255736e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 363594617.0, "step": 438 }, { "entropy": 0.47402191162109375, "epoch": 5.045977011494253, "grad_norm": 3.7140900011253017, "learning_rate": 3.469343920494986e-07, "loss": 0.0038, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 364431317.0, "step": 439 }, { "entropy": 0.48137664794921875, "epoch": 5.057471264367816, "grad_norm": 0.24349208704506586, "learning_rate": 3.389140423551834e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 365260126.0, "step": 440 }, { "entropy": 0.47275543212890625, "epoch": 5.068965517241379, "grad_norm": 2.9727752314913225, "learning_rate": 3.3098074088812686e-07, "loss": 0.0026, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 366112155.0, "step": 441 }, { "entropy": 0.47698974609375, "epoch": 5.080459770114943, "grad_norm": 3.2479910943373254, "learning_rate": 3.2313480720055747e-07, "loss": 0.0036, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 366950263.0, "step": 442 }, { "entropy": 0.4771575927734375, "epoch": 5.091954022988506, "grad_norm": 1.2691063578241986, "learning_rate": 3.153765573255377e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 367776644.0, "step": 443 }, { "entropy": 0.483184814453125, "epoch": 5.103448275862069, "grad_norm": 2.8124807980483384, "learning_rate": 3.0770630376424276e-07, "loss": 0.0043, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 368596908.0, "step": 444 }, { "entropy": 0.4910125732421875, "epoch": 5.114942528735632, "grad_norm": 3.891577262381854, "learning_rate": 3.0012435547336737e-07, "loss": 0.0026, "mean_token_accuracy": 1.0, "num_tokens": 369390244.0, "step": 445 }, { "entropy": 0.475433349609375, "epoch": 5.126436781609195, "grad_norm": 0.9801102432206495, "learning_rate": 2.9263101785268253e-07, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 370237002.0, "step": 446 }, { "entropy": 0.4729156494140625, "epoch": 5.137931034482759, "grad_norm": 0.46368160142667864, "learning_rate": 2.8522659273273606e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 371089064.0, "step": 447 }, { "entropy": 0.4679107666015625, "epoch": 5.149425287356322, "grad_norm": 0.4466182713447019, "learning_rate": 2.779113783626916e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 371950407.0, "step": 448 }, { "entropy": 0.4793701171875, "epoch": 5.160919540229885, "grad_norm": 0.24997338132137475, "learning_rate": 2.7068566939831646e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 372769217.0, "step": 449 }, { "entropy": 0.468536376953125, "epoch": 5.172413793103448, "grad_norm": 5.078234022478168, "learning_rate": 2.6354975689011576e-07, "loss": 0.0043, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 373597313.0, "step": 450 }, { "entropy": 0.4726409912109375, "epoch": 5.183908045977011, "grad_norm": 0.23098046397612482, "learning_rate": 2.5650392827160446e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 374442696.0, "step": 451 }, { "entropy": 0.4760284423828125, "epoch": 5.195402298850575, "grad_norm": 0.2444782213662235, "learning_rate": 2.4954846734773054e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 375274472.0, "step": 452 }, { "entropy": 0.4780120849609375, "epoch": 5.206896551724138, "grad_norm": 0.23065508145988362, "learning_rate": 2.4268365428344737e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 376082294.0, "step": 453 }, { "entropy": 0.4742889404296875, "epoch": 5.218390804597701, "grad_norm": 0.22841771210591458, "learning_rate": 2.3590976559242278e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 376913132.0, "step": 454 }, { "entropy": 0.484130859375, "epoch": 5.2298850574712645, "grad_norm": 0.2312717975757419, "learning_rate": 2.29227074125907e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 377719614.0, "step": 455 }, { "entropy": 0.4854583740234375, "epoch": 5.241379310344827, "grad_norm": 2.7996410031164864, "learning_rate": 2.2263584906173723e-07, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 378523136.0, "step": 456 }, { "entropy": 0.47391510009765625, "epoch": 5.252873563218391, "grad_norm": 0.22824235020101177, "learning_rate": 2.1613635589349756e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 379375322.0, "step": 457 }, { "entropy": 0.4659881591796875, "epoch": 5.264367816091954, "grad_norm": 3.476542826635523, "learning_rate": 2.0972885641982605e-07, "loss": 0.0086, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 380214107.0, "step": 458 }, { "entropy": 0.4750518798828125, "epoch": 5.275862068965517, "grad_norm": 4.380500664778174, "learning_rate": 2.0341360873386673e-07, "loss": 0.005, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 381038853.0, "step": 459 }, { "entropy": 0.46482086181640625, "epoch": 5.287356321839081, "grad_norm": 2.015274773802452, "learning_rate": 1.97190867212875e-07, "loss": 0.0083, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 381914412.0, "step": 460 }, { "entropy": 0.47858428955078125, "epoch": 5.2988505747126435, "grad_norm": 0.22456749005464405, "learning_rate": 1.9106088250797266e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 382734720.0, "step": 461 }, { "entropy": 0.47489166259765625, "epoch": 5.310344827586207, "grad_norm": 0.22731724902580622, "learning_rate": 1.8502390153404936e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 383560929.0, "step": 462 }, { "entropy": 0.47196197509765625, "epoch": 5.32183908045977, "grad_norm": 4.740213779864106, "learning_rate": 1.790801674598186e-07, "loss": 0.0075, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 384389530.0, "step": 463 }, { "entropy": 0.47057342529296875, "epoch": 5.333333333333333, "grad_norm": 0.2258431066367018, "learning_rate": 1.732299196980225e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 385233840.0, "step": 464 }, { "entropy": 0.47980499267578125, "epoch": 5.344827586206897, "grad_norm": 1.4221626617705267, "learning_rate": 1.6747339389578732e-07, "loss": 0.0044, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 386042787.0, "step": 465 }, { "entropy": 0.47692108154296875, "epoch": 5.35632183908046, "grad_norm": 4.521214139616767, "learning_rate": 1.6181082192513352e-07, "loss": 0.0038, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 386861845.0, "step": 466 }, { "entropy": 0.482086181640625, "epoch": 5.3678160919540225, "grad_norm": 3.3231820358196895, "learning_rate": 1.5624243187363442e-07, "loss": 0.0038, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 387661163.0, "step": 467 }, { "entropy": 0.48667144775390625, "epoch": 5.379310344827586, "grad_norm": 0.2304870362722343, "learning_rate": 1.507684480352292e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 388485432.0, "step": 468 }, { "entropy": 0.4650726318359375, "epoch": 5.390804597701149, "grad_norm": 0.23256845046208066, "learning_rate": 1.4538909090118846e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 389334143.0, "step": 469 }, { "entropy": 0.48242950439453125, "epoch": 5.402298850574713, "grad_norm": 0.2585670975063039, "learning_rate": 1.4010457715123355e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 390131143.0, "step": 470 }, { "entropy": 0.492431640625, "epoch": 5.413793103448276, "grad_norm": 0.2528521501053522, "learning_rate": 1.3491511964480703e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 390931756.0, "step": 471 }, { "entropy": 0.4763031005859375, "epoch": 5.425287356321839, "grad_norm": 1.4739001845694437, "learning_rate": 1.2982092741250145e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 391752298.0, "step": 472 }, { "entropy": 0.4845428466796875, "epoch": 5.436781609195402, "grad_norm": 0.26445355496340167, "learning_rate": 1.2482220564763669e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 392545229.0, "step": 473 }, { "entropy": 0.4782867431640625, "epoch": 5.448275862068965, "grad_norm": 0.9542333022796412, "learning_rate": 1.1991915569799645e-07, "loss": 0.0097, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 393380338.0, "step": 474 }, { "entropy": 0.476104736328125, "epoch": 5.459770114942529, "grad_norm": 0.26357178354088867, "learning_rate": 1.1511197505771843e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 394226931.0, "step": 475 }, { "entropy": 0.48026275634765625, "epoch": 5.471264367816092, "grad_norm": 0.4080153444923309, "learning_rate": 1.1040085735933681e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 395032835.0, "step": 476 }, { "entropy": 0.46416473388671875, "epoch": 5.482758620689655, "grad_norm": 0.6081932699825013, "learning_rate": 1.0578599236598708e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 395872556.0, "step": 477 }, { "entropy": 0.47927093505859375, "epoch": 5.494252873563219, "grad_norm": 0.2756351529387358, "learning_rate": 1.0126756596375687e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 396695404.0, "step": 478 }, { "entropy": 0.46880340576171875, "epoch": 5.505747126436781, "grad_norm": 1.7900528624209657, "learning_rate": 9.684576015420277e-08, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 397548893.0, "step": 479 }, { "entropy": 0.47908782958984375, "epoch": 5.517241379310345, "grad_norm": 0.3007803080522495, "learning_rate": 9.252075304701929e-08, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 398357809.0, "step": 480 }, { "entropy": 0.472137451171875, "epoch": 5.528735632183908, "grad_norm": 0.2546056245953954, "learning_rate": 8.829271885286095e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 399193393.0, "step": 481 }, { "entropy": 0.4811248779296875, "epoch": 5.540229885057471, "grad_norm": 0.24009342643072343, "learning_rate": 8.416182787632871e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 399990785.0, "step": 482 }, { "entropy": 0.4644927978515625, "epoch": 5.551724137931035, "grad_norm": 0.23317450891721894, "learning_rate": 8.012824650910938e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 400845157.0, "step": 483 }, { "entropy": 0.47351837158203125, "epoch": 5.563218390804598, "grad_norm": 2.1147010615882196, "learning_rate": 7.619213722327184e-08, "loss": 0.0089, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 401679005.0, "step": 484 }, { "entropy": 0.468231201171875, "epoch": 5.574712643678161, "grad_norm": 0.24668197764885869, "learning_rate": 7.235365856472443e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 402520327.0, "step": 485 }, { "entropy": 0.4737548828125, "epoch": 5.586206896551724, "grad_norm": 0.24623472017693857, "learning_rate": 6.86129651468273e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 403342914.0, "step": 486 }, { "entropy": 0.46429443359375, "epoch": 5.597701149425287, "grad_norm": 1.0601206333936144, "learning_rate": 6.497020764416633e-08, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 404200783.0, "step": 487 }, { "entropy": 0.46352386474609375, "epoch": 5.609195402298851, "grad_norm": 0.22699217170268132, "learning_rate": 6.142553278648239e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 405062916.0, "step": 488 }, { "entropy": 0.483367919921875, "epoch": 5.620689655172414, "grad_norm": 0.23077255806225605, "learning_rate": 5.7979083352762146e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 405902942.0, "step": 489 }, { "entropy": 0.49231719970703125, "epoch": 5.6321839080459775, "grad_norm": 0.23144381904668673, "learning_rate": 5.463099816548578e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 406710949.0, "step": 490 }, { "entropy": 0.46439361572265625, "epoch": 5.64367816091954, "grad_norm": 2.3498455147538175, "learning_rate": 5.1381412085036994e-08, "loss": 0.0082, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 407554838.0, "step": 491 }, { "entropy": 0.48107147216796875, "epoch": 5.655172413793103, "grad_norm": 0.22765796262677274, "learning_rate": 4.823045600426901e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 408367291.0, "step": 492 }, { "entropy": 0.4683685302734375, "epoch": 5.666666666666667, "grad_norm": 0.23126276882370067, "learning_rate": 4.5178256843233235e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 409212322.0, "step": 493 }, { "entropy": 0.470672607421875, "epoch": 5.67816091954023, "grad_norm": 0.22570374435713633, "learning_rate": 4.2224937544067254e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 410059887.0, "step": 494 }, { "entropy": 0.47655487060546875, "epoch": 5.689655172413794, "grad_norm": 0.2395343469118418, "learning_rate": 3.9370617066040726e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 410882734.0, "step": 495 }, { "entropy": 0.4618377685546875, "epoch": 5.7011494252873565, "grad_norm": 0.22662430438163453, "learning_rate": 3.661541038076755e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 411766798.0, "step": 496 }, { "entropy": 0.4715728759765625, "epoch": 5.712643678160919, "grad_norm": 0.22608240200051918, "learning_rate": 3.395942846757067e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 412620860.0, "step": 497 }, { "entropy": 0.4783782958984375, "epoch": 5.724137931034483, "grad_norm": 0.23276920955328978, "learning_rate": 3.1402778309014284e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 413434227.0, "step": 498 }, { "entropy": 0.47417449951171875, "epoch": 5.735632183908046, "grad_norm": 0.22647336925027828, "learning_rate": 2.8945562886593948e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 414252581.0, "step": 499 }, { "entropy": 0.47452545166015625, "epoch": 5.747126436781609, "grad_norm": 0.22490185870003843, "learning_rate": 2.6587881176588782e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 415068424.0, "step": 500 }, { "entropy": 0.48020172119140625, "epoch": 5.758620689655173, "grad_norm": 0.23110370834284083, "learning_rate": 2.4329828146074096e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 415902990.0, "step": 501 }, { "entropy": 0.47350311279296875, "epoch": 5.7701149425287355, "grad_norm": 2.839859264372696, "learning_rate": 2.2171494749097243e-08, "loss": 0.0074, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 416757511.0, "step": 502 }, { "entropy": 0.46640777587890625, "epoch": 5.781609195402299, "grad_norm": 1.6462241133899467, "learning_rate": 2.011296792301165e-08, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 417604658.0, "step": 503 }, { "entropy": 0.46356964111328125, "epoch": 5.793103448275862, "grad_norm": 0.23650139037887488, "learning_rate": 1.8154330584978785e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 418461199.0, "step": 504 }, { "entropy": 0.45654296875, "epoch": 5.804597701149425, "grad_norm": 1.0619531610176047, "learning_rate": 1.629566162862445e-08, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 419328853.0, "step": 505 }, { "entropy": 0.473480224609375, "epoch": 5.816091954022989, "grad_norm": 0.22868022966207807, "learning_rate": 1.453703592086353e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 420144761.0, "step": 506 }, { "entropy": 0.46999359130859375, "epoch": 5.827586206896552, "grad_norm": 2.547490956818682, "learning_rate": 1.28785242988827e-08, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 420982312.0, "step": 507 }, { "entropy": 0.46546173095703125, "epoch": 5.8390804597701145, "grad_norm": 6.379105642099488, "learning_rate": 1.132019356728853e-08, "loss": 0.0044, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 421827384.0, "step": 508 }, { "entropy": 0.47126007080078125, "epoch": 5.850574712643678, "grad_norm": 1.8118443829201367, "learning_rate": 9.862106495415469e-09, "loss": 0.005, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 422668841.0, "step": 509 }, { "entropy": 0.46734619140625, "epoch": 5.862068965517241, "grad_norm": 0.22332659835801152, "learning_rate": 8.504321814798433e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 423516715.0, "step": 510 }, { "entropy": 0.4660186767578125, "epoch": 5.873563218390805, "grad_norm": 0.2260995469649812, "learning_rate": 7.246894216806355e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 424340682.0, "step": 511 }, { "entropy": 0.4914398193359375, "epoch": 5.885057471264368, "grad_norm": 0.22784964173297753, "learning_rate": 6.089874350439507e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 425132197.0, "step": 512 }, { "entropy": 0.4688720703125, "epoch": 5.896551724137931, "grad_norm": 0.22366761094391158, "learning_rate": 5.033308820289185e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 425969675.0, "step": 513 }, { "entropy": 0.475860595703125, "epoch": 5.908045977011494, "grad_norm": 0.22732344127849993, "learning_rate": 4.07724018466088e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 426790240.0, "step": 514 }, { "entropy": 0.465087890625, "epoch": 5.919540229885057, "grad_norm": 0.26543552224647043, "learning_rate": 3.2217069538600932e-09, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 427651071.0, "step": 515 }, { "entropy": 0.47153472900390625, "epoch": 5.931034482758621, "grad_norm": 0.2295916592336764, "learning_rate": 2.4667435886402414e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 428470168.0, "step": 516 }, { "entropy": 0.46768951416015625, "epoch": 5.942528735632184, "grad_norm": 0.2277843849699895, "learning_rate": 1.8123804988159909e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 429292956.0, "step": 517 }, { "entropy": 0.48101043701171875, "epoch": 5.954022988505747, "grad_norm": 0.22645001219989788, "learning_rate": 1.2586440420372936e-09, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 430114842.0, "step": 518 }, { "entropy": 0.475311279296875, "epoch": 5.9655172413793105, "grad_norm": 0.22916157689876235, "learning_rate": 8.0555652272718e-10, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 430955923.0, "step": 519 }, { "entropy": 0.4584197998046875, "epoch": 5.977011494252873, "grad_norm": 0.22928233995534253, "learning_rate": 4.5313619118553256e-10, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 431823368.0, "step": 520 }, { "entropy": 0.47629547119140625, "epoch": 5.988505747126437, "grad_norm": 0.23437557579169707, "learning_rate": 2.0139724285161976e-10, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 432631185.0, "step": 521 }, { "entropy": 0.47571563720703125, "epoch": 6.0, "grad_norm": 0.41607559776285885, "learning_rate": 5.0349817733719165e-11, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 433455617.0, "step": 522 }, { "epoch": 6.0, "step": 522, "total_flos": 509990855180288.0, "train_loss": 0.558762426631948, "train_runtime": 70510.0264, "train_samples_per_second": 3.489, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 522, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 44, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 509990855180288.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }