[ { "loss": 5.295790405273437, "grad_norm": 17.75, "learning_rate": 1.6937553464499572e-06, "entropy": 1.3026689371466638, "num_tokens": 788631.0, "mean_token_accuracy": 0.45803309440612794, "epoch": 0.02567723712928489, "step": 100 }, { "loss": 1.9093310546875, "grad_norm": 5.78125, "learning_rate": 3.4046193327630456e-06, "entropy": 1.1995259793102742, "num_tokens": 1575343.0, "mean_token_accuracy": 0.7073830207437277, "epoch": 0.05135447425856978, "step": 200 }, { "loss": 0.7623916625976562, "grad_norm": 0.93359375, "learning_rate": 5.1154833190761344e-06, "entropy": 0.7093755914270878, "num_tokens": 2362291.0, "mean_token_accuracy": 0.8655054874718189, "epoch": 0.07703171138785467, "step": 300 }, { "loss": 0.6790202331542968, "grad_norm": 0.75, "learning_rate": 6.826347305389223e-06, "entropy": 0.6651146249473094, "num_tokens": 3150439.0, "mean_token_accuracy": 0.8720192441344261, "epoch": 0.10270894851713956, "step": 400 }, { "loss": 0.661846923828125, "grad_norm": 0.82421875, "learning_rate": 8.537211291702311e-06, "entropy": 0.6505232656002045, "num_tokens": 3939189.0, "mean_token_accuracy": 0.8728892022371292, "epoch": 0.12838618564642446, "step": 500 }, { "eval_loss": 0.6569298505783081, "eval_runtime": 66.9323, "eval_samples_per_second": 201.905, "eval_steps_per_second": 25.249, "eval_entropy": 0.6593257126723521, "eval_num_tokens": 3939189.0, "eval_mean_token_accuracy": 0.8742751985964691, "epoch": 0.12838618564642446, "step": 500 }, { "loss": 0.6417784881591797, "grad_norm": 0.65625, "learning_rate": 1.02480752780154e-05, "entropy": 0.6309234929084778, "num_tokens": 4726987.0, "mean_token_accuracy": 0.8763968905806542, "epoch": 0.15406342277570934, "step": 600 }, { "loss": 0.6324351119995117, "grad_norm": 0.58984375, "learning_rate": 1.1958939264328486e-05, "entropy": 0.6225168199837208, "num_tokens": 5514520.0, "mean_token_accuracy": 0.8769305641949177, "epoch": 0.17974065990499422, "step": 700 }, { "loss": 0.6248664474487304, "grad_norm": 0.5703125, "learning_rate": 1.3669803250641576e-05, "entropy": 0.6152851846814156, "num_tokens": 6303997.0, "mean_token_accuracy": 0.8783293107151985, "epoch": 0.20541789703427912, "step": 800 }, { "loss": 0.6181363677978515, "grad_norm": 0.4453125, "learning_rate": 1.538066723695466e-05, "entropy": 0.607969797924161, "num_tokens": 7092051.0, "mean_token_accuracy": 0.8795923219621181, "epoch": 0.231095134163564, "step": 900 }, { "loss": 0.6079271697998047, "grad_norm": 0.63671875, "learning_rate": 1.7091531223267753e-05, "entropy": 0.5982871637493372, "num_tokens": 7879814.0, "mean_token_accuracy": 0.8805274599790573, "epoch": 0.2567723712928489, "step": 1000 }, { "eval_loss": 0.6138819456100464, "eval_runtime": 66.9074, "eval_samples_per_second": 201.981, "eval_steps_per_second": 25.259, "eval_entropy": 0.5835022156584192, "eval_num_tokens": 7879814.0, "eval_mean_token_accuracy": 0.8795767288941604, "epoch": 0.2567723712928489, "step": 1000 }, { "loss": 0.6031342315673828, "grad_norm": 0.4765625, "learning_rate": 1.8802395209580838e-05, "entropy": 0.5937141847610473, "num_tokens": 8669131.0, "mean_token_accuracy": 0.8811740911006928, "epoch": 0.2824496084221338, "step": 1100 }, { "loss": 0.6006821060180664, "grad_norm": 0.416015625, "learning_rate": 1.999959838659769e-05, "entropy": 0.5919705433398486, "num_tokens": 9457145.0, "mean_token_accuracy": 0.8807326038181782, "epoch": 0.30812684555141867, "step": 1200 }, { "loss": 0.5951248550415039, "grad_norm": 0.408203125, "learning_rate": 1.9992459490144817e-05, "entropy": 0.5857468252629041, "num_tokens": 10246533.0, "mean_token_accuracy": 0.8820273293554783, "epoch": 0.33380408268070355, "step": 1300 }, { "loss": 0.5904627990722656, "grad_norm": 0.380859375, "learning_rate": 1.9976403184682326e-05, "entropy": 0.5813354634493589, "num_tokens": 11033964.0, "mean_token_accuracy": 0.8821511951088905, "epoch": 0.35948131980998843, "step": 1400 }, { "loss": 0.5805374526977539, "grad_norm": 0.48828125, "learning_rate": 1.9951443799079215e-05, "entropy": 0.5730234136432409, "num_tokens": 11822337.0, "mean_token_accuracy": 0.8843117669224739, "epoch": 0.3851585569392733, "step": 1500 }, { "eval_loss": 0.587958812713623, "eval_runtime": 66.7304, "eval_samples_per_second": 202.516, "eval_steps_per_second": 25.326, "eval_entropy": 0.5761568091501146, "eval_num_tokens": 11822337.0, "eval_mean_token_accuracy": 0.8826649507827308, "epoch": 0.3851585569392733, "step": 1500 }, { "loss": 0.5845616149902344, "grad_norm": 0.38671875, "learning_rate": 1.99176036074363e-05, "entropy": 0.5758610642701387, "num_tokens": 12609317.0, "mean_token_accuracy": 0.8831216642260551, "epoch": 0.41083579406855825, "step": 1600 }, { "loss": 0.579046745300293, "grad_norm": 0.40234375, "learning_rate": 1.9874912809208492e-05, "entropy": 0.570131861642003, "num_tokens": 13396496.0, "mean_token_accuracy": 0.8837999847531318, "epoch": 0.4365130311978431, "step": 1700 }, { "loss": 0.5719428634643555, "grad_norm": 0.388671875, "learning_rate": 1.9823409502254395e-05, "entropy": 0.5643735866248608, "num_tokens": 14184641.0, "mean_token_accuracy": 0.8844822055101395, "epoch": 0.462190268327128, "step": 1800 }, { "loss": 0.566561393737793, "grad_norm": 0.38671875, "learning_rate": 1.976313964883724e-05, "entropy": 0.5589940486103296, "num_tokens": 14972736.0, "mean_token_accuracy": 0.8854228469729424, "epoch": 0.4878675054564129, "step": 1900 }, { "loss": 0.5729906463623047, "grad_norm": 0.400390625, "learning_rate": 1.969415703460754e-05, "entropy": 0.5654201730340719, "num_tokens": 15760765.0, "mean_token_accuracy": 0.884299693107605, "epoch": 0.5135447425856978, "step": 2000 }, { "eval_loss": 0.574901282787323, "eval_runtime": 66.9041, "eval_samples_per_second": 201.991, "eval_steps_per_second": 25.26, "eval_entropy": 0.5581040822895321, "eval_num_tokens": 15760765.0, "eval_mean_token_accuracy": 0.8844290988685112, "epoch": 0.5135447425856978, "step": 2000 }, { "loss": 0.5724992752075195, "grad_norm": 0.470703125, "learning_rate": 1.9616523220604026e-05, "entropy": 0.5640381355583668, "num_tokens": 16548995.0, "mean_token_accuracy": 0.8841193398833275, "epoch": 0.5392219797149826, "step": 2100 }, { "loss": 0.565244369506836, "grad_norm": 0.404296875, "learning_rate": 1.9530307488315705e-05, "entropy": 0.5574718941748142, "num_tokens": 17338195.0, "mean_token_accuracy": 0.8858554971218109, "epoch": 0.5648992168442676, "step": 2200 }, { "loss": 0.5641956329345703, "grad_norm": 0.375, "learning_rate": 1.943558677785414e-05, "entropy": 0.556980236619711, "num_tokens": 18126378.0, "mean_token_accuracy": 0.8855182483792305, "epoch": 0.5905764539735524, "step": 2300 }, { "loss": 0.5643896865844726, "grad_norm": 0.447265625, "learning_rate": 1.9332445619291003e-05, "entropy": 0.5556083285063506, "num_tokens": 18915415.0, "mean_token_accuracy": 0.8858178888261318, "epoch": 0.6162536911028373, "step": 2400 }, { "loss": 0.5537351608276367, "grad_norm": 0.361328125, "learning_rate": 1.9220976057222272e-05, "entropy": 0.5470526535063982, "num_tokens": 19703215.0, "mean_token_accuracy": 0.8877282282710075, "epoch": 0.6419309282321223, "step": 2500 }, { "eval_loss": 0.566738486289978, "eval_runtime": 66.8788, "eval_samples_per_second": 202.067, "eval_steps_per_second": 25.27, "eval_entropy": 0.5507401840750282, "eval_num_tokens": 19703215.0, "eval_mean_token_accuracy": 0.8854550624740194, "epoch": 0.6419309282321223, "step": 2500 }, { "loss": 0.5606909561157226, "grad_norm": 0.4296875, "learning_rate": 1.9101277568626374e-05, "entropy": 0.5524309245496988, "num_tokens": 20491378.0, "mean_token_accuracy": 0.886026524156332, "epoch": 0.6676081653614071, "step": 2600 }, { "loss": 0.5588908386230469, "grad_norm": 0.41015625, "learning_rate": 1.8973456974089533e-05, "entropy": 0.5509618154168129, "num_tokens": 21281188.0, "mean_token_accuracy": 0.8863780727982521, "epoch": 0.693285402490692, "step": 2700 }, { "loss": 0.5587222290039062, "grad_norm": 0.439453125, "learning_rate": 1.883762834247763e-05, "entropy": 0.5505382239073515, "num_tokens": 22068585.0, "mean_token_accuracy": 0.8866781835258007, "epoch": 0.7189626396199769, "step": 2800 }, { "loss": 0.5535868453979492, "grad_norm": 0.359375, "learning_rate": 1.8693912889139548e-05, "entropy": 0.5467930260300636, "num_tokens": 22857792.0, "mean_token_accuracy": 0.8876498517394066, "epoch": 0.7446398767492618, "step": 2900 }, { "loss": 0.5560418701171875, "grad_norm": 0.38671875, "learning_rate": 1.8542438867732926e-05, "entropy": 0.5481385685503483, "num_tokens": 23646674.0, "mean_token_accuracy": 0.8865408559143543, "epoch": 0.7703171138785466, "step": 3000 }, { "eval_loss": 0.5608235001564026, "eval_runtime": 66.9694, "eval_samples_per_second": 201.794, "eval_steps_per_second": 25.235, "eval_entropy": 0.5475869985727163, "eval_num_tokens": 23646674.0, "eval_mean_token_accuracy": 0.8862869346988271, "epoch": 0.7703171138785466, "step": 3000 }, { "loss": 0.5510826873779296, "grad_norm": 0.392578125, "learning_rate": 1.8383341455768818e-05, "entropy": 0.5442652675509453, "num_tokens": 24435112.0, "mean_token_accuracy": 0.8880468539893627, "epoch": 0.7959943510078316, "step": 3100 }, { "loss": 0.5556667709350586, "grad_norm": 0.46484375, "learning_rate": 1.821676263397742e-05, "entropy": 0.5477713013440371, "num_tokens": 25222534.0, "mean_token_accuracy": 0.8866990077495575, "epoch": 0.8216715881371165, "step": 3200 }, { "loss": 0.5512593460083007, "grad_norm": 0.36328125, "learning_rate": 1.80428510596025e-05, "entropy": 0.5437313695996999, "num_tokens": 26010061.0, "mean_token_accuracy": 0.8885632981359959, "epoch": 0.8473488252664013, "step": 3300 }, { "loss": 0.5498195266723633, "grad_norm": 0.39453125, "learning_rate": 1.7861761933737617e-05, "entropy": 0.5414738351106644, "num_tokens": 26798369.0, "mean_token_accuracy": 0.8879594984650612, "epoch": 0.8730260623956863, "step": 3400 }, { "loss": 0.5532369232177734, "grad_norm": 0.38671875, "learning_rate": 1.7673656862822515e-05, "entropy": 0.5464992509037256, "num_tokens": 27588005.0, "mean_token_accuracy": 0.8868030488491059, "epoch": 0.8987032995249711, "step": 3500 }, { "eval_loss": 0.5567756295204163, "eval_runtime": 66.7863, "eval_samples_per_second": 202.347, "eval_steps_per_second": 25.305, "eval_entropy": 0.5490564776595528, "eval_num_tokens": 27588005.0, "eval_mean_token_accuracy": 0.8867901984051134, "epoch": 0.8987032995249711, "step": 3500 }, { "loss": 0.5537870025634766, "grad_norm": 0.38671875, "learning_rate": 1.7478703714423316e-05, "entropy": 0.54683793194592, "num_tokens": 28377750.0, "mean_token_accuracy": 0.8869239047169686, "epoch": 0.924380536654256, "step": 3600 }, { "loss": 0.5514320755004882, "grad_norm": 0.435546875, "learning_rate": 1.7277076467425163e-05, "entropy": 0.5437256157398224, "num_tokens": 29166016.0, "mean_token_accuracy": 0.8873179040849208, "epoch": 0.9500577737835408, "step": 3700 }, { "loss": 0.5542150115966797, "grad_norm": 0.37109375, "learning_rate": 1.706895505677108e-05, "entropy": 0.5465144612640143, "num_tokens": 29953000.0, "mean_token_accuracy": 0.887007016390562, "epoch": 0.9757350109128258, "step": 3800 }, { "loss": 0.5507764053344727, "grad_norm": 0.443359375, "learning_rate": 1.6854525212885517e-05, "entropy": 0.5444752973827285, "num_tokens": 30735372.0, "mean_token_accuracy": 0.887075167355226, "epoch": 1.0012838618564643, "step": 3900 }, { "loss": 0.5405293273925781, "grad_norm": 0.404296875, "learning_rate": 1.6633978295925973e-05, "entropy": 0.5336830996721983, "num_tokens": 31523776.0, "mean_token_accuracy": 0.8888550646603107, "epoch": 1.0269610989857492, "step": 4000 }, { "eval_loss": 0.5542467832565308, "eval_runtime": 66.8867, "eval_samples_per_second": 202.043, "eval_steps_per_second": 25.267, "eval_entropy": 0.5430249788175673, "eval_num_tokens": 31523776.0, "eval_mean_token_accuracy": 0.8871892735450225, "epoch": 1.0269610989857492, "step": 4000 }, { "loss": 0.5439878845214844, "grad_norm": 0.408203125, "learning_rate": 1.6407511125010535e-05, "entropy": 0.5366661065071822, "num_tokens": 32312483.0, "mean_token_accuracy": 0.8886200107634068, "epoch": 1.052638336115034, "step": 4100 }, { "loss": 0.538640251159668, "grad_norm": 0.376953125, "learning_rate": 1.6175325802573762e-05, "entropy": 0.5315170773863792, "num_tokens": 33100106.0, "mean_token_accuracy": 0.8894328561425209, "epoch": 1.0783155732443188, "step": 4200 }, { "loss": 0.5469233703613281, "grad_norm": 0.392578125, "learning_rate": 1.593762953400771e-05, "entropy": 0.5396519158780575, "num_tokens": 33887447.0, "mean_token_accuracy": 0.8873359954357147, "epoch": 1.1039928103736039, "step": 4300 }, { "loss": 0.5387564086914063, "grad_norm": 0.375, "learning_rate": 1.569463444274896e-05, "entropy": 0.5312051647901535, "num_tokens": 34676497.0, "mean_token_accuracy": 0.8895892894268036, "epoch": 1.1296700475028887, "step": 4400 }, { "loss": 0.5374182510375977, "grad_norm": 0.359375, "learning_rate": 1.5446557380976705e-05, "entropy": 0.5307135570794344, "num_tokens": 35464531.0, "mean_token_accuracy": 0.8893489798903466, "epoch": 1.1553472846321735, "step": 4500 }, { "eval_loss": 0.5518077611923218, "eval_runtime": 66.9817, "eval_samples_per_second": 201.756, "eval_steps_per_second": 25.231, "eval_entropy": 0.538807505527897, "eval_num_tokens": 35464531.0, "eval_mean_token_accuracy": 0.8874176931804454, "epoch": 1.1553472846321735, "step": 4500 }, { "loss": 0.5373505783081055, "grad_norm": 0.34375, "learning_rate": 1.5193619736090915e-05, "entropy": 0.5305563137680293, "num_tokens": 36252194.0, "mean_token_accuracy": 0.8893280589580536, "epoch": 1.1810245217614586, "step": 4600 }, { "loss": 0.5385945892333984, "grad_norm": 0.3984375, "learning_rate": 1.4936047233143121e-05, "entropy": 0.5316550326347351, "num_tokens": 37040269.0, "mean_token_accuracy": 0.8889729425311088, "epoch": 1.2067017588907434, "step": 4700 }, { "loss": 0.5383005142211914, "grad_norm": 0.43359375, "learning_rate": 1.4674069733396277e-05, "entropy": 0.5314889302104712, "num_tokens": 37828541.0, "mean_token_accuracy": 0.8891593493521214, "epoch": 1.2323789960200282, "step": 4800 }, { "loss": 0.5413074493408203, "grad_norm": 0.41015625, "learning_rate": 1.4407921029193386e-05, "entropy": 0.5336076222360134, "num_tokens": 38617419.0, "mean_token_accuracy": 0.8887772466242313, "epoch": 1.258056233149313, "step": 4900 }, { "loss": 0.5325155639648438, "grad_norm": 0.38671875, "learning_rate": 1.4137838635317981e-05, "entropy": 0.5255533574521541, "num_tokens": 39405754.0, "mean_token_accuracy": 0.8911273476481437, "epoch": 1.283733470278598, "step": 5000 }, { "eval_loss": 0.5503326654434204, "eval_runtime": 66.9521, "eval_samples_per_second": 201.846, "eval_steps_per_second": 25.242, "eval_entropy": 0.5397688833037777, "eval_num_tokens": 39405754.0, "eval_mean_token_accuracy": 0.8876568412639686, "epoch": 1.283733470278598, "step": 5000 }, { "loss": 0.5346319198608398, "grad_norm": 0.37890625, "learning_rate": 1.3864063577032644e-05, "entropy": 0.5269782991707325, "num_tokens": 40194187.0, "mean_token_accuracy": 0.8896128107607365, "epoch": 1.309410707407883, "step": 5100 }, { "loss": 0.5403598022460937, "grad_norm": 0.40625, "learning_rate": 1.3586840174984741e-05, "entropy": 0.5327016961574554, "num_tokens": 40982840.0, "mean_token_accuracy": 0.8892854882776737, "epoch": 1.3350879445371677, "step": 5200 }, { "loss": 0.5382755661010742, "grad_norm": 0.421875, "learning_rate": 1.3306415827171285e-05, "entropy": 0.5321682307869195, "num_tokens": 41771135.0, "mean_token_accuracy": 0.8891067253053189, "epoch": 1.3607651816664528, "step": 5300 }, { "loss": 0.5331235885620117, "grad_norm": 0.380859375, "learning_rate": 1.3023040788157542e-05, "entropy": 0.5263735573738814, "num_tokens": 42560001.0, "mean_token_accuracy": 0.8908107495307922, "epoch": 1.3864424187957376, "step": 5400 }, { "loss": 0.5401896667480469, "grad_norm": 0.416015625, "learning_rate": 1.2736967945746414e-05, "entropy": 0.5332227103412152, "num_tokens": 43348221.0, "mean_token_accuracy": 0.8889072981476783, "epoch": 1.4121196559250224, "step": 5500 }, { "eval_loss": 0.5492498278617859, "eval_runtime": 67.3776, "eval_samples_per_second": 200.571, "eval_steps_per_second": 25.083, "eval_entropy": 0.537609067100745, "eval_num_tokens": 43348221.0, "eval_mean_token_accuracy": 0.8878510412379835, "epoch": 1.4121196559250224, "step": 5500 }, { "loss": 0.5376063156127929, "grad_norm": 0.404296875, "learning_rate": 1.244845259529785e-05, "entropy": 0.530102282166481, "num_tokens": 44137063.0, "mean_token_accuracy": 0.8888533094525337, "epoch": 1.4377968930543075, "step": 5600 }, { "loss": 0.5347390747070313, "grad_norm": 0.3828125, "learning_rate": 1.2157752211899743e-05, "entropy": 0.5278809563070536, "num_tokens": 44925913.0, "mean_token_accuracy": 0.890110841691494, "epoch": 1.4634741301835923, "step": 5700 }, { "loss": 0.5396208572387695, "grad_norm": 0.396484375, "learning_rate": 1.1865126220593606e-05, "entropy": 0.5324571677297354, "num_tokens": 45713041.0, "mean_token_accuracy": 0.8887992192804813, "epoch": 1.4891513673128771, "step": 5800 }, { "loss": 0.5366056060791016, "grad_norm": 0.38671875, "learning_rate": 1.157083576486007e-05, "entropy": 0.5297523141652346, "num_tokens": 46501563.0, "mean_token_accuracy": 0.8896933840215207, "epoch": 1.5148286044421622, "step": 5900 }, { "loss": 0.5389834213256836, "grad_norm": 0.43359375, "learning_rate": 1.127514347357083e-05, "entropy": 0.5312013550847768, "num_tokens": 47289330.0, "mean_token_accuracy": 0.889183943271637, "epoch": 1.5405058415714468, "step": 6000 }, { "eval_loss": 0.5483260750770569, "eval_runtime": 67.2323, "eval_samples_per_second": 201.005, "eval_steps_per_second": 25.137, "eval_entropy": 0.5363286497148536, "eval_num_tokens": 47289330.0, "eval_mean_token_accuracy": 0.8879236852628944, "epoch": 1.5405058415714468, "step": 6000 }, { "loss": 0.5336640548706054, "grad_norm": 0.43359375, "learning_rate": 1.097831322661502e-05, "entropy": 0.5288673885911703, "num_tokens": 48077668.0, "mean_token_accuracy": 0.8898833976686, "epoch": 1.5661830787007318, "step": 6100 }, { "loss": 0.5375140762329101, "grad_norm": 0.3671875, "learning_rate": 1.0680609919409147e-05, "entropy": 0.5301950394362211, "num_tokens": 48866262.0, "mean_token_accuracy": 0.8895807403326035, "epoch": 1.5918603158300166, "step": 6200 }, { "loss": 0.5402913665771485, "grad_norm": 0.431640625, "learning_rate": 1.0382299226500746e-05, "entropy": 0.5333884459733963, "num_tokens": 49654335.0, "mean_token_accuracy": 0.888499419093132, "epoch": 1.6175375529593015, "step": 6300 }, { "loss": 0.5385772323608399, "grad_norm": 0.431640625, "learning_rate": 1.0083647364476762e-05, "entropy": 0.531462483778596, "num_tokens": 50442084.0, "mean_token_accuracy": 0.8892641182243824, "epoch": 1.6432147900885865, "step": 6400 }, { "loss": 0.5353133010864258, "grad_norm": 0.38671875, "learning_rate": 9.784920854388168e-06, "entropy": 0.527387068271637, "num_tokens": 51229616.0, "mean_token_accuracy": 0.8898960523307323, "epoch": 1.6688920272178713, "step": 6500 }, { "eval_loss": 0.5475577116012573, "eval_runtime": 66.8604, "eval_samples_per_second": 202.122, "eval_steps_per_second": 25.277, "eval_entropy": 0.5349308549123403, "eval_num_tokens": 51229616.0, "eval_mean_token_accuracy": 0.8880981687610672, "epoch": 1.6688920272178713, "step": 6500 }, { "loss": 0.5429517364501953, "grad_norm": 0.380859375, "learning_rate": 9.486386283902909e-06, "entropy": 0.537000098451972, "num_tokens": 52018364.0, "mean_token_accuracy": 0.8879952408373356, "epoch": 1.6945692643471562, "step": 6600 }, { "loss": 0.53443115234375, "grad_norm": 0.447265625, "learning_rate": 9.188310069399368e-06, "entropy": 0.5280120493471623, "num_tokens": 52807100.0, "mean_token_accuracy": 0.8897963106632233, "epoch": 1.7202465014764412, "step": 6700 }, { "loss": 0.5356635665893554, "grad_norm": 0.40625, "learning_rate": 8.890958218212716e-06, "entropy": 0.5287372674047947, "num_tokens": 53594818.0, "mean_token_accuracy": 0.8895081703364849, "epoch": 1.745923738605726, "step": 6800 }, { "loss": 0.5367589569091797, "grad_norm": 0.369140625, "learning_rate": 8.594596091246282e-06, "entropy": 0.5299282168596983, "num_tokens": 54382275.0, "mean_token_accuracy": 0.8896372695267201, "epoch": 1.7716009757350109, "step": 6900 }, { "loss": 0.5325223159790039, "grad_norm": 0.3984375, "learning_rate": 8.299488166159817e-06, "entropy": 0.5243838762491941, "num_tokens": 55171058.0, "mean_token_accuracy": 0.890430568009615, "epoch": 1.797278212864296, "step": 7000 }, { "eval_loss": 0.5471183061599731, "eval_runtime": 67.121, "eval_samples_per_second": 201.338, "eval_steps_per_second": 25.178, "eval_entropy": 0.5362202901106614, "eval_num_tokens": 55171058.0, "eval_mean_token_accuracy": 0.8881139952402849, "epoch": 1.797278212864296, "step": 7000 }, { "loss": 0.5369550704956054, "grad_norm": 0.41015625, "learning_rate": 8.005897801345976e-06, "entropy": 0.5303717163205147, "num_tokens": 55960002.0, "mean_token_accuracy": 0.8894575189054013, "epoch": 1.8229554499935807, "step": 7100 }, { "loss": 0.5351543045043945, "grad_norm": 0.3671875, "learning_rate": 7.714087000905643e-06, "entropy": 0.5274955333024264, "num_tokens": 56748093.0, "mean_token_accuracy": 0.8897032625973225, "epoch": 1.8486326871228655, "step": 7200 }, { "loss": 0.5317428207397461, "grad_norm": 0.396484375, "learning_rate": 7.4243161808318465e-06, "entropy": 0.5234256482124329, "num_tokens": 57536251.0, "mean_token_accuracy": 0.8904528062045575, "epoch": 1.8743099242521506, "step": 7300 }, { "loss": 0.5373667526245117, "grad_norm": 0.462890625, "learning_rate": 7.136843936610935e-06, "entropy": 0.5299951387941837, "num_tokens": 58324650.0, "mean_token_accuracy": 0.8894152472913265, "epoch": 1.8999871613814352, "step": 7400 }, { "loss": 0.5356111907958985, "grad_norm": 0.4375, "learning_rate": 6.851926812448384e-06, "entropy": 0.5281449986249208, "num_tokens": 59112829.0, "mean_token_accuracy": 0.8894895881414413, "epoch": 1.9256643985107202, "step": 7500 }, { "eval_loss": 0.5469039082527161, "eval_runtime": 67.2936, "eval_samples_per_second": 200.821, "eval_steps_per_second": 25.114, "eval_entropy": 0.5319856126809261, "eval_num_tokens": 59112829.0, "eval_mean_token_accuracy": 0.8881882847060819, "epoch": 1.9256643985107202, "step": 7500 }, { "loss": 0.5324957275390625, "grad_norm": 0.373046875, "learning_rate": 6.569819072325195e-06, "entropy": 0.5259347888082266, "num_tokens": 59899148.0, "mean_token_accuracy": 0.8905223569273949, "epoch": 1.9513416356400053, "step": 7600 }, { "loss": 0.5345915985107422, "grad_norm": 0.396484375, "learning_rate": 6.290772473089214e-06, "entropy": 0.5266730510443449, "num_tokens": 60687114.0, "mean_token_accuracy": 0.8896890124678611, "epoch": 1.97701887276929, "step": 7700 }, { "loss": 0.5284463882446289, "grad_norm": 0.37109375, "learning_rate": 6.015036039783836e-06, "entropy": 0.5242100227717779, "num_tokens": 61470852.0, "mean_token_accuracy": 0.8908302205890867, "epoch": 2.0025677237129287, "step": 7800 }, { "loss": 0.5349670028686524, "grad_norm": 0.40234375, "learning_rate": 5.74285584341461e-06, "entropy": 0.5276301963627339, "num_tokens": 62258538.0, "mean_token_accuracy": 0.8893012750148773, "epoch": 2.0282449608422133, "step": 7900 }, { "loss": 0.5321082305908204, "grad_norm": 0.39453125, "learning_rate": 5.474474781352066e-06, "entropy": 0.5241756404936314, "num_tokens": 63047491.0, "mean_token_accuracy": 0.8900664694607258, "epoch": 2.0539221979714983, "step": 8000 }, { "eval_loss": 0.5467016696929932, "eval_runtime": 67.0874, "eval_samples_per_second": 201.439, "eval_steps_per_second": 25.191, "eval_entropy": 0.5306698962957901, "eval_num_tokens": 63047491.0, "eval_mean_token_accuracy": 0.8882128992024259, "epoch": 2.0539221979714983, "step": 8000 }, { "loss": 0.5294083023071289, "grad_norm": 0.43359375, "learning_rate": 5.210132360566756e-06, "entropy": 0.5223005886375904, "num_tokens": 63835062.0, "mean_token_accuracy": 0.8909649957716465, "epoch": 2.079599435100783, "step": 8100 }, { "loss": 0.5341888427734375, "grad_norm": 0.396484375, "learning_rate": 4.95006448388992e-06, "entropy": 0.5271259383112192, "num_tokens": 64622452.0, "mean_token_accuracy": 0.8900617562234402, "epoch": 2.105276672230068, "step": 8200 }, { "loss": 0.5306560897827148, "grad_norm": 0.455078125, "learning_rate": 4.69450323949053e-06, "entropy": 0.5243064795434474, "num_tokens": 65410929.0, "mean_token_accuracy": 0.890057615339756, "epoch": 2.130953909359353, "step": 8300 }, { "loss": 0.5324376678466797, "grad_norm": 0.3984375, "learning_rate": 4.443676693756599e-06, "entropy": 0.5259603321552276, "num_tokens": 66199581.0, "mean_token_accuracy": 0.8899568720161914, "epoch": 2.1566311464886376, "step": 8400 }, { "loss": 0.5324293899536133, "grad_norm": 0.42578125, "learning_rate": 4.197808687765592e-06, "entropy": 0.5257175669819116, "num_tokens": 66988012.0, "mean_token_accuracy": 0.8905250385403634, "epoch": 2.1823083836179227, "step": 8500 }, { "eval_loss": 0.5466301441192627, "eval_runtime": 67.1045, "eval_samples_per_second": 201.387, "eval_steps_per_second": 25.185, "eval_entropy": 0.5302212731901711, "eval_num_tokens": 66988012.0, "eval_mean_token_accuracy": 0.888240820310525, "epoch": 2.1823083836179227, "step": 8500 }, { "loss": 0.5308111572265625, "grad_norm": 0.423828125, "learning_rate": 3.957118637525545e-06, "entropy": 0.5240319129824639, "num_tokens": 67776064.0, "mean_token_accuracy": 0.8906722447276115, "epoch": 2.2079856207472077, "step": 8600 }, { "loss": 0.5349502182006836, "grad_norm": 0.384765625, "learning_rate": 3.721821338165191e-06, "entropy": 0.5290320947766304, "num_tokens": 68564254.0, "mean_token_accuracy": 0.8893519747257232, "epoch": 2.2336628578764923, "step": 8700 }, { "loss": 0.5332579040527343, "grad_norm": 0.40234375, "learning_rate": 3.4921267722478115e-06, "entropy": 0.5256545479595661, "num_tokens": 69352920.0, "mean_token_accuracy": 0.890228152424097, "epoch": 2.2593400950057774, "step": 8800 }, { "loss": 0.5388346099853516, "grad_norm": 0.41796875, "learning_rate": 3.2682399223799045e-06, "entropy": 0.5306229508668184, "num_tokens": 70141047.0, "mean_token_accuracy": 0.889285789579153, "epoch": 2.2850173321350624, "step": 8900 }, { "loss": 0.5309392929077148, "grad_norm": 0.478515625, "learning_rate": 3.0503605882818623e-06, "entropy": 0.5246253449469804, "num_tokens": 70929498.0, "mean_token_accuracy": 0.8901907150447369, "epoch": 2.310694569264347, "step": 9000 }, { "eval_loss": 0.5465638637542725, "eval_runtime": 67.4174, "eval_samples_per_second": 200.453, "eval_steps_per_second": 25.068, "eval_entropy": 0.529443961893313, "eval_num_tokens": 70929498.0, "eval_mean_token_accuracy": 0.8882127779828021, "epoch": 2.310694569264347, "step": 9000 }, { "loss": 0.5335653686523437, "grad_norm": 0.40234375, "learning_rate": 2.838683208483931e-06, "entropy": 0.5271350515633821, "num_tokens": 71718369.0, "mean_token_accuracy": 0.8902666576206684, "epoch": 2.336371806393632, "step": 9100 }, { "loss": 0.5284453964233399, "grad_norm": 0.39453125, "learning_rate": 2.633396686806604e-06, "entropy": 0.5240849039703608, "num_tokens": 72506753.0, "mean_token_accuracy": 0.8903698475658893, "epoch": 2.362049043522917, "step": 9200 }, { "loss": 0.5297767639160156, "grad_norm": 0.44921875, "learning_rate": 2.4346842237802137e-06, "entropy": 0.523836979046464, "num_tokens": 73294192.0, "mean_token_accuracy": 0.8904416194558143, "epoch": 2.3877262806522017, "step": 9300 }, { "loss": 0.5338259506225586, "grad_norm": 0.419921875, "learning_rate": 2.2427231531542605e-06, "entropy": 0.5259654937684536, "num_tokens": 74082610.0, "mean_token_accuracy": 0.8898109787702561, "epoch": 2.4134035177814868, "step": 9400 }, { "loss": 0.53539306640625, "grad_norm": 0.376953125, "learning_rate": 2.057684783642321e-06, "entropy": 0.5273031425476075, "num_tokens": 74870230.0, "mean_token_accuracy": 0.8893570882081986, "epoch": 2.439080754910772, "step": 9500 }, { "eval_loss": 0.5465569496154785, "eval_runtime": 67.2693, "eval_samples_per_second": 200.894, "eval_steps_per_second": 25.123, "eval_entropy": 0.5297543280576108, "eval_num_tokens": 74870230.0, "eval_mean_token_accuracy": 0.8882103748928161, "epoch": 2.439080754910772, "step": 9500 }, { "loss": 0.5300022888183594, "grad_norm": 0.42578125, "learning_rate": 1.8797342460437773e-06, "entropy": 0.5227815758436918, "num_tokens": 75659042.0, "mean_token_accuracy": 0.8905887195467949, "epoch": 2.4647579920400564, "step": 9600 }, { "loss": 0.5275243759155274, "grad_norm": 0.36328125, "learning_rate": 1.7090303458788138e-06, "entropy": 0.5209154675900937, "num_tokens": 76448289.0, "mean_token_accuracy": 0.891223351508379, "epoch": 2.4904352291693415, "step": 9700 }, { "loss": 0.5308470916748047, "grad_norm": 0.396484375, "learning_rate": 1.5457254216681706e-06, "entropy": 0.5242900583148002, "num_tokens": 77237168.0, "mean_token_accuracy": 0.8901365567743779, "epoch": 2.516112466298626, "step": 9800 }, { "loss": 0.5291099929809571, "grad_norm": 0.408203125, "learning_rate": 1.3899652089841475e-06, "entropy": 0.5219676418602467, "num_tokens": 78025882.0, "mean_token_accuracy": 0.8904913778603077, "epoch": 2.541789703427911, "step": 9900 }, { "loss": 0.5335563278198242, "grad_norm": 0.404296875, "learning_rate": 1.2418887103941613e-06, "entropy": 0.5248580784350634, "num_tokens": 78811967.0, "mean_token_accuracy": 0.8904596289992333, "epoch": 2.567466940557196, "step": 10000 }, { "eval_loss": 0.5465272665023804, "eval_runtime": 66.7472, "eval_samples_per_second": 202.465, "eval_steps_per_second": 25.319, "eval_entropy": 0.5304744325269609, "eval_num_tokens": 78811967.0, "eval_mean_token_accuracy": 0.8882129736905973, "epoch": 2.567466940557196, "step": 10000 }, { "loss": 0.5322509765625, "grad_norm": 0.3984375, "learning_rate": 1.10162807141293e-06, "entropy": 0.5262704361230135, "num_tokens": 79600687.0, "mean_token_accuracy": 0.8897101627290249, "epoch": 2.593144177686481, "step": 10100 }, { "loss": 0.5316716384887695, "grad_norm": 0.375, "learning_rate": 9.693084625739946e-07, "entropy": 0.5247344778478146, "num_tokens": 80387505.0, "mean_token_accuracy": 0.8906851357221603, "epoch": 2.618821414815766, "step": 10200 }, { "loss": 0.5358499526977539, "grad_norm": 0.3671875, "learning_rate": 8.450479677257962e-07, "entropy": 0.528542704731226, "num_tokens": 81176251.0, "mean_token_accuracy": 0.8893057556450367, "epoch": 2.644498651945051, "step": 10300 }, { "loss": 0.5294158172607422, "grad_norm": 0.43359375, "learning_rate": 7.289574786520237e-07, "entropy": 0.522821741476655, "num_tokens": 81964977.0, "mean_token_accuracy": 0.8909594585001469, "epoch": 2.6701758890743355, "step": 10400 }, { "loss": 0.5307606124877929, "grad_norm": 0.3828125, "learning_rate": 6.211405961102512e-07, "entropy": 0.5265567531436681, "num_tokens": 82752889.0, "mean_token_accuracy": 0.8900834138691426, "epoch": 2.6958531262036205, "step": 10500 }, { "eval_loss": 0.5465019941329956, "eval_runtime": 67.3423, "eval_samples_per_second": 200.676, "eval_steps_per_second": 25.096, "eval_entropy": 0.5300784435145248, "eval_num_tokens": 82752889.0, "eval_mean_token_accuracy": 0.8881937025919469, "epoch": 2.6958531262036205, "step": 10500 }, { "loss": 0.5329842758178711, "grad_norm": 0.376953125, "learning_rate": 5.216935373771859e-07, "entropy": 0.5243370252847671, "num_tokens": 83540835.0, "mean_token_accuracy": 0.8904808807373047, "epoch": 2.7215303633329055, "step": 10600 }, { "loss": 0.5306868362426758, "grad_norm": 0.419921875, "learning_rate": 4.307050503830457e-07, "entropy": 0.5237632666528225, "num_tokens": 84328863.0, "mean_token_accuracy": 0.890521085858345, "epoch": 2.74720760046219, "step": 10700 }, { "loss": 0.5309492874145508, "grad_norm": 0.37890625, "learning_rate": 3.482563345116763e-07, "entropy": 0.5249035055935383, "num_tokens": 85116662.0, "mean_token_accuracy": 0.8904018171131611, "epoch": 2.772884837591475, "step": 10800 }, { "loss": 0.5319928741455078, "grad_norm": 0.390625, "learning_rate": 2.7442096813709684e-07, "entropy": 0.524908444583416, "num_tokens": 85905228.0, "mean_token_accuracy": 0.8903980639576912, "epoch": 2.79856207472076, "step": 10900 }, { "loss": 0.5291864395141601, "grad_norm": 0.443359375, "learning_rate": 2.0926484296114324e-07, "entropy": 0.5247332839667798, "num_tokens": 86693156.0, "mean_token_accuracy": 0.8905435487627983, "epoch": 2.824239311850045, "step": 11000 }, { "eval_loss": 0.5464890003204346, "eval_runtime": 67.2701, "eval_samples_per_second": 200.892, "eval_steps_per_second": 25.123, "eval_entropy": 0.5298788991729183, "eval_num_tokens": 86693156.0, "eval_mean_token_accuracy": 0.8882385518776594, "epoch": 2.824239311850045, "step": 11000 }, { "loss": 0.5365339279174804, "grad_norm": 0.40625, "learning_rate": 1.5284610521080323e-07, "entropy": 0.5279695607721806, "num_tokens": 87481998.0, "mean_token_accuracy": 0.8891479071974754, "epoch": 2.84991654897933, "step": 11100 }, { "loss": 0.5289971160888672, "grad_norm": 0.380859375, "learning_rate": 1.0521510374771781e-07, "entropy": 0.5218742392957211, "num_tokens": 88271222.0, "mean_token_accuracy": 0.8910181857645512, "epoch": 2.875593786108615, "step": 11200 }, { "loss": 0.537615737915039, "grad_norm": 0.43359375, "learning_rate": 6.641434513616208e-08, "entropy": 0.5303182192891837, "num_tokens": 89059356.0, "mean_token_accuracy": 0.88922975435853, "epoch": 2.9012710232378995, "step": 11300 }, { "loss": 0.5294175338745117, "grad_norm": 0.37109375, "learning_rate": 3.6478455709598735e-08, "entropy": 0.5221536308526993, "num_tokens": 89847466.0, "mean_token_accuracy": 0.8907907168567181, "epoch": 2.9269482603671846, "step": 11400 }, { "loss": 0.5347037124633789, "grad_norm": 0.419921875, "learning_rate": 1.543415066966092e-08, "entropy": 0.5283580356836319, "num_tokens": 90636203.0, "mean_token_accuracy": 0.8894244636595249, "epoch": 2.952625497496469, "step": 11500 }, { "eval_loss": 0.5464821457862854, "eval_runtime": 66.8918, "eval_samples_per_second": 202.028, "eval_steps_per_second": 25.265, "eval_entropy": 0.5296846309180796, "eval_num_tokens": 90636203.0, "eval_mean_token_accuracy": 0.8882489815618865, "epoch": 2.952625497496469, "step": 11500 }, { "loss": 0.5325915145874024, "grad_norm": 0.43359375, "learning_rate": 3.3002102451362704e-09, "entropy": 0.528514067903161, "num_tokens": 91424572.0, "mean_token_accuracy": 0.8899151113629341, "epoch": 2.9783027346257542, "step": 11600 }, { "train_runtime": 9303.979, "train_samples_per_second": 40.182, "train_steps_per_second": 1.256, "total_flos": 9.506673059457577e+17, "train_loss": 0.6031799179292613, "entropy": 0.525621883262544, "num_tokens": 92088141.0, "mean_token_accuracy": 0.890005886554718, "epoch": 3.0, "step": 11685 } ]