{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.6268656716417915, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007462686567164179, "grad_norm": 11.35859680736035, "learning_rate": 0.0, "loss": 1.047095537185669, "num_tokens": 940173.0, "step": 1 }, { "epoch": 0.014925373134328358, "grad_norm": 11.310520487616877, "learning_rate": 5.263157894736843e-07, "loss": 1.0946075916290283, "num_tokens": 1940908.0, "step": 2 }, { "epoch": 0.022388059701492536, "grad_norm": 11.106569322922516, "learning_rate": 1.0526315789473685e-06, "loss": 1.0278105735778809, "num_tokens": 2857302.0, "step": 3 }, { "epoch": 0.029850746268656716, "grad_norm": 10.881054443812134, "learning_rate": 1.5789473684210526e-06, "loss": 1.0398736000061035, "num_tokens": 3696299.0, "step": 4 }, { "epoch": 0.03731343283582089, "grad_norm": 10.448295115598174, "learning_rate": 2.105263157894737e-06, "loss": 1.0615425109863281, "num_tokens": 4528104.0, "step": 5 }, { "epoch": 0.04477611940298507, "grad_norm": 10.151241780828355, "learning_rate": 2.631578947368421e-06, "loss": 1.0268486738204956, "num_tokens": 5554518.0, "step": 6 }, { "epoch": 0.05223880597014925, "grad_norm": 8.119312484055971, "learning_rate": 3.157894736842105e-06, "loss": 0.9329569935798645, "num_tokens": 6422948.0, "step": 7 }, { "epoch": 0.05970149253731343, "grad_norm": 7.409758964343402, "learning_rate": 3.6842105263157896e-06, "loss": 0.8917287588119507, "num_tokens": 7201431.0, "step": 8 }, { "epoch": 0.06716417910447761, "grad_norm": 5.971479536888058, "learning_rate": 4.210526315789474e-06, "loss": 0.8006043434143066, "num_tokens": 8128474.0, "step": 9 }, { "epoch": 0.07462686567164178, "grad_norm": 3.4445244902185927, "learning_rate": 4.736842105263158e-06, "loss": 0.7708431482315063, "num_tokens": 9073762.0, "step": 10 }, { "epoch": 0.08208955223880597, "grad_norm": 2.227913040407572, "learning_rate": 5.263157894736842e-06, "loss": 0.689713716506958, "num_tokens": 9950348.0, "step": 11 }, { "epoch": 0.08955223880597014, "grad_norm": 1.8665254369252244, "learning_rate": 5.789473684210527e-06, "loss": 0.7132350206375122, "num_tokens": 10884740.0, "step": 12 }, { "epoch": 0.09701492537313433, "grad_norm": 2.952404437976229, "learning_rate": 6.31578947368421e-06, "loss": 0.713362455368042, "num_tokens": 11697616.0, "step": 13 }, { "epoch": 0.1044776119402985, "grad_norm": 2.826605099421276, "learning_rate": 6.842105263157896e-06, "loss": 0.6958507895469666, "num_tokens": 12632232.0, "step": 14 }, { "epoch": 0.11194029850746269, "grad_norm": 2.4454572403082926, "learning_rate": 7.368421052631579e-06, "loss": 0.6733378171920776, "num_tokens": 13568493.0, "step": 15 }, { "epoch": 0.11940298507462686, "grad_norm": 2.0537063830263924, "learning_rate": 7.894736842105265e-06, "loss": 0.6741904020309448, "num_tokens": 14533820.0, "step": 16 }, { "epoch": 0.12686567164179105, "grad_norm": 1.4727507656008452, "learning_rate": 8.421052631578948e-06, "loss": 0.6536232829093933, "num_tokens": 15435498.0, "step": 17 }, { "epoch": 0.13432835820895522, "grad_norm": 1.054376608380898, "learning_rate": 8.947368421052632e-06, "loss": 0.6000441312789917, "num_tokens": 16351791.0, "step": 18 }, { "epoch": 0.1417910447761194, "grad_norm": 0.9835940111044099, "learning_rate": 9.473684210526315e-06, "loss": 0.6027337312698364, "num_tokens": 17276920.0, "step": 19 }, { "epoch": 0.14925373134328357, "grad_norm": 0.916308840098788, "learning_rate": 1e-05, "loss": 0.6199864149093628, "num_tokens": 18270172.0, "step": 20 }, { "epoch": 0.15671641791044777, "grad_norm": 0.6212633844448718, "learning_rate": 9.999938520216343e-06, "loss": 0.5760895609855652, "num_tokens": 19308005.0, "step": 21 }, { "epoch": 0.16417910447761194, "grad_norm": 0.5315615385439493, "learning_rate": 9.999754082545261e-06, "loss": 0.5423388481140137, "num_tokens": 20162217.0, "step": 22 }, { "epoch": 0.17164179104477612, "grad_norm": 0.5852277738108399, "learning_rate": 9.999446692026396e-06, "loss": 0.5618520975112915, "num_tokens": 20980497.0, "step": 23 }, { "epoch": 0.1791044776119403, "grad_norm": 0.5256536336611786, "learning_rate": 9.999016357058996e-06, "loss": 0.5482994914054871, "num_tokens": 21857362.0, "step": 24 }, { "epoch": 0.1865671641791045, "grad_norm": 0.436253543862231, "learning_rate": 9.99846308940168e-06, "loss": 0.5038638710975647, "num_tokens": 22792620.0, "step": 25 }, { "epoch": 0.19402985074626866, "grad_norm": 0.47872306271108794, "learning_rate": 9.997786904172126e-06, "loss": 0.5729074478149414, "num_tokens": 23723110.0, "step": 26 }, { "epoch": 0.20149253731343283, "grad_norm": 0.3887165593913177, "learning_rate": 9.996987819846656e-06, "loss": 0.5251473188400269, "num_tokens": 24725024.0, "step": 27 }, { "epoch": 0.208955223880597, "grad_norm": 0.4864210479565411, "learning_rate": 9.996065858259729e-06, "loss": 0.560759425163269, "num_tokens": 25729987.0, "step": 28 }, { "epoch": 0.21641791044776118, "grad_norm": 0.4545327828204722, "learning_rate": 9.995021044603343e-06, "loss": 0.5304505825042725, "num_tokens": 26557013.0, "step": 29 }, { "epoch": 0.22388059701492538, "grad_norm": 0.369912070212526, "learning_rate": 9.993853407426353e-06, "loss": 0.5103640556335449, "num_tokens": 27503464.0, "step": 30 }, { "epoch": 0.23134328358208955, "grad_norm": 0.32843421942348455, "learning_rate": 9.99256297863368e-06, "loss": 0.5005761384963989, "num_tokens": 28533732.0, "step": 31 }, { "epoch": 0.23880597014925373, "grad_norm": 0.36571377121484666, "learning_rate": 9.991149793485453e-06, "loss": 0.5339782238006592, "num_tokens": 29340667.0, "step": 32 }, { "epoch": 0.2462686567164179, "grad_norm": 0.3706600251055638, "learning_rate": 9.989613890596034e-06, "loss": 0.5353128910064697, "num_tokens": 30210961.0, "step": 33 }, { "epoch": 0.2537313432835821, "grad_norm": 0.3689913973205178, "learning_rate": 9.987955311932968e-06, "loss": 0.5166599750518799, "num_tokens": 31101886.0, "step": 34 }, { "epoch": 0.26119402985074625, "grad_norm": 0.33967789101967927, "learning_rate": 9.986174102815837e-06, "loss": 0.5018597841262817, "num_tokens": 31897310.0, "step": 35 }, { "epoch": 0.26865671641791045, "grad_norm": 0.34077171626781105, "learning_rate": 9.984270311915019e-06, "loss": 0.48667871952056885, "num_tokens": 32540943.0, "step": 36 }, { "epoch": 0.27611940298507465, "grad_norm": 0.3621091474207233, "learning_rate": 9.982243991250359e-06, "loss": 0.5088210105895996, "num_tokens": 33542067.0, "step": 37 }, { "epoch": 0.2835820895522388, "grad_norm": 0.3534080682731624, "learning_rate": 9.980095196189748e-06, "loss": 0.4913540482521057, "num_tokens": 34504224.0, "step": 38 }, { "epoch": 0.291044776119403, "grad_norm": 0.34385148887540573, "learning_rate": 9.977823985447613e-06, "loss": 0.5291423797607422, "num_tokens": 35410799.0, "step": 39 }, { "epoch": 0.29850746268656714, "grad_norm": 0.3614616882970318, "learning_rate": 9.975430421083307e-06, "loss": 0.5238292217254639, "num_tokens": 36306291.0, "step": 40 }, { "epoch": 0.30597014925373134, "grad_norm": 0.34380854428467267, "learning_rate": 9.972914568499412e-06, "loss": 0.49555328488349915, "num_tokens": 37195796.0, "step": 41 }, { "epoch": 0.31343283582089554, "grad_norm": 0.32872739996760125, "learning_rate": 9.970276496439967e-06, "loss": 0.48128455877304077, "num_tokens": 38111088.0, "step": 42 }, { "epoch": 0.3208955223880597, "grad_norm": 0.32224419409640415, "learning_rate": 9.967516276988569e-06, "loss": 0.47381213307380676, "num_tokens": 38854783.0, "step": 43 }, { "epoch": 0.3283582089552239, "grad_norm": 0.313605152437139, "learning_rate": 9.964633985566412e-06, "loss": 0.4922352433204651, "num_tokens": 39832057.0, "step": 44 }, { "epoch": 0.3358208955223881, "grad_norm": 0.3221801938329887, "learning_rate": 9.961629700930236e-06, "loss": 0.5065716505050659, "num_tokens": 40758959.0, "step": 45 }, { "epoch": 0.34328358208955223, "grad_norm": 0.34336243037288433, "learning_rate": 9.958503505170158e-06, "loss": 0.4985169470310211, "num_tokens": 41744543.0, "step": 46 }, { "epoch": 0.35074626865671643, "grad_norm": 0.323405267106758, "learning_rate": 9.95525548370744e-06, "loss": 0.4811803996562958, "num_tokens": 42685398.0, "step": 47 }, { "epoch": 0.3582089552238806, "grad_norm": 0.3472754733495145, "learning_rate": 9.951885725292152e-06, "loss": 0.4971832036972046, "num_tokens": 43509328.0, "step": 48 }, { "epoch": 0.3656716417910448, "grad_norm": 0.30314939517994505, "learning_rate": 9.948394322000747e-06, "loss": 0.4676430821418762, "num_tokens": 44360961.0, "step": 49 }, { "epoch": 0.373134328358209, "grad_norm": 0.3115400700181878, "learning_rate": 9.944781369233544e-06, "loss": 0.4450893700122833, "num_tokens": 45215408.0, "step": 50 }, { "epoch": 0.3805970149253731, "grad_norm": 0.3274967224701377, "learning_rate": 9.941046965712124e-06, "loss": 0.4661027491092682, "num_tokens": 46008801.0, "step": 51 }, { "epoch": 0.3880597014925373, "grad_norm": 0.3185260501598265, "learning_rate": 9.937191213476627e-06, "loss": 0.45998284220695496, "num_tokens": 46857304.0, "step": 52 }, { "epoch": 0.39552238805970147, "grad_norm": 0.3187630499897143, "learning_rate": 9.933214217882973e-06, "loss": 0.49932676553726196, "num_tokens": 47835515.0, "step": 53 }, { "epoch": 0.40298507462686567, "grad_norm": 0.3126440220395918, "learning_rate": 9.929116087599973e-06, "loss": 0.49588972330093384, "num_tokens": 48834826.0, "step": 54 }, { "epoch": 0.41044776119402987, "grad_norm": 0.31909099806625735, "learning_rate": 9.924896934606365e-06, "loss": 0.49547284841537476, "num_tokens": 49858718.0, "step": 55 }, { "epoch": 0.417910447761194, "grad_norm": 0.2999327415505548, "learning_rate": 9.920556874187757e-06, "loss": 0.45831602811813354, "num_tokens": 50784650.0, "step": 56 }, { "epoch": 0.4253731343283582, "grad_norm": 0.33478138187870804, "learning_rate": 9.91609602493347e-06, "loss": 0.44470953941345215, "num_tokens": 51788903.0, "step": 57 }, { "epoch": 0.43283582089552236, "grad_norm": 0.3098385124963181, "learning_rate": 9.911514508733307e-06, "loss": 0.48413345217704773, "num_tokens": 52740886.0, "step": 58 }, { "epoch": 0.44029850746268656, "grad_norm": 0.31570000266376347, "learning_rate": 9.906812450774207e-06, "loss": 0.5016104578971863, "num_tokens": 53671576.0, "step": 59 }, { "epoch": 0.44776119402985076, "grad_norm": 0.3184241179650494, "learning_rate": 9.901989979536841e-06, "loss": 0.4333784580230713, "num_tokens": 54565325.0, "step": 60 }, { "epoch": 0.4552238805970149, "grad_norm": 0.3257766657124954, "learning_rate": 9.897047226792093e-06, "loss": 0.47651222348213196, "num_tokens": 55458901.0, "step": 61 }, { "epoch": 0.4626865671641791, "grad_norm": 0.2817242291155619, "learning_rate": 9.891984327597462e-06, "loss": 0.4714818000793457, "num_tokens": 56519373.0, "step": 62 }, { "epoch": 0.4701492537313433, "grad_norm": 0.32585513855646564, "learning_rate": 9.886801420293365e-06, "loss": 0.4708700180053711, "num_tokens": 57420562.0, "step": 63 }, { "epoch": 0.47761194029850745, "grad_norm": 0.32958409535328365, "learning_rate": 9.88149864649937e-06, "loss": 0.49606209993362427, "num_tokens": 58259052.0, "step": 64 }, { "epoch": 0.48507462686567165, "grad_norm": 0.31230811419608556, "learning_rate": 9.876076151110313e-06, "loss": 0.4840630888938904, "num_tokens": 59121922.0, "step": 65 }, { "epoch": 0.4925373134328358, "grad_norm": 0.31050271225919246, "learning_rate": 9.870534082292349e-06, "loss": 0.4600119888782501, "num_tokens": 60031785.0, "step": 66 }, { "epoch": 0.5, "grad_norm": 0.2885380845506061, "learning_rate": 9.864872591478895e-06, "loss": 0.44136810302734375, "num_tokens": 60972704.0, "step": 67 }, { "epoch": 0.5074626865671642, "grad_norm": 0.28887203572406756, "learning_rate": 9.859091833366498e-06, "loss": 0.4619043469429016, "num_tokens": 61912202.0, "step": 68 }, { "epoch": 0.5149253731343284, "grad_norm": 0.297913211640831, "learning_rate": 9.853191965910606e-06, "loss": 0.48681432008743286, "num_tokens": 62799081.0, "step": 69 }, { "epoch": 0.5223880597014925, "grad_norm": 0.2978081791490928, "learning_rate": 9.847173150321252e-06, "loss": 0.4710129499435425, "num_tokens": 63821360.0, "step": 70 }, { "epoch": 0.5298507462686567, "grad_norm": 0.33901428896502994, "learning_rate": 9.84103555105865e-06, "loss": 0.46070268750190735, "num_tokens": 64698236.0, "step": 71 }, { "epoch": 0.5373134328358209, "grad_norm": 0.2863724536535567, "learning_rate": 9.8347793358287e-06, "loss": 0.43551623821258545, "num_tokens": 65531533.0, "step": 72 }, { "epoch": 0.5447761194029851, "grad_norm": 0.30884498358581325, "learning_rate": 9.828404675578405e-06, "loss": 0.43174412846565247, "num_tokens": 66409682.0, "step": 73 }, { "epoch": 0.5522388059701493, "grad_norm": 0.39653106497260543, "learning_rate": 9.821911744491203e-06, "loss": 0.47224926948547363, "num_tokens": 67201739.0, "step": 74 }, { "epoch": 0.5597014925373134, "grad_norm": 0.34427781009373076, "learning_rate": 9.815300719982204e-06, "loss": 0.46234217286109924, "num_tokens": 68054610.0, "step": 75 }, { "epoch": 0.5671641791044776, "grad_norm": 0.28593313207513976, "learning_rate": 9.808571782693345e-06, "loss": 0.4445508122444153, "num_tokens": 68905436.0, "step": 76 }, { "epoch": 0.5746268656716418, "grad_norm": 0.27754253103287374, "learning_rate": 9.80172511648845e-06, "loss": 0.4535985291004181, "num_tokens": 69815159.0, "step": 77 }, { "epoch": 0.582089552238806, "grad_norm": 0.2751626726169941, "learning_rate": 9.794760908448215e-06, "loss": 0.4778493642807007, "num_tokens": 70800960.0, "step": 78 }, { "epoch": 0.5895522388059702, "grad_norm": 0.2878195146653705, "learning_rate": 9.787679348865082e-06, "loss": 0.43559134006500244, "num_tokens": 71706284.0, "step": 79 }, { "epoch": 0.5970149253731343, "grad_norm": 0.3046702186252135, "learning_rate": 9.780480631238052e-06, "loss": 0.45745372772216797, "num_tokens": 72585611.0, "step": 80 }, { "epoch": 0.6044776119402985, "grad_norm": 0.2580161347993156, "learning_rate": 9.773164952267394e-06, "loss": 0.44172853231430054, "num_tokens": 73603712.0, "step": 81 }, { "epoch": 0.6119402985074627, "grad_norm": 0.31823458045045494, "learning_rate": 9.765732511849269e-06, "loss": 0.4543741047382355, "num_tokens": 74510353.0, "step": 82 }, { "epoch": 0.6194029850746269, "grad_norm": 0.3262276808903542, "learning_rate": 9.758183513070266e-06, "loss": 0.48102468252182007, "num_tokens": 75426311.0, "step": 83 }, { "epoch": 0.6268656716417911, "grad_norm": 0.298246592306743, "learning_rate": 9.750518162201858e-06, "loss": 0.45155635476112366, "num_tokens": 76290512.0, "step": 84 }, { "epoch": 0.6343283582089553, "grad_norm": 0.30840978846450423, "learning_rate": 9.74273666869476e-06, "loss": 0.4398882985115051, "num_tokens": 77207410.0, "step": 85 }, { "epoch": 0.6417910447761194, "grad_norm": 0.2986447882814022, "learning_rate": 9.734839245173213e-06, "loss": 0.43722379207611084, "num_tokens": 78061170.0, "step": 86 }, { "epoch": 0.6492537313432836, "grad_norm": 0.3213308600234638, "learning_rate": 9.726826107429168e-06, "loss": 0.44796180725097656, "num_tokens": 78868118.0, "step": 87 }, { "epoch": 0.6567164179104478, "grad_norm": 0.3249532753373927, "learning_rate": 9.71869747441639e-06, "loss": 0.4503297805786133, "num_tokens": 79869363.0, "step": 88 }, { "epoch": 0.664179104477612, "grad_norm": 0.5892356895414527, "learning_rate": 9.71045356824448e-06, "loss": 0.4414302110671997, "num_tokens": 80709876.0, "step": 89 }, { "epoch": 0.6716417910447762, "grad_norm": 0.32884534307528746, "learning_rate": 9.7020946141728e-06, "loss": 0.42054399847984314, "num_tokens": 81535856.0, "step": 90 }, { "epoch": 0.6791044776119403, "grad_norm": 0.2754517512669749, "learning_rate": 9.693620840604326e-06, "loss": 0.4349040985107422, "num_tokens": 82583455.0, "step": 91 }, { "epoch": 0.6865671641791045, "grad_norm": 0.3190387165435769, "learning_rate": 9.685032479079394e-06, "loss": 0.44351187348365784, "num_tokens": 83425036.0, "step": 92 }, { "epoch": 0.6940298507462687, "grad_norm": 0.29203678336341016, "learning_rate": 9.676329764269385e-06, "loss": 0.4587559103965759, "num_tokens": 84446952.0, "step": 93 }, { "epoch": 0.7014925373134329, "grad_norm": 0.2977218953461726, "learning_rate": 9.667512933970315e-06, "loss": 0.429887980222702, "num_tokens": 85254048.0, "step": 94 }, { "epoch": 0.7089552238805971, "grad_norm": 0.319328445980617, "learning_rate": 9.65858222909632e-06, "loss": 0.4590649902820587, "num_tokens": 86163467.0, "step": 95 }, { "epoch": 0.7164179104477612, "grad_norm": 0.5444784762173913, "learning_rate": 9.649537893673096e-06, "loss": 0.4472053647041321, "num_tokens": 86980140.0, "step": 96 }, { "epoch": 0.7238805970149254, "grad_norm": 0.33070572527793457, "learning_rate": 9.640380174831209e-06, "loss": 0.44589415192604065, "num_tokens": 87928454.0, "step": 97 }, { "epoch": 0.7313432835820896, "grad_norm": 0.31480720093895037, "learning_rate": 9.631109322799362e-06, "loss": 0.45890533924102783, "num_tokens": 88687125.0, "step": 98 }, { "epoch": 0.7388059701492538, "grad_norm": 0.3045515849614143, "learning_rate": 9.621725590897544e-06, "loss": 0.4472447633743286, "num_tokens": 89545040.0, "step": 99 }, { "epoch": 0.746268656716418, "grad_norm": 0.31053505819411625, "learning_rate": 9.61222923553011e-06, "loss": 0.44827064871788025, "num_tokens": 90294885.0, "step": 100 }, { "epoch": 0.753731343283582, "grad_norm": 0.3029175634429252, "learning_rate": 9.60262051617879e-06, "loss": 0.4412766695022583, "num_tokens": 91184198.0, "step": 101 }, { "epoch": 0.7611940298507462, "grad_norm": 0.31643279761949383, "learning_rate": 9.592899695395569e-06, "loss": 0.4483514428138733, "num_tokens": 91984545.0, "step": 102 }, { "epoch": 0.7686567164179104, "grad_norm": 0.29772953486777926, "learning_rate": 9.583067038795547e-06, "loss": 0.48575955629348755, "num_tokens": 92895986.0, "step": 103 }, { "epoch": 0.7761194029850746, "grad_norm": 0.3103900650504769, "learning_rate": 9.57312281504965e-06, "loss": 0.4450864791870117, "num_tokens": 93788383.0, "step": 104 }, { "epoch": 0.7835820895522388, "grad_norm": 0.2842262724404981, "learning_rate": 9.563067295877319e-06, "loss": 0.4178208112716675, "num_tokens": 94636525.0, "step": 105 }, { "epoch": 0.7910447761194029, "grad_norm": 0.318233292303752, "learning_rate": 9.552900756039057e-06, "loss": 0.48816001415252686, "num_tokens": 95397416.0, "step": 106 }, { "epoch": 0.7985074626865671, "grad_norm": 0.3031459599411157, "learning_rate": 9.54262347332894e-06, "loss": 0.4687079191207886, "num_tokens": 96224288.0, "step": 107 }, { "epoch": 0.8059701492537313, "grad_norm": 0.3044834471531261, "learning_rate": 9.532235728567025e-06, "loss": 0.4333556890487671, "num_tokens": 97053744.0, "step": 108 }, { "epoch": 0.8134328358208955, "grad_norm": 0.382174488436462, "learning_rate": 9.521737805591662e-06, "loss": 0.45386844873428345, "num_tokens": 97941243.0, "step": 109 }, { "epoch": 0.8208955223880597, "grad_norm": 0.29853935870773984, "learning_rate": 9.511129991251755e-06, "loss": 0.4180367588996887, "num_tokens": 98814023.0, "step": 110 }, { "epoch": 0.8283582089552238, "grad_norm": 0.3152812743712433, "learning_rate": 9.500412575398923e-06, "loss": 0.45900076627731323, "num_tokens": 99770911.0, "step": 111 }, { "epoch": 0.835820895522388, "grad_norm": 0.2798327916645599, "learning_rate": 9.489585850879565e-06, "loss": 0.4589983820915222, "num_tokens": 100802886.0, "step": 112 }, { "epoch": 0.8432835820895522, "grad_norm": 0.3302819245429099, "learning_rate": 9.478650113526875e-06, "loss": 0.44858676195144653, "num_tokens": 101744970.0, "step": 113 }, { "epoch": 0.8507462686567164, "grad_norm": 0.29962088349132515, "learning_rate": 9.467605662152746e-06, "loss": 0.4746031165122986, "num_tokens": 102730722.0, "step": 114 }, { "epoch": 0.8582089552238806, "grad_norm": 0.2939144591705004, "learning_rate": 9.456452798539617e-06, "loss": 0.4174093008041382, "num_tokens": 103574949.0, "step": 115 }, { "epoch": 0.8656716417910447, "grad_norm": 0.3825239836099086, "learning_rate": 9.445191827432216e-06, "loss": 0.439868301153183, "num_tokens": 104504791.0, "step": 116 }, { "epoch": 0.8731343283582089, "grad_norm": 0.30386076772048964, "learning_rate": 9.433823056529241e-06, "loss": 0.47291260957717896, "num_tokens": 105479834.0, "step": 117 }, { "epoch": 0.8805970149253731, "grad_norm": 0.2762720558491326, "learning_rate": 9.42234679647495e-06, "loss": 0.4426780045032501, "num_tokens": 106438084.0, "step": 118 }, { "epoch": 0.8880597014925373, "grad_norm": 0.3057508592926945, "learning_rate": 9.410763360850666e-06, "loss": 0.4623616933822632, "num_tokens": 107262750.0, "step": 119 }, { "epoch": 0.8955223880597015, "grad_norm": 0.3127855621446368, "learning_rate": 9.399073066166218e-06, "loss": 0.4572855234146118, "num_tokens": 108143548.0, "step": 120 }, { "epoch": 0.9029850746268657, "grad_norm": 0.32166755849704814, "learning_rate": 9.387276231851292e-06, "loss": 0.4610549211502075, "num_tokens": 109031239.0, "step": 121 }, { "epoch": 0.9104477611940298, "grad_norm": 0.308391680528446, "learning_rate": 9.375373180246698e-06, "loss": 0.4695647358894348, "num_tokens": 109986382.0, "step": 122 }, { "epoch": 0.917910447761194, "grad_norm": 0.2975657588114746, "learning_rate": 9.363364236595561e-06, "loss": 0.47796621918678284, "num_tokens": 110966120.0, "step": 123 }, { "epoch": 0.9253731343283582, "grad_norm": 0.31052979583373397, "learning_rate": 9.351249729034441e-06, "loss": 0.46253445744514465, "num_tokens": 111841748.0, "step": 124 }, { "epoch": 0.9328358208955224, "grad_norm": 0.30804176635348807, "learning_rate": 9.339029988584364e-06, "loss": 0.45033249258995056, "num_tokens": 112797621.0, "step": 125 }, { "epoch": 0.9402985074626866, "grad_norm": 0.2896323126815727, "learning_rate": 9.326705349141772e-06, "loss": 0.46928197145462036, "num_tokens": 113854322.0, "step": 126 }, { "epoch": 0.9477611940298507, "grad_norm": 0.2863377703738466, "learning_rate": 9.31427614746941e-06, "loss": 0.44036608934402466, "num_tokens": 114797592.0, "step": 127 }, { "epoch": 0.9552238805970149, "grad_norm": 0.3136460841921916, "learning_rate": 9.301742723187106e-06, "loss": 0.4462299644947052, "num_tokens": 115756574.0, "step": 128 }, { "epoch": 0.9626865671641791, "grad_norm": 0.30712216569223755, "learning_rate": 9.289105418762512e-06, "loss": 0.46634775400161743, "num_tokens": 116620827.0, "step": 129 }, { "epoch": 0.9701492537313433, "grad_norm": 0.30150157073298506, "learning_rate": 9.276364579501743e-06, "loss": 0.4525374174118042, "num_tokens": 117496028.0, "step": 130 }, { "epoch": 0.9776119402985075, "grad_norm": 0.2863498319159055, "learning_rate": 9.263520553539919e-06, "loss": 0.43308988213539124, "num_tokens": 118326101.0, "step": 131 }, { "epoch": 0.9850746268656716, "grad_norm": 0.31739713823558746, "learning_rate": 9.250573691831688e-06, "loss": 0.4591742753982544, "num_tokens": 119217901.0, "step": 132 }, { "epoch": 0.9925373134328358, "grad_norm": 0.3107389978804748, "learning_rate": 9.2375243481416e-06, "loss": 0.4491395056247711, "num_tokens": 120120192.0, "step": 133 }, { "epoch": 1.0, "grad_norm": 0.29934735002842794, "learning_rate": 9.224372879034471e-06, "loss": 0.44749873876571655, "num_tokens": 121051485.0, "step": 134 }, { "epoch": 1.007462686567164, "grad_norm": 0.33488387869414854, "learning_rate": 9.211119643865626e-06, "loss": 0.4307776689529419, "num_tokens": 121991896.0, "step": 135 }, { "epoch": 1.0149253731343284, "grad_norm": 0.32499655410029626, "learning_rate": 9.197765004771074e-06, "loss": 0.4204443097114563, "num_tokens": 122819690.0, "step": 136 }, { "epoch": 1.0223880597014925, "grad_norm": 0.34181089478733623, "learning_rate": 9.184309326657627e-06, "loss": 0.41079288721084595, "num_tokens": 123657032.0, "step": 137 }, { "epoch": 1.0298507462686568, "grad_norm": 0.5825488788426431, "learning_rate": 9.17075297719292e-06, "loss": 0.4082901179790497, "num_tokens": 124550556.0, "step": 138 }, { "epoch": 1.037313432835821, "grad_norm": 1.1799244713672623, "learning_rate": 9.157096326795369e-06, "loss": 0.42325854301452637, "num_tokens": 125328617.0, "step": 139 }, { "epoch": 1.044776119402985, "grad_norm": 0.3981431547057968, "learning_rate": 9.143339748624044e-06, "loss": 0.40712812542915344, "num_tokens": 126306594.0, "step": 140 }, { "epoch": 1.0522388059701493, "grad_norm": 0.32884099051410826, "learning_rate": 9.129483618568478e-06, "loss": 0.4147931933403015, "num_tokens": 127215038.0, "step": 141 }, { "epoch": 1.0597014925373134, "grad_norm": 0.3071551975535917, "learning_rate": 9.115528315238396e-06, "loss": 0.4247783422470093, "num_tokens": 128054129.0, "step": 142 }, { "epoch": 1.0671641791044777, "grad_norm": 0.3132240777032372, "learning_rate": 9.101474219953367e-06, "loss": 0.4133056104183197, "num_tokens": 128952014.0, "step": 143 }, { "epoch": 1.0746268656716418, "grad_norm": 0.31895939410654406, "learning_rate": 9.087321716732384e-06, "loss": 0.4213321805000305, "num_tokens": 129774041.0, "step": 144 }, { "epoch": 1.0820895522388059, "grad_norm": 0.32304487832880724, "learning_rate": 9.073071192283374e-06, "loss": 0.4195047616958618, "num_tokens": 130656187.0, "step": 145 }, { "epoch": 1.0895522388059702, "grad_norm": 0.31668877560620456, "learning_rate": 9.058723035992632e-06, "loss": 0.4216320514678955, "num_tokens": 131546421.0, "step": 146 }, { "epoch": 1.0970149253731343, "grad_norm": 0.30109857359574926, "learning_rate": 9.044277639914177e-06, "loss": 0.4255885183811188, "num_tokens": 132482644.0, "step": 147 }, { "epoch": 1.1044776119402986, "grad_norm": 0.28611352244816046, "learning_rate": 9.029735398759044e-06, "loss": 0.4004859924316406, "num_tokens": 133363098.0, "step": 148 }, { "epoch": 1.1119402985074627, "grad_norm": 0.3246541214309705, "learning_rate": 9.015096709884493e-06, "loss": 0.41801226139068604, "num_tokens": 134281169.0, "step": 149 }, { "epoch": 1.1194029850746268, "grad_norm": 0.39523810160114464, "learning_rate": 9.00036197328316e-06, "loss": 0.39403271675109863, "num_tokens": 135132326.0, "step": 150 }, { "epoch": 1.126865671641791, "grad_norm": 0.3372219635650443, "learning_rate": 8.985531591572117e-06, "loss": 0.40995997190475464, "num_tokens": 136009199.0, "step": 151 }, { "epoch": 1.1343283582089552, "grad_norm": 0.2880187226242739, "learning_rate": 8.97060596998188e-06, "loss": 0.44250696897506714, "num_tokens": 136974761.0, "step": 152 }, { "epoch": 1.1417910447761195, "grad_norm": 0.2840439662929065, "learning_rate": 8.955585516345333e-06, "loss": 0.41125112771987915, "num_tokens": 137953131.0, "step": 153 }, { "epoch": 1.1492537313432836, "grad_norm": 0.30854018310336556, "learning_rate": 8.940470641086583e-06, "loss": 0.41466018557548523, "num_tokens": 138890202.0, "step": 154 }, { "epoch": 1.1567164179104479, "grad_norm": 0.2861522107018775, "learning_rate": 8.925261757209744e-06, "loss": 0.4421645998954773, "num_tokens": 139921851.0, "step": 155 }, { "epoch": 1.164179104477612, "grad_norm": 0.30184466401361404, "learning_rate": 8.909959280287657e-06, "loss": 0.41726770997047424, "num_tokens": 140840212.0, "step": 156 }, { "epoch": 1.171641791044776, "grad_norm": 0.29786414496705443, "learning_rate": 8.894563628450534e-06, "loss": 0.4137997627258301, "num_tokens": 141681181.0, "step": 157 }, { "epoch": 1.1791044776119404, "grad_norm": 0.27612956474353256, "learning_rate": 8.879075222374522e-06, "loss": 0.3967845141887665, "num_tokens": 142603331.0, "step": 158 }, { "epoch": 1.1865671641791045, "grad_norm": 0.2936198747641151, "learning_rate": 8.863494485270228e-06, "loss": 0.3882240355014801, "num_tokens": 143438386.0, "step": 159 }, { "epoch": 1.1940298507462686, "grad_norm": 0.28750782577222145, "learning_rate": 8.847821842871137e-06, "loss": 0.42263633012771606, "num_tokens": 144352522.0, "step": 160 }, { "epoch": 1.2014925373134329, "grad_norm": 0.32255178451364774, "learning_rate": 8.832057723421989e-06, "loss": 0.42398497462272644, "num_tokens": 145160558.0, "step": 161 }, { "epoch": 1.208955223880597, "grad_norm": 0.32016607068719616, "learning_rate": 8.816202557667076e-06, "loss": 0.40889400243759155, "num_tokens": 145970221.0, "step": 162 }, { "epoch": 1.2164179104477613, "grad_norm": 0.30212941397274007, "learning_rate": 8.800256778838468e-06, "loss": 0.3960338234901428, "num_tokens": 146893310.0, "step": 163 }, { "epoch": 1.2238805970149254, "grad_norm": 0.31197303744834676, "learning_rate": 8.78422082264418e-06, "loss": 0.44305476546287537, "num_tokens": 147701963.0, "step": 164 }, { "epoch": 1.2313432835820897, "grad_norm": 0.2823293130053843, "learning_rate": 8.768095127256263e-06, "loss": 0.3833114206790924, "num_tokens": 148634179.0, "step": 165 }, { "epoch": 1.2388059701492538, "grad_norm": 0.2811151003410808, "learning_rate": 8.751880133298834e-06, "loss": 0.4171923100948334, "num_tokens": 149594443.0, "step": 166 }, { "epoch": 1.2462686567164178, "grad_norm": 0.31565679619489956, "learning_rate": 8.735576283836039e-06, "loss": 0.43264657258987427, "num_tokens": 150495465.0, "step": 167 }, { "epoch": 1.2537313432835822, "grad_norm": 0.3023001398731657, "learning_rate": 8.719184024359935e-06, "loss": 0.4185860753059387, "num_tokens": 151402535.0, "step": 168 }, { "epoch": 1.2611940298507462, "grad_norm": 0.3114367097991156, "learning_rate": 8.702703802778332e-06, "loss": 0.444894403219223, "num_tokens": 152354215.0, "step": 169 }, { "epoch": 1.2686567164179103, "grad_norm": 0.3130958107073367, "learning_rate": 8.686136069402542e-06, "loss": 0.3862420916557312, "num_tokens": 153135819.0, "step": 170 }, { "epoch": 1.2761194029850746, "grad_norm": 0.32026467648986173, "learning_rate": 8.669481276935085e-06, "loss": 0.43771523237228394, "num_tokens": 154060950.0, "step": 171 }, { "epoch": 1.2835820895522387, "grad_norm": 0.33753040760769915, "learning_rate": 8.652739880457309e-06, "loss": 0.4314393401145935, "num_tokens": 154999582.0, "step": 172 }, { "epoch": 1.291044776119403, "grad_norm": 0.31404977555481944, "learning_rate": 8.635912337416963e-06, "loss": 0.4238457679748535, "num_tokens": 155889540.0, "step": 173 }, { "epoch": 1.2985074626865671, "grad_norm": 0.2917828706410469, "learning_rate": 8.618999107615694e-06, "loss": 0.4157620072364807, "num_tokens": 156887223.0, "step": 174 }, { "epoch": 1.3059701492537314, "grad_norm": 0.2929002597150211, "learning_rate": 8.602000653196484e-06, "loss": 0.4093779921531677, "num_tokens": 157776705.0, "step": 175 }, { "epoch": 1.3134328358208955, "grad_norm": 0.2981368517552101, "learning_rate": 8.584917438631022e-06, "loss": 0.4151228070259094, "num_tokens": 158724790.0, "step": 176 }, { "epoch": 1.3208955223880596, "grad_norm": 0.307459834676784, "learning_rate": 8.567749930707012e-06, "loss": 0.42905163764953613, "num_tokens": 159719326.0, "step": 177 }, { "epoch": 1.328358208955224, "grad_norm": 0.3174851983597954, "learning_rate": 8.55049859851542e-06, "loss": 0.44639986753463745, "num_tokens": 160650411.0, "step": 178 }, { "epoch": 1.335820895522388, "grad_norm": 0.37310729673210785, "learning_rate": 8.533163913437657e-06, "loss": 0.4070381820201874, "num_tokens": 161685151.0, "step": 179 }, { "epoch": 1.3432835820895521, "grad_norm": 0.34243880652688075, "learning_rate": 8.515746349132693e-06, "loss": 0.40524742007255554, "num_tokens": 162668291.0, "step": 180 }, { "epoch": 1.3507462686567164, "grad_norm": 0.3314697629279733, "learning_rate": 8.498246381524123e-06, "loss": 0.39374542236328125, "num_tokens": 163602019.0, "step": 181 }, { "epoch": 1.3582089552238805, "grad_norm": 0.39714424307879675, "learning_rate": 8.480664488787157e-06, "loss": 0.41536325216293335, "num_tokens": 164374987.0, "step": 182 }, { "epoch": 1.3656716417910448, "grad_norm": 0.30470654817019394, "learning_rate": 8.463001151335556e-06, "loss": 0.420206755399704, "num_tokens": 165277351.0, "step": 183 }, { "epoch": 1.373134328358209, "grad_norm": 0.30147269826178413, "learning_rate": 8.445256851808504e-06, "loss": 0.40577423572540283, "num_tokens": 166179864.0, "step": 184 }, { "epoch": 1.3805970149253732, "grad_norm": 0.3160553991473881, "learning_rate": 8.427432075057422e-06, "loss": 0.3979928195476532, "num_tokens": 167127067.0, "step": 185 }, { "epoch": 1.3880597014925373, "grad_norm": 0.31665903933128287, "learning_rate": 8.409527308132717e-06, "loss": 0.4436604976654053, "num_tokens": 168100947.0, "step": 186 }, { "epoch": 1.3955223880597014, "grad_norm": 0.296181555140025, "learning_rate": 8.391543040270477e-06, "loss": 0.42373591661453247, "num_tokens": 168977100.0, "step": 187 }, { "epoch": 1.4029850746268657, "grad_norm": 0.340781706854354, "learning_rate": 8.373479762879104e-06, "loss": 0.4242423474788666, "num_tokens": 169809036.0, "step": 188 }, { "epoch": 1.4104477611940298, "grad_norm": 0.2912347476979519, "learning_rate": 8.355337969525876e-06, "loss": 0.3881043791770935, "num_tokens": 170799001.0, "step": 189 }, { "epoch": 1.417910447761194, "grad_norm": 0.3167891630018227, "learning_rate": 8.337118155923474e-06, "loss": 0.417064368724823, "num_tokens": 171563636.0, "step": 190 }, { "epoch": 1.4253731343283582, "grad_norm": 0.32116936347486175, "learning_rate": 8.318820819916433e-06, "loss": 0.40856266021728516, "num_tokens": 172297711.0, "step": 191 }, { "epoch": 1.4328358208955223, "grad_norm": 0.3019887016574649, "learning_rate": 8.300446461467533e-06, "loss": 0.4446168541908264, "num_tokens": 173246434.0, "step": 192 }, { "epoch": 1.4402985074626866, "grad_norm": 0.3138769818399579, "learning_rate": 8.281995582644145e-06, "loss": 0.4181920289993286, "num_tokens": 174149904.0, "step": 193 }, { "epoch": 1.4477611940298507, "grad_norm": 0.313975344503838, "learning_rate": 8.263468687604508e-06, "loss": 0.4371890425682068, "num_tokens": 174963687.0, "step": 194 }, { "epoch": 1.455223880597015, "grad_norm": 0.29628794439446526, "learning_rate": 8.244866282583957e-06, "loss": 0.43816518783569336, "num_tokens": 175988598.0, "step": 195 }, { "epoch": 1.462686567164179, "grad_norm": 0.2963583065242463, "learning_rate": 8.226188875881082e-06, "loss": 0.41185736656188965, "num_tokens": 176960311.0, "step": 196 }, { "epoch": 1.4701492537313432, "grad_norm": 0.2991189293307387, "learning_rate": 8.20743697784385e-06, "loss": 0.46473461389541626, "num_tokens": 177889691.0, "step": 197 }, { "epoch": 1.4776119402985075, "grad_norm": 0.26573849496019714, "learning_rate": 8.188611100855656e-06, "loss": 0.3865639567375183, "num_tokens": 178835508.0, "step": 198 }, { "epoch": 1.4850746268656716, "grad_norm": 0.28471866573069565, "learning_rate": 8.169711759321318e-06, "loss": 0.4254840612411499, "num_tokens": 179780829.0, "step": 199 }, { "epoch": 1.4925373134328357, "grad_norm": 0.27591064975620333, "learning_rate": 8.150739469653026e-06, "loss": 0.3821393847465515, "num_tokens": 180675259.0, "step": 200 }, { "epoch": 1.5, "grad_norm": 0.2912891463065521, "learning_rate": 8.131694750256234e-06, "loss": 0.4260258972644806, "num_tokens": 181593083.0, "step": 201 }, { "epoch": 1.5074626865671643, "grad_norm": 0.3470505245514532, "learning_rate": 8.112578121515485e-06, "loss": 0.42295166850090027, "num_tokens": 182453649.0, "step": 202 }, { "epoch": 1.5149253731343284, "grad_norm": 0.333624297966994, "learning_rate": 8.0933901057802e-06, "loss": 0.4165676534175873, "num_tokens": 183252908.0, "step": 203 }, { "epoch": 1.5223880597014925, "grad_norm": 0.2999450247966616, "learning_rate": 8.074131227350408e-06, "loss": 0.42348137497901917, "num_tokens": 184218061.0, "step": 204 }, { "epoch": 1.5298507462686568, "grad_norm": 0.33075885588759496, "learning_rate": 8.05480201246241e-06, "loss": 0.4413604140281677, "num_tokens": 185123701.0, "step": 205 }, { "epoch": 1.537313432835821, "grad_norm": 0.3236918821990334, "learning_rate": 8.035402989274402e-06, "loss": 0.4267103970050812, "num_tokens": 186020054.0, "step": 206 }, { "epoch": 1.544776119402985, "grad_norm": 0.28545115313146596, "learning_rate": 8.015934687852053e-06, "loss": 0.4010322690010071, "num_tokens": 186957926.0, "step": 207 }, { "epoch": 1.5522388059701493, "grad_norm": 0.33525388932605726, "learning_rate": 7.996397640154012e-06, "loss": 0.43479830026626587, "num_tokens": 187967937.0, "step": 208 }, { "epoch": 1.5597014925373134, "grad_norm": 0.2852110581692416, "learning_rate": 7.976792380017374e-06, "loss": 0.3835904002189636, "num_tokens": 188699883.0, "step": 209 }, { "epoch": 1.5671641791044775, "grad_norm": 0.38746256380732114, "learning_rate": 7.957119443143093e-06, "loss": 0.43473392724990845, "num_tokens": 189533459.0, "step": 210 }, { "epoch": 1.5746268656716418, "grad_norm": 0.30040372660742176, "learning_rate": 7.937379367081356e-06, "loss": 0.4094908535480499, "num_tokens": 190331401.0, "step": 211 }, { "epoch": 1.582089552238806, "grad_norm": 0.35097170028371405, "learning_rate": 7.917572691216868e-06, "loss": 0.44787487387657166, "num_tokens": 191163315.0, "step": 212 }, { "epoch": 1.5895522388059702, "grad_norm": 0.29035162522974023, "learning_rate": 7.897699956754142e-06, "loss": 0.41564756631851196, "num_tokens": 192105809.0, "step": 213 }, { "epoch": 1.5970149253731343, "grad_norm": 0.3234055460991543, "learning_rate": 7.877761706702698e-06, "loss": 0.42737478017807007, "num_tokens": 193098168.0, "step": 214 }, { "epoch": 1.6044776119402986, "grad_norm": 0.3181366599415042, "learning_rate": 7.85775848586222e-06, "loss": 0.4263436794281006, "num_tokens": 193975959.0, "step": 215 }, { "epoch": 1.6119402985074627, "grad_norm": 0.3047597849777916, "learning_rate": 7.837690840807688e-06, "loss": 0.4356343150138855, "num_tokens": 194828963.0, "step": 216 }, { "epoch": 1.6194029850746268, "grad_norm": 0.2953366209904587, "learning_rate": 7.817559319874417e-06, "loss": 0.39498403668403625, "num_tokens": 195757337.0, "step": 217 }, { "epoch": 1.626865671641791, "grad_norm": 0.2936401683412748, "learning_rate": 7.797364473143105e-06, "loss": 0.4154474139213562, "num_tokens": 196731181.0, "step": 218 }, { "epoch": 1.6343283582089554, "grad_norm": 0.2898185408597091, "learning_rate": 7.77710685242477e-06, "loss": 0.42473846673965454, "num_tokens": 197621017.0, "step": 219 }, { "epoch": 1.6417910447761193, "grad_norm": 0.29114088952907274, "learning_rate": 7.7567870112457e-06, "loss": 0.4433613419532776, "num_tokens": 198631859.0, "step": 220 }, { "epoch": 1.6492537313432836, "grad_norm": 0.31287064287880717, "learning_rate": 7.736405504832314e-06, "loss": 0.4322376549243927, "num_tokens": 199557498.0, "step": 221 }, { "epoch": 1.6567164179104479, "grad_norm": 0.3031132335175992, "learning_rate": 7.715962890095988e-06, "loss": 0.41872939467430115, "num_tokens": 200455519.0, "step": 222 }, { "epoch": 1.664179104477612, "grad_norm": 0.5127084447985639, "learning_rate": 7.695459725617851e-06, "loss": 0.4426816999912262, "num_tokens": 201364168.0, "step": 223 }, { "epoch": 1.671641791044776, "grad_norm": 0.36355358662257686, "learning_rate": 7.674896571633507e-06, "loss": 0.3920941650867462, "num_tokens": 202272665.0, "step": 224 }, { "epoch": 1.6791044776119404, "grad_norm": 0.2918543179655489, "learning_rate": 7.654273990017742e-06, "loss": 0.3865686058998108, "num_tokens": 203236852.0, "step": 225 }, { "epoch": 1.6865671641791045, "grad_norm": 0.29443958475831755, "learning_rate": 7.633592544269152e-06, "loss": 0.41160887479782104, "num_tokens": 204144409.0, "step": 226 }, { "epoch": 1.6940298507462686, "grad_norm": 0.29368087510062574, "learning_rate": 7.61285279949477e-06, "loss": 0.41996899247169495, "num_tokens": 205087641.0, "step": 227 }, { "epoch": 1.7014925373134329, "grad_norm": 0.2981876720268518, "learning_rate": 7.592055322394602e-06, "loss": 0.4322773814201355, "num_tokens": 205964269.0, "step": 228 }, { "epoch": 1.7089552238805972, "grad_norm": 0.3032205060654827, "learning_rate": 7.5712006812461595e-06, "loss": 0.4357481002807617, "num_tokens": 206853325.0, "step": 229 }, { "epoch": 1.716417910447761, "grad_norm": 0.30382769873452287, "learning_rate": 7.5502894458889154e-06, "loss": 0.42187392711639404, "num_tokens": 207780456.0, "step": 230 }, { "epoch": 1.7238805970149254, "grad_norm": 0.28458753280851, "learning_rate": 7.529322187708752e-06, "loss": 0.4417547583580017, "num_tokens": 208692271.0, "step": 231 }, { "epoch": 1.7313432835820897, "grad_norm": 0.28678480761878283, "learning_rate": 7.5082994796223355e-06, "loss": 0.4000692367553711, "num_tokens": 209542301.0, "step": 232 }, { "epoch": 1.7388059701492538, "grad_norm": 0.3105804034516556, "learning_rate": 7.487221896061458e-06, "loss": 0.43237993121147156, "num_tokens": 210462903.0, "step": 233 }, { "epoch": 1.7462686567164178, "grad_norm": 0.3069476203994755, "learning_rate": 7.466090012957361e-06, "loss": 0.4426308274269104, "num_tokens": 211451379.0, "step": 234 }, { "epoch": 1.7537313432835822, "grad_norm": 0.29187302592713965, "learning_rate": 7.444904407724973e-06, "loss": 0.4144989252090454, "num_tokens": 212341336.0, "step": 235 }, { "epoch": 1.7611940298507462, "grad_norm": 0.2715020106858522, "learning_rate": 7.423665659247154e-06, "loss": 0.4140280485153198, "num_tokens": 213184565.0, "step": 236 }, { "epoch": 1.7686567164179103, "grad_norm": 0.3042751492929567, "learning_rate": 7.402374347858862e-06, "loss": 0.4220738708972931, "num_tokens": 214162910.0, "step": 237 }, { "epoch": 1.7761194029850746, "grad_norm": 0.283596579410495, "learning_rate": 7.381031055331306e-06, "loss": 0.43350133299827576, "num_tokens": 215182240.0, "step": 238 }, { "epoch": 1.783582089552239, "grad_norm": 0.29114085647177373, "learning_rate": 7.3596363648560445e-06, "loss": 0.4327085316181183, "num_tokens": 216074554.0, "step": 239 }, { "epoch": 1.7910447761194028, "grad_norm": 0.28379283338161987, "learning_rate": 7.338190861029052e-06, "loss": 0.4293884038925171, "num_tokens": 216989156.0, "step": 240 }, { "epoch": 1.7985074626865671, "grad_norm": 0.31407525298001004, "learning_rate": 7.316695129834744e-06, "loss": 0.4033690392971039, "num_tokens": 217859754.0, "step": 241 }, { "epoch": 1.8059701492537314, "grad_norm": 0.3013707320804031, "learning_rate": 7.2951497586299665e-06, "loss": 0.415780246257782, "num_tokens": 218674048.0, "step": 242 }, { "epoch": 1.8134328358208955, "grad_norm": 0.3130414485143585, "learning_rate": 7.273555336127948e-06, "loss": 0.4289485216140747, "num_tokens": 219544627.0, "step": 243 }, { "epoch": 1.8208955223880596, "grad_norm": 0.271886252549519, "learning_rate": 7.251912452382206e-06, "loss": 0.4117184579372406, "num_tokens": 220510777.0, "step": 244 }, { "epoch": 1.828358208955224, "grad_norm": 0.3095984364408915, "learning_rate": 7.2302216987704395e-06, "loss": 0.40528762340545654, "num_tokens": 221358648.0, "step": 245 }, { "epoch": 1.835820895522388, "grad_norm": 0.28537942146166506, "learning_rate": 7.208483667978351e-06, "loss": 0.37842410802841187, "num_tokens": 222227328.0, "step": 246 }, { "epoch": 1.8432835820895521, "grad_norm": 0.3285002711937223, "learning_rate": 7.186698953983466e-06, "loss": 0.4463423192501068, "num_tokens": 223216379.0, "step": 247 }, { "epoch": 1.8507462686567164, "grad_norm": 0.29900827070350944, "learning_rate": 7.164868152038899e-06, "loss": 0.42675986886024475, "num_tokens": 224109870.0, "step": 248 }, { "epoch": 1.8582089552238807, "grad_norm": 0.27490080435841, "learning_rate": 7.1429918586570815e-06, "loss": 0.4331856667995453, "num_tokens": 225101205.0, "step": 249 }, { "epoch": 1.8656716417910446, "grad_norm": 0.2935787072389711, "learning_rate": 7.121070671593477e-06, "loss": 0.4262286424636841, "num_tokens": 226119986.0, "step": 250 }, { "epoch": 1.873134328358209, "grad_norm": 0.3045861994484339, "learning_rate": 7.099105189830235e-06, "loss": 0.4218306541442871, "num_tokens": 226995732.0, "step": 251 }, { "epoch": 1.8805970149253732, "grad_norm": 0.27595409032706397, "learning_rate": 7.077096013559831e-06, "loss": 0.4189199209213257, "num_tokens": 227872634.0, "step": 252 }, { "epoch": 1.8880597014925373, "grad_norm": 0.289326233334052, "learning_rate": 7.055043744168658e-06, "loss": 0.44568511843681335, "num_tokens": 228843256.0, "step": 253 }, { "epoch": 1.8955223880597014, "grad_norm": 0.3108178596802667, "learning_rate": 7.032948984220611e-06, "loss": 0.39977630972862244, "num_tokens": 229749232.0, "step": 254 }, { "epoch": 1.9029850746268657, "grad_norm": 0.3029945133044889, "learning_rate": 7.0108123374406046e-06, "loss": 0.41192835569381714, "num_tokens": 230524739.0, "step": 255 }, { "epoch": 1.9104477611940298, "grad_norm": 0.25289759257512634, "learning_rate": 6.988634408698083e-06, "loss": 0.38565781712532043, "num_tokens": 231455850.0, "step": 256 }, { "epoch": 1.917910447761194, "grad_norm": 0.298108417839461, "learning_rate": 6.966415803990501e-06, "loss": 0.4397220015525818, "num_tokens": 232349234.0, "step": 257 }, { "epoch": 1.9253731343283582, "grad_norm": 0.30576254773905986, "learning_rate": 6.944157130426745e-06, "loss": 0.43654486536979675, "num_tokens": 233187315.0, "step": 258 }, { "epoch": 1.9328358208955225, "grad_norm": 0.28668295683966216, "learning_rate": 6.9218589962105695e-06, "loss": 0.40597644448280334, "num_tokens": 234091956.0, "step": 259 }, { "epoch": 1.9402985074626866, "grad_norm": 0.2807573548073224, "learning_rate": 6.899522010623959e-06, "loss": 0.42698317766189575, "num_tokens": 235133005.0, "step": 260 }, { "epoch": 1.9477611940298507, "grad_norm": 0.2676937710994811, "learning_rate": 6.877146784010486e-06, "loss": 0.4118936061859131, "num_tokens": 235967243.0, "step": 261 }, { "epoch": 1.955223880597015, "grad_norm": 0.29199333652094117, "learning_rate": 6.854733927758636e-06, "loss": 0.42816537618637085, "num_tokens": 236876001.0, "step": 262 }, { "epoch": 1.962686567164179, "grad_norm": 0.3572922506463511, "learning_rate": 6.832284054285101e-06, "loss": 0.43847325444221497, "num_tokens": 237876952.0, "step": 263 }, { "epoch": 1.9701492537313432, "grad_norm": 0.2960985809182997, "learning_rate": 6.809797777018041e-06, "loss": 0.43155139684677124, "num_tokens": 238704164.0, "step": 264 }, { "epoch": 1.9776119402985075, "grad_norm": 0.3169980642916318, "learning_rate": 6.78727571038033e-06, "loss": 0.4308193027973175, "num_tokens": 239595870.0, "step": 265 }, { "epoch": 1.9850746268656716, "grad_norm": 0.3191747061655072, "learning_rate": 6.764718469772759e-06, "loss": 0.4188956022262573, "num_tokens": 240337386.0, "step": 266 }, { "epoch": 1.9925373134328357, "grad_norm": 0.28286588606011187, "learning_rate": 6.7421266715572275e-06, "loss": 0.40036123991012573, "num_tokens": 241215348.0, "step": 267 }, { "epoch": 2.0, "grad_norm": 0.2981753233991589, "learning_rate": 6.719500933039898e-06, "loss": 0.41549932956695557, "num_tokens": 242121111.0, "step": 268 }, { "epoch": 2.0074626865671643, "grad_norm": 0.33640737374184443, "learning_rate": 6.696841872454332e-06, "loss": 0.4132290482521057, "num_tokens": 243025320.0, "step": 269 }, { "epoch": 2.014925373134328, "grad_norm": 0.2822051764181089, "learning_rate": 6.674150108944593e-06, "loss": 0.37781068682670593, "num_tokens": 243793916.0, "step": 270 }, { "epoch": 2.0223880597014925, "grad_norm": 0.38987929902231017, "learning_rate": 6.651426262548326e-06, "loss": 0.40918004512786865, "num_tokens": 244799351.0, "step": 271 }, { "epoch": 2.029850746268657, "grad_norm": 0.348061447310908, "learning_rate": 6.62867095417983e-06, "loss": 0.3939589858055115, "num_tokens": 245795313.0, "step": 272 }, { "epoch": 2.0373134328358207, "grad_norm": 0.3046732710135438, "learning_rate": 6.605884805613073e-06, "loss": 0.36584192514419556, "num_tokens": 246732184.0, "step": 273 }, { "epoch": 2.044776119402985, "grad_norm": 0.3664198494618375, "learning_rate": 6.583068439464716e-06, "loss": 0.4081302881240845, "num_tokens": 247606091.0, "step": 274 }, { "epoch": 2.0522388059701493, "grad_norm": 0.3112614984470978, "learning_rate": 6.560222479177095e-06, "loss": 0.3947848081588745, "num_tokens": 248474307.0, "step": 275 }, { "epoch": 2.0597014925373136, "grad_norm": 0.3268123714386943, "learning_rate": 6.537347549001184e-06, "loss": 0.39627498388290405, "num_tokens": 249293743.0, "step": 276 }, { "epoch": 2.0671641791044775, "grad_norm": 0.30038025917744793, "learning_rate": 6.514444273979544e-06, "loss": 0.3961779773235321, "num_tokens": 250164041.0, "step": 277 }, { "epoch": 2.074626865671642, "grad_norm": 0.30941665860783496, "learning_rate": 6.491513279929238e-06, "loss": 0.3704898953437805, "num_tokens": 251063865.0, "step": 278 }, { "epoch": 2.082089552238806, "grad_norm": 0.2822311579038674, "learning_rate": 6.468555193424736e-06, "loss": 0.3888505697250366, "num_tokens": 251954121.0, "step": 279 }, { "epoch": 2.08955223880597, "grad_norm": 0.2838966427637005, "learning_rate": 6.445570641780786e-06, "loss": 0.3732953667640686, "num_tokens": 252767775.0, "step": 280 }, { "epoch": 2.0970149253731343, "grad_norm": 0.30198287700287857, "learning_rate": 6.422560253035287e-06, "loss": 0.3989664614200592, "num_tokens": 253671573.0, "step": 281 }, { "epoch": 2.1044776119402986, "grad_norm": 0.3143195160978541, "learning_rate": 6.399524655932111e-06, "loss": 0.4071004390716553, "num_tokens": 254540226.0, "step": 282 }, { "epoch": 2.111940298507463, "grad_norm": 0.29633039155095714, "learning_rate": 6.376464479903938e-06, "loss": 0.3590371012687683, "num_tokens": 255292355.0, "step": 283 }, { "epoch": 2.1194029850746268, "grad_norm": 0.2746728490799242, "learning_rate": 6.353380355055051e-06, "loss": 0.38884416222572327, "num_tokens": 256176530.0, "step": 284 }, { "epoch": 2.126865671641791, "grad_norm": 0.2951568696719758, "learning_rate": 6.330272912144116e-06, "loss": 0.42871013283729553, "num_tokens": 257090645.0, "step": 285 }, { "epoch": 2.1343283582089554, "grad_norm": 0.2902093873074645, "learning_rate": 6.307142782566952e-06, "loss": 0.3986203670501709, "num_tokens": 258131119.0, "step": 286 }, { "epoch": 2.1417910447761193, "grad_norm": 0.3900114303550773, "learning_rate": 6.283990598339274e-06, "loss": 0.390123188495636, "num_tokens": 258880552.0, "step": 287 }, { "epoch": 2.1492537313432836, "grad_norm": 0.2806374479908933, "learning_rate": 6.2608169920794314e-06, "loss": 0.36130136251449585, "num_tokens": 259758999.0, "step": 288 }, { "epoch": 2.156716417910448, "grad_norm": 0.2942927245657638, "learning_rate": 6.237622596991106e-06, "loss": 0.40030941367149353, "num_tokens": 260602559.0, "step": 289 }, { "epoch": 2.1641791044776117, "grad_norm": 0.3214957885578966, "learning_rate": 6.214408046846034e-06, "loss": 0.39499810338020325, "num_tokens": 261439646.0, "step": 290 }, { "epoch": 2.171641791044776, "grad_norm": 0.27240683635483437, "learning_rate": 6.191173975966669e-06, "loss": 0.3880019783973694, "num_tokens": 262474020.0, "step": 291 }, { "epoch": 2.1791044776119404, "grad_norm": 0.34023027676143563, "learning_rate": 6.167921019208851e-06, "loss": 0.42268985509872437, "num_tokens": 263528820.0, "step": 292 }, { "epoch": 2.1865671641791047, "grad_norm": 0.287848829860692, "learning_rate": 6.144649811944474e-06, "loss": 0.3913387656211853, "num_tokens": 264372315.0, "step": 293 }, { "epoch": 2.1940298507462686, "grad_norm": 0.29220713499868917, "learning_rate": 6.121360990044107e-06, "loss": 0.40157270431518555, "num_tokens": 265188957.0, "step": 294 }, { "epoch": 2.201492537313433, "grad_norm": 0.286455151799939, "learning_rate": 6.098055189859634e-06, "loss": 0.3945062756538391, "num_tokens": 266184697.0, "step": 295 }, { "epoch": 2.208955223880597, "grad_norm": 0.289286738435993, "learning_rate": 6.074733048206852e-06, "loss": 0.3945891559123993, "num_tokens": 267190971.0, "step": 296 }, { "epoch": 2.216417910447761, "grad_norm": 0.27448176767847715, "learning_rate": 6.051395202348089e-06, "loss": 0.3953642249107361, "num_tokens": 268121281.0, "step": 297 }, { "epoch": 2.2238805970149254, "grad_norm": 0.297149102735408, "learning_rate": 6.028042289974768e-06, "loss": 0.3815913796424866, "num_tokens": 269026334.0, "step": 298 }, { "epoch": 2.2313432835820897, "grad_norm": 0.29135459719595014, "learning_rate": 6.004674949190004e-06, "loss": 0.3744094967842102, "num_tokens": 269848673.0, "step": 299 }, { "epoch": 2.2388059701492535, "grad_norm": 0.3163386130777747, "learning_rate": 5.981293818491153e-06, "loss": 0.411973237991333, "num_tokens": 270729219.0, "step": 300 }, { "epoch": 2.246268656716418, "grad_norm": 0.2996160649578529, "learning_rate": 5.957899536752373e-06, "loss": 0.4180707335472107, "num_tokens": 271647605.0, "step": 301 }, { "epoch": 2.253731343283582, "grad_norm": 0.2744717376139136, "learning_rate": 5.934492743207168e-06, "loss": 0.36764925718307495, "num_tokens": 272444857.0, "step": 302 }, { "epoch": 2.2611940298507465, "grad_norm": 0.3051287913390687, "learning_rate": 5.911074077430917e-06, "loss": 0.3950934410095215, "num_tokens": 273313831.0, "step": 303 }, { "epoch": 2.2686567164179103, "grad_norm": 0.2740805047822694, "learning_rate": 5.887644179323403e-06, "loss": 0.38602137565612793, "num_tokens": 274151817.0, "step": 304 }, { "epoch": 2.2761194029850746, "grad_norm": 0.2811027592780593, "learning_rate": 5.864203689091316e-06, "loss": 0.40490180253982544, "num_tokens": 275023603.0, "step": 305 }, { "epoch": 2.283582089552239, "grad_norm": 0.37103511230501807, "learning_rate": 5.840753247230781e-06, "loss": 0.39756178855895996, "num_tokens": 275922951.0, "step": 306 }, { "epoch": 2.291044776119403, "grad_norm": 0.260165834106451, "learning_rate": 5.817293494509836e-06, "loss": 0.3657914996147156, "num_tokens": 276733073.0, "step": 307 }, { "epoch": 2.298507462686567, "grad_norm": 0.2676322746611296, "learning_rate": 5.793825071950936e-06, "loss": 0.3826783299446106, "num_tokens": 277699551.0, "step": 308 }, { "epoch": 2.3059701492537314, "grad_norm": 0.3171630796152734, "learning_rate": 5.770348620813433e-06, "loss": 0.38245660066604614, "num_tokens": 278695133.0, "step": 309 }, { "epoch": 2.3134328358208958, "grad_norm": 0.2749216503608562, "learning_rate": 5.746864782576054e-06, "loss": 0.38771188259124756, "num_tokens": 279483451.0, "step": 310 }, { "epoch": 2.3208955223880596, "grad_norm": 0.34619757766961257, "learning_rate": 5.723374198919376e-06, "loss": 0.40358829498291016, "num_tokens": 280316518.0, "step": 311 }, { "epoch": 2.328358208955224, "grad_norm": 0.2628421365077709, "learning_rate": 5.699877511708285e-06, "loss": 0.37161552906036377, "num_tokens": 281300113.0, "step": 312 }, { "epoch": 2.3358208955223883, "grad_norm": 0.2865924626367908, "learning_rate": 5.67637536297445e-06, "loss": 0.3707822561264038, "num_tokens": 282213553.0, "step": 313 }, { "epoch": 2.343283582089552, "grad_norm": 0.2782360921000711, "learning_rate": 5.652868394898766e-06, "loss": 0.38021302223205566, "num_tokens": 283069634.0, "step": 314 }, { "epoch": 2.3507462686567164, "grad_norm": 0.274968159536365, "learning_rate": 5.6293572497938165e-06, "loss": 0.4070481061935425, "num_tokens": 284055909.0, "step": 315 }, { "epoch": 2.3582089552238807, "grad_norm": 0.25137582516547385, "learning_rate": 5.605842570086321e-06, "loss": 0.38819169998168945, "num_tokens": 285072190.0, "step": 316 }, { "epoch": 2.3656716417910446, "grad_norm": 0.27416935469654424, "learning_rate": 5.582324998299573e-06, "loss": 0.3976019620895386, "num_tokens": 285997942.0, "step": 317 }, { "epoch": 2.373134328358209, "grad_norm": 0.28976153755834105, "learning_rate": 5.558805177035902e-06, "loss": 0.39910900592803955, "num_tokens": 286957228.0, "step": 318 }, { "epoch": 2.3805970149253732, "grad_norm": 0.3526174425898886, "learning_rate": 5.53528374895909e-06, "loss": 0.37735995650291443, "num_tokens": 287834123.0, "step": 319 }, { "epoch": 2.388059701492537, "grad_norm": 0.2753135236966283, "learning_rate": 5.511761356776834e-06, "loss": 0.3974205553531647, "num_tokens": 288755581.0, "step": 320 }, { "epoch": 2.3955223880597014, "grad_norm": 0.2836500955971764, "learning_rate": 5.488238643223167e-06, "loss": 0.4040617346763611, "num_tokens": 289616887.0, "step": 321 }, { "epoch": 2.4029850746268657, "grad_norm": 0.3001483066578534, "learning_rate": 5.464716251040911e-06, "loss": 0.39118584990501404, "num_tokens": 290466034.0, "step": 322 }, { "epoch": 2.41044776119403, "grad_norm": 0.29609458212755346, "learning_rate": 5.4411948229641e-06, "loss": 0.4012300372123718, "num_tokens": 291327531.0, "step": 323 }, { "epoch": 2.417910447761194, "grad_norm": 0.282307409973888, "learning_rate": 5.417675001700428e-06, "loss": 0.39297211170196533, "num_tokens": 292249211.0, "step": 324 }, { "epoch": 2.425373134328358, "grad_norm": 0.31947796875203593, "learning_rate": 5.394157429913681e-06, "loss": 0.43389707803726196, "num_tokens": 293154262.0, "step": 325 }, { "epoch": 2.4328358208955225, "grad_norm": 0.2806921837500959, "learning_rate": 5.370642750206184e-06, "loss": 0.4193563461303711, "num_tokens": 294190925.0, "step": 326 }, { "epoch": 2.4402985074626864, "grad_norm": 0.28217215862589007, "learning_rate": 5.347131605101237e-06, "loss": 0.42073380947113037, "num_tokens": 295155201.0, "step": 327 }, { "epoch": 2.4477611940298507, "grad_norm": 0.2595127351145338, "learning_rate": 5.323624637025552e-06, "loss": 0.38413190841674805, "num_tokens": 296039941.0, "step": 328 }, { "epoch": 2.455223880597015, "grad_norm": 0.27537880701127315, "learning_rate": 5.300122488291717e-06, "loss": 0.3896210193634033, "num_tokens": 296897125.0, "step": 329 }, { "epoch": 2.4626865671641793, "grad_norm": 0.2806456708250513, "learning_rate": 5.276625801080626e-06, "loss": 0.40547412633895874, "num_tokens": 297829206.0, "step": 330 }, { "epoch": 2.470149253731343, "grad_norm": 0.3233513262930407, "learning_rate": 5.253135217423948e-06, "loss": 0.3998452425003052, "num_tokens": 298813976.0, "step": 331 }, { "epoch": 2.4776119402985075, "grad_norm": 0.2870679405386201, "learning_rate": 5.229651379186569e-06, "loss": 0.41445013880729675, "num_tokens": 299755392.0, "step": 332 }, { "epoch": 2.485074626865672, "grad_norm": 0.2623639243435129, "learning_rate": 5.206174928049066e-06, "loss": 0.3996489644050598, "num_tokens": 300745461.0, "step": 333 }, { "epoch": 2.4925373134328357, "grad_norm": 0.2657883700801823, "learning_rate": 5.182706505490166e-06, "loss": 0.3919597864151001, "num_tokens": 301635785.0, "step": 334 }, { "epoch": 2.5, "grad_norm": 0.2730887704012263, "learning_rate": 5.15924675276922e-06, "loss": 0.37381941080093384, "num_tokens": 302529314.0, "step": 335 }, { "epoch": 2.5074626865671643, "grad_norm": 0.27926647905507407, "learning_rate": 5.135796310908685e-06, "loss": 0.4020169675350189, "num_tokens": 303325140.0, "step": 336 }, { "epoch": 2.5149253731343286, "grad_norm": 0.2573449307599577, "learning_rate": 5.1123558206766e-06, "loss": 0.3959069848060608, "num_tokens": 304291697.0, "step": 337 }, { "epoch": 2.5223880597014925, "grad_norm": 0.2713627052957801, "learning_rate": 5.088925922569084e-06, "loss": 0.4036637246608734, "num_tokens": 305167326.0, "step": 338 }, { "epoch": 2.529850746268657, "grad_norm": 0.29137688284390684, "learning_rate": 5.065507256792833e-06, "loss": 0.40749210119247437, "num_tokens": 306083413.0, "step": 339 }, { "epoch": 2.5373134328358207, "grad_norm": 0.27645786153124524, "learning_rate": 5.04210046324763e-06, "loss": 0.3960036039352417, "num_tokens": 306930925.0, "step": 340 }, { "epoch": 2.544776119402985, "grad_norm": 0.2959257579408876, "learning_rate": 5.018706181508851e-06, "loss": 0.40943804383277893, "num_tokens": 307667223.0, "step": 341 }, { "epoch": 2.5522388059701493, "grad_norm": 0.2941768147406628, "learning_rate": 4.995325050809999e-06, "loss": 0.42352843284606934, "num_tokens": 308548843.0, "step": 342 }, { "epoch": 2.5597014925373136, "grad_norm": 0.3093404075043933, "learning_rate": 4.971957710025235e-06, "loss": 0.4167254567146301, "num_tokens": 309456869.0, "step": 343 }, { "epoch": 2.5671641791044775, "grad_norm": 0.285830294036988, "learning_rate": 4.948604797651914e-06, "loss": 0.41970574855804443, "num_tokens": 310374426.0, "step": 344 }, { "epoch": 2.574626865671642, "grad_norm": 0.2822303940696211, "learning_rate": 4.925266951793149e-06, "loss": 0.39743444323539734, "num_tokens": 311185331.0, "step": 345 }, { "epoch": 2.582089552238806, "grad_norm": 0.2722209732746419, "learning_rate": 4.90194481014037e-06, "loss": 0.4093334674835205, "num_tokens": 312287344.0, "step": 346 }, { "epoch": 2.58955223880597, "grad_norm": 0.3685744506907742, "learning_rate": 4.878639009955896e-06, "loss": 0.3837957978248596, "num_tokens": 313203808.0, "step": 347 }, { "epoch": 2.5970149253731343, "grad_norm": 0.26210814461472964, "learning_rate": 4.855350188055528e-06, "loss": 0.374228835105896, "num_tokens": 314127724.0, "step": 348 }, { "epoch": 2.6044776119402986, "grad_norm": 0.26577422679986124, "learning_rate": 4.83207898079115e-06, "loss": 0.3950842022895813, "num_tokens": 315094649.0, "step": 349 }, { "epoch": 2.611940298507463, "grad_norm": 0.2694330124125045, "learning_rate": 4.808826024033334e-06, "loss": 0.3894980251789093, "num_tokens": 315902867.0, "step": 350 }, { "epoch": 2.6194029850746268, "grad_norm": 0.30012143917049156, "learning_rate": 4.785591953153966e-06, "loss": 0.3923467695713043, "num_tokens": 316809248.0, "step": 351 }, { "epoch": 2.626865671641791, "grad_norm": 0.27202743774586025, "learning_rate": 4.762377403008895e-06, "loss": 0.40671366453170776, "num_tokens": 317806785.0, "step": 352 }, { "epoch": 2.6343283582089554, "grad_norm": 0.2663498979647159, "learning_rate": 4.739183007920572e-06, "loss": 0.40148887038230896, "num_tokens": 318773135.0, "step": 353 }, { "epoch": 2.6417910447761193, "grad_norm": 0.26964724456667694, "learning_rate": 4.716009401660728e-06, "loss": 0.36810237169265747, "num_tokens": 319712540.0, "step": 354 }, { "epoch": 2.6492537313432836, "grad_norm": 0.2745583940218022, "learning_rate": 4.69285721743305e-06, "loss": 0.3969258666038513, "num_tokens": 320623524.0, "step": 355 }, { "epoch": 2.656716417910448, "grad_norm": 0.2691069702675602, "learning_rate": 4.669727087855886e-06, "loss": 0.39531204104423523, "num_tokens": 321558026.0, "step": 356 }, { "epoch": 2.664179104477612, "grad_norm": 0.2790488198361277, "learning_rate": 4.646619644944951e-06, "loss": 0.3691323399543762, "num_tokens": 322457137.0, "step": 357 }, { "epoch": 2.671641791044776, "grad_norm": 0.25676092193729705, "learning_rate": 4.623535520096063e-06, "loss": 0.3830498456954956, "num_tokens": 323406835.0, "step": 358 }, { "epoch": 2.6791044776119404, "grad_norm": 0.27765790893840286, "learning_rate": 4.6004753440678894e-06, "loss": 0.38582926988601685, "num_tokens": 324270762.0, "step": 359 }, { "epoch": 2.6865671641791042, "grad_norm": 0.2578194748970744, "learning_rate": 4.577439746964715e-06, "loss": 0.39646175503730774, "num_tokens": 325172716.0, "step": 360 }, { "epoch": 2.6940298507462686, "grad_norm": 0.26611474982215905, "learning_rate": 4.554429358219214e-06, "loss": 0.38044852018356323, "num_tokens": 326161663.0, "step": 361 }, { "epoch": 2.701492537313433, "grad_norm": 0.2670566328628317, "learning_rate": 4.531444806575266e-06, "loss": 0.40564393997192383, "num_tokens": 327106201.0, "step": 362 }, { "epoch": 2.708955223880597, "grad_norm": 0.274772662861299, "learning_rate": 4.508486720070761e-06, "loss": 0.39564812183380127, "num_tokens": 328050673.0, "step": 363 }, { "epoch": 2.716417910447761, "grad_norm": 0.3094439511198801, "learning_rate": 4.485555726020455e-06, "loss": 0.3800423741340637, "num_tokens": 328859100.0, "step": 364 }, { "epoch": 2.7238805970149254, "grad_norm": 0.2875993414193674, "learning_rate": 4.462652450998816e-06, "loss": 0.4001840353012085, "num_tokens": 329666962.0, "step": 365 }, { "epoch": 2.7313432835820897, "grad_norm": 0.27308262203119327, "learning_rate": 4.439777520822905e-06, "loss": 0.39083579182624817, "num_tokens": 330477732.0, "step": 366 }, { "epoch": 2.7388059701492535, "grad_norm": 0.2708315720399402, "learning_rate": 4.416931560535284e-06, "loss": 0.39352381229400635, "num_tokens": 331330359.0, "step": 367 }, { "epoch": 2.746268656716418, "grad_norm": 0.2678850422820554, "learning_rate": 4.394115194386928e-06, "loss": 0.38045477867126465, "num_tokens": 332347647.0, "step": 368 }, { "epoch": 2.753731343283582, "grad_norm": 0.2753212357157175, "learning_rate": 4.371329045820172e-06, "loss": 0.3969570994377136, "num_tokens": 333284873.0, "step": 369 }, { "epoch": 2.7611940298507465, "grad_norm": 0.28683339254512785, "learning_rate": 4.3485737374516745e-06, "loss": 0.4235033392906189, "num_tokens": 334098107.0, "step": 370 }, { "epoch": 2.7686567164179103, "grad_norm": 0.2698726522878529, "learning_rate": 4.3258498910554095e-06, "loss": 0.38629546761512756, "num_tokens": 334979408.0, "step": 371 }, { "epoch": 2.7761194029850746, "grad_norm": 0.2615554761622241, "learning_rate": 4.303158127545669e-06, "loss": 0.3924221694469452, "num_tokens": 335891381.0, "step": 372 }, { "epoch": 2.783582089552239, "grad_norm": 0.26064429917011145, "learning_rate": 4.280499066960102e-06, "loss": 0.3906182050704956, "num_tokens": 336949128.0, "step": 373 }, { "epoch": 2.791044776119403, "grad_norm": 0.27127505364411514, "learning_rate": 4.257873328442774e-06, "loss": 0.3783274292945862, "num_tokens": 337776659.0, "step": 374 }, { "epoch": 2.798507462686567, "grad_norm": 0.27410164043945023, "learning_rate": 4.2352815302272425e-06, "loss": 0.3829938471317291, "num_tokens": 338685204.0, "step": 375 }, { "epoch": 2.8059701492537314, "grad_norm": 0.2706332327188829, "learning_rate": 4.212724289619672e-06, "loss": 0.37140512466430664, "num_tokens": 339492119.0, "step": 376 }, { "epoch": 2.8134328358208958, "grad_norm": 0.29552966231342986, "learning_rate": 4.190202222981959e-06, "loss": 0.41518405079841614, "num_tokens": 340414044.0, "step": 377 }, { "epoch": 2.8208955223880596, "grad_norm": 0.4384124363415056, "learning_rate": 4.1677159457149005e-06, "loss": 0.3670823574066162, "num_tokens": 341275739.0, "step": 378 }, { "epoch": 2.828358208955224, "grad_norm": 0.2818008385366561, "learning_rate": 4.145266072241365e-06, "loss": 0.38579511642456055, "num_tokens": 342203284.0, "step": 379 }, { "epoch": 2.835820895522388, "grad_norm": 0.26814078006971265, "learning_rate": 4.122853215989515e-06, "loss": 0.4062846899032593, "num_tokens": 343206534.0, "step": 380 }, { "epoch": 2.843283582089552, "grad_norm": 0.27452179515826414, "learning_rate": 4.1004779893760424e-06, "loss": 0.397432416677475, "num_tokens": 344154341.0, "step": 381 }, { "epoch": 2.8507462686567164, "grad_norm": 0.27288188181425943, "learning_rate": 4.078141003789431e-06, "loss": 0.391731858253479, "num_tokens": 345024971.0, "step": 382 }, { "epoch": 2.8582089552238807, "grad_norm": 0.2967872715212152, "learning_rate": 4.055842869573256e-06, "loss": 0.400160551071167, "num_tokens": 345812228.0, "step": 383 }, { "epoch": 2.8656716417910446, "grad_norm": 0.27985989099065167, "learning_rate": 4.0335841960095025e-06, "loss": 0.3944920599460602, "num_tokens": 346769134.0, "step": 384 }, { "epoch": 2.873134328358209, "grad_norm": 0.2548795141867926, "learning_rate": 4.011365591301918e-06, "loss": 0.404415488243103, "num_tokens": 347740543.0, "step": 385 }, { "epoch": 2.8805970149253732, "grad_norm": 0.2353554630176529, "learning_rate": 3.989187662559397e-06, "loss": 0.3925011157989502, "num_tokens": 348799551.0, "step": 386 }, { "epoch": 2.888059701492537, "grad_norm": 0.4371240438139863, "learning_rate": 3.967051015779389e-06, "loss": 0.394489049911499, "num_tokens": 349833256.0, "step": 387 }, { "epoch": 2.8955223880597014, "grad_norm": 0.492017294414543, "learning_rate": 3.944956255831342e-06, "loss": 0.3901214003562927, "num_tokens": 350675901.0, "step": 388 }, { "epoch": 2.9029850746268657, "grad_norm": 0.28604462735158265, "learning_rate": 3.922903986440171e-06, "loss": 0.3956416845321655, "num_tokens": 351593161.0, "step": 389 }, { "epoch": 2.91044776119403, "grad_norm": 0.3019009320890686, "learning_rate": 3.900894810169766e-06, "loss": 0.4037666618824005, "num_tokens": 352556035.0, "step": 390 }, { "epoch": 2.917910447761194, "grad_norm": 0.2929989612906795, "learning_rate": 3.878929328406524e-06, "loss": 0.38326603174209595, "num_tokens": 353175046.0, "step": 391 }, { "epoch": 2.925373134328358, "grad_norm": 0.2811533155158446, "learning_rate": 3.857008141342921e-06, "loss": 0.3970789909362793, "num_tokens": 354040412.0, "step": 392 }, { "epoch": 2.9328358208955225, "grad_norm": 0.2642763742866724, "learning_rate": 3.8351318479611045e-06, "loss": 0.40754109621047974, "num_tokens": 354957977.0, "step": 393 }, { "epoch": 2.9402985074626864, "grad_norm": 0.2553969638436942, "learning_rate": 3.8133010460165364e-06, "loss": 0.3917849361896515, "num_tokens": 355897000.0, "step": 394 }, { "epoch": 2.9477611940298507, "grad_norm": 0.3227768986284808, "learning_rate": 3.791516332021651e-06, "loss": 0.38059675693511963, "num_tokens": 356775946.0, "step": 395 }, { "epoch": 2.955223880597015, "grad_norm": 0.26373506539724473, "learning_rate": 3.769778301229562e-06, "loss": 0.392505407333374, "num_tokens": 357732570.0, "step": 396 }, { "epoch": 2.9626865671641793, "grad_norm": 0.27141559638214446, "learning_rate": 3.748087547617795e-06, "loss": 0.38036075234413147, "num_tokens": 358510667.0, "step": 397 }, { "epoch": 2.970149253731343, "grad_norm": 0.24786828522735252, "learning_rate": 3.7264446638720542e-06, "loss": 0.37426790595054626, "num_tokens": 359444745.0, "step": 398 }, { "epoch": 2.9776119402985075, "grad_norm": 0.25219066519802286, "learning_rate": 3.704850241370035e-06, "loss": 0.3932304382324219, "num_tokens": 360351403.0, "step": 399 }, { "epoch": 2.9850746268656714, "grad_norm": 0.2314040595153558, "learning_rate": 3.6833048701652574e-06, "loss": 0.3921104669570923, "num_tokens": 361414260.0, "step": 400 }, { "epoch": 2.9925373134328357, "grad_norm": 0.24531758323496658, "learning_rate": 3.661809138970951e-06, "loss": 0.39479339122772217, "num_tokens": 362313539.0, "step": 401 }, { "epoch": 3.0, "grad_norm": 0.269225436814872, "learning_rate": 3.6403636351439577e-06, "loss": 0.39549848437309265, "num_tokens": 363114852.0, "step": 402 }, { "epoch": 3.0074626865671643, "grad_norm": 0.28662511975668975, "learning_rate": 3.618968944668696e-06, "loss": 0.35942816734313965, "num_tokens": 363883703.0, "step": 403 }, { "epoch": 3.014925373134328, "grad_norm": 0.2897343949926782, "learning_rate": 3.5976256521411402e-06, "loss": 0.37709563970565796, "num_tokens": 364726957.0, "step": 404 }, { "epoch": 3.0223880597014925, "grad_norm": 0.25819303755354905, "learning_rate": 3.576334340752847e-06, "loss": 0.3720802664756775, "num_tokens": 365712205.0, "step": 405 }, { "epoch": 3.029850746268657, "grad_norm": 0.28009429409591957, "learning_rate": 3.5550955922750275e-06, "loss": 0.3992989659309387, "num_tokens": 366502371.0, "step": 406 }, { "epoch": 3.0373134328358207, "grad_norm": 0.2764674226920931, "learning_rate": 3.533909987042642e-06, "loss": 0.39246252179145813, "num_tokens": 367405016.0, "step": 407 }, { "epoch": 3.044776119402985, "grad_norm": 0.30985373317019865, "learning_rate": 3.512778103938542e-06, "loss": 0.4023834466934204, "num_tokens": 368186973.0, "step": 408 }, { "epoch": 3.0522388059701493, "grad_norm": 0.28547534212425507, "learning_rate": 3.491700520377667e-06, "loss": 0.38294538855552673, "num_tokens": 369054384.0, "step": 409 }, { "epoch": 3.0597014925373136, "grad_norm": 0.2749822220227637, "learning_rate": 3.470677812291248e-06, "loss": 0.3690488636493683, "num_tokens": 370021137.0, "step": 410 }, { "epoch": 3.0671641791044775, "grad_norm": 0.2617585883370724, "learning_rate": 3.4497105541110847e-06, "loss": 0.39320921897888184, "num_tokens": 370954131.0, "step": 411 }, { "epoch": 3.074626865671642, "grad_norm": 0.276121676089303, "learning_rate": 3.4287993187538445e-06, "loss": 0.3605678975582123, "num_tokens": 371779138.0, "step": 412 }, { "epoch": 3.082089552238806, "grad_norm": 0.3190227559580631, "learning_rate": 3.407944677605399e-06, "loss": 0.408037006855011, "num_tokens": 372652437.0, "step": 413 }, { "epoch": 3.08955223880597, "grad_norm": 0.3764832484269211, "learning_rate": 3.387147200505232e-06, "loss": 0.38565126061439514, "num_tokens": 373477902.0, "step": 414 }, { "epoch": 3.0970149253731343, "grad_norm": 0.28107007973769577, "learning_rate": 3.366407455730849e-06, "loss": 0.414955735206604, "num_tokens": 374298186.0, "step": 415 }, { "epoch": 3.1044776119402986, "grad_norm": 0.2538068604333711, "learning_rate": 3.345726009982262e-06, "loss": 0.3739873766899109, "num_tokens": 375232722.0, "step": 416 }, { "epoch": 3.111940298507463, "grad_norm": 0.25345140165817104, "learning_rate": 3.3251034283664945e-06, "loss": 0.39425763487815857, "num_tokens": 376192544.0, "step": 417 }, { "epoch": 3.1194029850746268, "grad_norm": 0.26126693334804235, "learning_rate": 3.304540274382151e-06, "loss": 0.3673323094844818, "num_tokens": 377142524.0, "step": 418 }, { "epoch": 3.126865671641791, "grad_norm": 0.2718425837582604, "learning_rate": 3.284037109904013e-06, "loss": 0.38800495862960815, "num_tokens": 378076354.0, "step": 419 }, { "epoch": 3.1343283582089554, "grad_norm": 0.24762599606026042, "learning_rate": 3.263594495167688e-06, "loss": 0.3551333248615265, "num_tokens": 378966330.0, "step": 420 }, { "epoch": 3.1417910447761193, "grad_norm": 0.3979931015660995, "learning_rate": 3.2432129887543026e-06, "loss": 0.3955429196357727, "num_tokens": 379888904.0, "step": 421 }, { "epoch": 3.1492537313432836, "grad_norm": 0.27409522127657593, "learning_rate": 3.2228931475752323e-06, "loss": 0.35347574949264526, "num_tokens": 380738966.0, "step": 422 }, { "epoch": 3.156716417910448, "grad_norm": 0.26157991571638095, "learning_rate": 3.2026355268568987e-06, "loss": 0.35351991653442383, "num_tokens": 381614529.0, "step": 423 }, { "epoch": 3.1641791044776117, "grad_norm": 0.253961852327095, "learning_rate": 3.1824406801255836e-06, "loss": 0.36370548605918884, "num_tokens": 382513458.0, "step": 424 }, { "epoch": 3.171641791044776, "grad_norm": 0.24868042189319053, "learning_rate": 3.162309159192316e-06, "loss": 0.3607192635536194, "num_tokens": 383449861.0, "step": 425 }, { "epoch": 3.1791044776119404, "grad_norm": 0.26485700184898936, "learning_rate": 3.1422415141377815e-06, "loss": 0.3481111228466034, "num_tokens": 384253017.0, "step": 426 }, { "epoch": 3.1865671641791047, "grad_norm": 0.28281284316278155, "learning_rate": 3.122238293297305e-06, "loss": 0.3816152811050415, "num_tokens": 385257443.0, "step": 427 }, { "epoch": 3.1940298507462686, "grad_norm": 0.2628707804556158, "learning_rate": 3.10230004324586e-06, "loss": 0.349966824054718, "num_tokens": 386017753.0, "step": 428 }, { "epoch": 3.201492537313433, "grad_norm": 0.2606711695382564, "learning_rate": 3.0824273087831335e-06, "loss": 0.38945478200912476, "num_tokens": 386978912.0, "step": 429 }, { "epoch": 3.208955223880597, "grad_norm": 0.2747230623623624, "learning_rate": 3.062620632918648e-06, "loss": 0.3638556897640228, "num_tokens": 387852467.0, "step": 430 }, { "epoch": 3.216417910447761, "grad_norm": 0.2803007110615389, "learning_rate": 3.0428805568569076e-06, "loss": 0.38482367992401123, "num_tokens": 388658923.0, "step": 431 }, { "epoch": 3.2238805970149254, "grad_norm": 0.2645967994643593, "learning_rate": 3.023207619982629e-06, "loss": 0.36384740471839905, "num_tokens": 389508858.0, "step": 432 }, { "epoch": 3.2313432835820897, "grad_norm": 0.27202749711662244, "learning_rate": 3.0036023598459895e-06, "loss": 0.39492571353912354, "num_tokens": 390450838.0, "step": 433 }, { "epoch": 3.2388059701492535, "grad_norm": 0.2858842639475798, "learning_rate": 2.9840653121479478e-06, "loss": 0.3738439679145813, "num_tokens": 391283207.0, "step": 434 }, { "epoch": 3.246268656716418, "grad_norm": 0.24793258551891303, "learning_rate": 2.9645970107255997e-06, "loss": 0.35694074630737305, "num_tokens": 392285292.0, "step": 435 }, { "epoch": 3.253731343283582, "grad_norm": 0.2819278079547717, "learning_rate": 2.9451979875375913e-06, "loss": 0.3710547387599945, "num_tokens": 393145041.0, "step": 436 }, { "epoch": 3.2611940298507465, "grad_norm": 0.2631494530375275, "learning_rate": 2.925868772649591e-06, "loss": 0.3825373351573944, "num_tokens": 394022264.0, "step": 437 }, { "epoch": 3.2686567164179103, "grad_norm": 0.2555768738323888, "learning_rate": 2.9066098942197995e-06, "loss": 0.36353516578674316, "num_tokens": 394892104.0, "step": 438 }, { "epoch": 3.2761194029850746, "grad_norm": 0.252531665467931, "learning_rate": 2.887421878484516e-06, "loss": 0.38284653425216675, "num_tokens": 395835092.0, "step": 439 }, { "epoch": 3.283582089552239, "grad_norm": 0.2845282411268666, "learning_rate": 2.8683052497437665e-06, "loss": 0.3927590548992157, "num_tokens": 396725722.0, "step": 440 }, { "epoch": 3.291044776119403, "grad_norm": 0.26812964504110554, "learning_rate": 2.8492605303469732e-06, "loss": 0.37616321444511414, "num_tokens": 397618546.0, "step": 441 }, { "epoch": 3.298507462686567, "grad_norm": 0.25144632819615587, "learning_rate": 2.8302882406786817e-06, "loss": 0.382343053817749, "num_tokens": 398571441.0, "step": 442 }, { "epoch": 3.3059701492537314, "grad_norm": 0.29981470486255846, "learning_rate": 2.811388899144345e-06, "loss": 0.3775964379310608, "num_tokens": 399409770.0, "step": 443 }, { "epoch": 3.3134328358208958, "grad_norm": 0.37890241833609745, "learning_rate": 2.7925630221561506e-06, "loss": 0.37770912051200867, "num_tokens": 400392695.0, "step": 444 }, { "epoch": 3.3208955223880596, "grad_norm": 0.2755196059695862, "learning_rate": 2.7738111241189185e-06, "loss": 0.3694460690021515, "num_tokens": 401345623.0, "step": 445 }, { "epoch": 3.328358208955224, "grad_norm": 0.2680514510877795, "learning_rate": 2.755133717416043e-06, "loss": 0.3776453137397766, "num_tokens": 402260500.0, "step": 446 }, { "epoch": 3.3358208955223883, "grad_norm": 0.24171155783588386, "learning_rate": 2.7365313123954916e-06, "loss": 0.3985833525657654, "num_tokens": 403276687.0, "step": 447 }, { "epoch": 3.343283582089552, "grad_norm": 0.2569574542175994, "learning_rate": 2.718004417355855e-06, "loss": 0.3654242157936096, "num_tokens": 404190134.0, "step": 448 }, { "epoch": 3.3507462686567164, "grad_norm": 0.2493193792220214, "learning_rate": 2.699553538532467e-06, "loss": 0.3807545006275177, "num_tokens": 405215802.0, "step": 449 }, { "epoch": 3.3582089552238807, "grad_norm": 0.35218345161133224, "learning_rate": 2.6811791800835684e-06, "loss": 0.37028026580810547, "num_tokens": 406189028.0, "step": 450 }, { "epoch": 3.3656716417910446, "grad_norm": 0.2630947673101325, "learning_rate": 2.662881844076527e-06, "loss": 0.3866269886493683, "num_tokens": 407112961.0, "step": 451 }, { "epoch": 3.373134328358209, "grad_norm": 0.24502727833486168, "learning_rate": 2.6446620304741267e-06, "loss": 0.3389516770839691, "num_tokens": 407955720.0, "step": 452 }, { "epoch": 3.3805970149253732, "grad_norm": 0.29642792873153473, "learning_rate": 2.6265202371208985e-06, "loss": 0.3727038502693176, "num_tokens": 408861534.0, "step": 453 }, { "epoch": 3.388059701492537, "grad_norm": 0.2729055281837691, "learning_rate": 2.6084569597295227e-06, "loss": 0.37226539850234985, "num_tokens": 409769033.0, "step": 454 }, { "epoch": 3.3955223880597014, "grad_norm": 0.2591963730270879, "learning_rate": 2.590472691867284e-06, "loss": 0.3665540814399719, "num_tokens": 410734429.0, "step": 455 }, { "epoch": 3.4029850746268657, "grad_norm": 0.24603379464247438, "learning_rate": 2.57256792494258e-06, "loss": 0.3557394742965698, "num_tokens": 411668760.0, "step": 456 }, { "epoch": 3.41044776119403, "grad_norm": 0.26710941082613454, "learning_rate": 2.5547431481914973e-06, "loss": 0.3810808062553406, "num_tokens": 412593612.0, "step": 457 }, { "epoch": 3.417910447761194, "grad_norm": 0.24517969588523647, "learning_rate": 2.536998848664445e-06, "loss": 0.36506032943725586, "num_tokens": 413566574.0, "step": 458 }, { "epoch": 3.425373134328358, "grad_norm": 0.26080308677293107, "learning_rate": 2.5193355112128436e-06, "loss": 0.375240683555603, "num_tokens": 414490201.0, "step": 459 }, { "epoch": 3.4328358208955225, "grad_norm": 0.2507828857248926, "learning_rate": 2.501753618475877e-06, "loss": 0.3682469129562378, "num_tokens": 415392501.0, "step": 460 }, { "epoch": 3.4402985074626864, "grad_norm": 0.2692051104680508, "learning_rate": 2.4842536508673087e-06, "loss": 0.37688201665878296, "num_tokens": 416317197.0, "step": 461 }, { "epoch": 3.4477611940298507, "grad_norm": 0.2549135365443059, "learning_rate": 2.466836086562345e-06, "loss": 0.36603114008903503, "num_tokens": 417156988.0, "step": 462 }, { "epoch": 3.455223880597015, "grad_norm": 0.2453940193702825, "learning_rate": 2.4495014014845807e-06, "loss": 0.3681268095970154, "num_tokens": 418076187.0, "step": 463 }, { "epoch": 3.4626865671641793, "grad_norm": 0.2725566155237126, "learning_rate": 2.432250069292989e-06, "loss": 0.37921467423439026, "num_tokens": 418901462.0, "step": 464 }, { "epoch": 3.470149253731343, "grad_norm": 0.25907951334448015, "learning_rate": 2.415082561368979e-06, "loss": 0.39200738072395325, "num_tokens": 419804291.0, "step": 465 }, { "epoch": 3.4776119402985075, "grad_norm": 0.26406315997541896, "learning_rate": 2.397999346803518e-06, "loss": 0.39208582043647766, "num_tokens": 420712064.0, "step": 466 }, { "epoch": 3.485074626865672, "grad_norm": 0.23773901962622077, "learning_rate": 2.3810008923843077e-06, "loss": 0.37207821011543274, "num_tokens": 421699792.0, "step": 467 }, { "epoch": 3.4925373134328357, "grad_norm": 0.2479152678036227, "learning_rate": 2.3640876625830385e-06, "loss": 0.37208831310272217, "num_tokens": 422643169.0, "step": 468 }, { "epoch": 3.5, "grad_norm": 0.2563637058550244, "learning_rate": 2.347260119542692e-06, "loss": 0.378294974565506, "num_tokens": 423633161.0, "step": 469 }, { "epoch": 3.5074626865671643, "grad_norm": 0.2606090648417702, "learning_rate": 2.3305187230649177e-06, "loss": 0.3819723129272461, "num_tokens": 424556814.0, "step": 470 }, { "epoch": 3.5149253731343286, "grad_norm": 0.251352839735243, "learning_rate": 2.3138639305974596e-06, "loss": 0.37940090894699097, "num_tokens": 425479906.0, "step": 471 }, { "epoch": 3.5223880597014925, "grad_norm": 0.24680617583096287, "learning_rate": 2.2972961972216703e-06, "loss": 0.3712913393974304, "num_tokens": 426446651.0, "step": 472 }, { "epoch": 3.529850746268657, "grad_norm": 0.25068376553010957, "learning_rate": 2.2808159756400667e-06, "loss": 0.36781617999076843, "num_tokens": 427310770.0, "step": 473 }, { "epoch": 3.5373134328358207, "grad_norm": 0.2575081517211329, "learning_rate": 2.264423716163962e-06, "loss": 0.38692015409469604, "num_tokens": 428270355.0, "step": 474 }, { "epoch": 3.544776119402985, "grad_norm": 0.26353888813152593, "learning_rate": 2.2481198667011675e-06, "loss": 0.4076312184333801, "num_tokens": 429240026.0, "step": 475 }, { "epoch": 3.5522388059701493, "grad_norm": 0.24599771689956054, "learning_rate": 2.231904872743739e-06, "loss": 0.3803725838661194, "num_tokens": 430167582.0, "step": 476 }, { "epoch": 3.5597014925373136, "grad_norm": 0.24639104277677626, "learning_rate": 2.2157791773558222e-06, "loss": 0.3705400228500366, "num_tokens": 431118645.0, "step": 477 }, { "epoch": 3.5671641791044775, "grad_norm": 0.25169470907126656, "learning_rate": 2.199743221161533e-06, "loss": 0.40112996101379395, "num_tokens": 432105903.0, "step": 478 }, { "epoch": 3.574626865671642, "grad_norm": 0.2494015959735466, "learning_rate": 2.1837974423329254e-06, "loss": 0.37427645921707153, "num_tokens": 432968989.0, "step": 479 }, { "epoch": 3.582089552238806, "grad_norm": 0.2510312087393284, "learning_rate": 2.1679422765780115e-06, "loss": 0.3761802613735199, "num_tokens": 433879607.0, "step": 480 }, { "epoch": 3.58955223880597, "grad_norm": 0.2472662036793444, "learning_rate": 2.152178157128865e-06, "loss": 0.37739771604537964, "num_tokens": 434793981.0, "step": 481 }, { "epoch": 3.5970149253731343, "grad_norm": 0.2526868205714577, "learning_rate": 2.136505514729774e-06, "loss": 0.3701442778110504, "num_tokens": 435697283.0, "step": 482 }, { "epoch": 3.6044776119402986, "grad_norm": 0.2516013371512894, "learning_rate": 2.1209247776254795e-06, "loss": 0.3924868106842041, "num_tokens": 436627533.0, "step": 483 }, { "epoch": 3.611940298507463, "grad_norm": 0.24502502103917492, "learning_rate": 2.1054363715494695e-06, "loss": 0.34178441762924194, "num_tokens": 437481939.0, "step": 484 }, { "epoch": 3.6194029850746268, "grad_norm": 0.26412516983013123, "learning_rate": 2.0900407197123444e-06, "loss": 0.3800678253173828, "num_tokens": 438276274.0, "step": 485 }, { "epoch": 3.626865671641791, "grad_norm": 0.2650046240456923, "learning_rate": 2.0747382427902574e-06, "loss": 0.4031677544116974, "num_tokens": 439089480.0, "step": 486 }, { "epoch": 3.6343283582089554, "grad_norm": 0.2587379164469128, "learning_rate": 2.059529358913418e-06, "loss": 0.37271153926849365, "num_tokens": 439983559.0, "step": 487 }, { "epoch": 3.6417910447761193, "grad_norm": 0.2540913543502109, "learning_rate": 2.0444144836546684e-06, "loss": 0.3822531998157501, "num_tokens": 440850324.0, "step": 488 }, { "epoch": 3.6492537313432836, "grad_norm": 0.27783327558446214, "learning_rate": 2.0293940300181216e-06, "loss": 0.3831808269023895, "num_tokens": 441605590.0, "step": 489 }, { "epoch": 3.656716417910448, "grad_norm": 0.2796967269697153, "learning_rate": 2.0144684084278847e-06, "loss": 0.3709692060947418, "num_tokens": 442348946.0, "step": 490 }, { "epoch": 3.664179104477612, "grad_norm": 0.2465314987769435, "learning_rate": 1.999638026716842e-06, "loss": 0.35937702655792236, "num_tokens": 443300971.0, "step": 491 }, { "epoch": 3.671641791044776, "grad_norm": 0.24683809772269052, "learning_rate": 1.9849032901155075e-06, "loss": 0.39329999685287476, "num_tokens": 444301774.0, "step": 492 }, { "epoch": 3.6791044776119404, "grad_norm": 0.23859031932692393, "learning_rate": 1.970264601240958e-06, "loss": 0.3722185492515564, "num_tokens": 445224414.0, "step": 493 }, { "epoch": 3.6865671641791042, "grad_norm": 0.2707873095537451, "learning_rate": 1.955722360085824e-06, "loss": 0.38121020793914795, "num_tokens": 446138719.0, "step": 494 }, { "epoch": 3.6940298507462686, "grad_norm": 0.27134162465622047, "learning_rate": 1.941276964007369e-06, "loss": 0.41704389452934265, "num_tokens": 447027595.0, "step": 495 }, { "epoch": 3.701492537313433, "grad_norm": 0.27102779980384334, "learning_rate": 1.9269288077166264e-06, "loss": 0.41601014137268066, "num_tokens": 447918016.0, "step": 496 }, { "epoch": 3.708955223880597, "grad_norm": 0.2795343486481597, "learning_rate": 1.9126782832676175e-06, "loss": 0.37963247299194336, "num_tokens": 448782123.0, "step": 497 }, { "epoch": 3.716417910447761, "grad_norm": 0.24533152172023073, "learning_rate": 1.898525780046635e-06, "loss": 0.37255096435546875, "num_tokens": 449735295.0, "step": 498 }, { "epoch": 3.7238805970149254, "grad_norm": 0.25068064600084505, "learning_rate": 1.8844716847616053e-06, "loss": 0.3953704237937927, "num_tokens": 450703519.0, "step": 499 }, { "epoch": 3.7313432835820897, "grad_norm": 0.27826952511668485, "learning_rate": 1.870516381431523e-06, "loss": 0.37893447279930115, "num_tokens": 451523722.0, "step": 500 }, { "epoch": 3.7388059701492535, "grad_norm": 0.2470705912133386, "learning_rate": 1.8566602513759573e-06, "loss": 0.36960500478744507, "num_tokens": 452496914.0, "step": 501 }, { "epoch": 3.746268656716418, "grad_norm": 0.2380353729045607, "learning_rate": 1.8429036732046328e-06, "loss": 0.3598456084728241, "num_tokens": 453486873.0, "step": 502 }, { "epoch": 3.753731343283582, "grad_norm": 0.24753875738528466, "learning_rate": 1.8292470228070808e-06, "loss": 0.3775923550128937, "num_tokens": 454415514.0, "step": 503 }, { "epoch": 3.7611940298507465, "grad_norm": 0.24852622318044526, "learning_rate": 1.815690673342374e-06, "loss": 0.377275288105011, "num_tokens": 455330400.0, "step": 504 }, { "epoch": 3.7686567164179103, "grad_norm": 0.24830439594342327, "learning_rate": 1.8022349952289275e-06, "loss": 0.3592768907546997, "num_tokens": 456232858.0, "step": 505 }, { "epoch": 3.7761194029850746, "grad_norm": 0.2661718758635726, "learning_rate": 1.7888803561343755e-06, "loss": 0.3917810320854187, "num_tokens": 457091321.0, "step": 506 }, { "epoch": 3.783582089552239, "grad_norm": 0.2652414658319871, "learning_rate": 1.7756271209655296e-06, "loss": 0.41377222537994385, "num_tokens": 457990573.0, "step": 507 }, { "epoch": 3.791044776119403, "grad_norm": 0.260047413567863, "learning_rate": 1.7624756518584015e-06, "loss": 0.3786197304725647, "num_tokens": 458827375.0, "step": 508 }, { "epoch": 3.798507462686567, "grad_norm": 0.24921975710502509, "learning_rate": 1.7494263081683134e-06, "loss": 0.36924827098846436, "num_tokens": 459694321.0, "step": 509 }, { "epoch": 3.8059701492537314, "grad_norm": 0.24376539520051552, "learning_rate": 1.736479446460081e-06, "loss": 0.3597017526626587, "num_tokens": 460616396.0, "step": 510 }, { "epoch": 3.8134328358208958, "grad_norm": 0.24365917664342365, "learning_rate": 1.723635420498259e-06, "loss": 0.36935943365097046, "num_tokens": 461530829.0, "step": 511 }, { "epoch": 3.8208955223880596, "grad_norm": 0.23932370443954964, "learning_rate": 1.7108945812374874e-06, "loss": 0.387093722820282, "num_tokens": 462464505.0, "step": 512 }, { "epoch": 3.828358208955224, "grad_norm": 0.257078997056124, "learning_rate": 1.6982572768128964e-06, "loss": 0.38530057668685913, "num_tokens": 463398691.0, "step": 513 }, { "epoch": 3.835820895522388, "grad_norm": 0.24718882255890465, "learning_rate": 1.6857238525305924e-06, "loss": 0.3774847388267517, "num_tokens": 464295344.0, "step": 514 }, { "epoch": 3.843283582089552, "grad_norm": 0.23460337795500458, "learning_rate": 1.6732946508582288e-06, "loss": 0.3643302619457245, "num_tokens": 465251963.0, "step": 515 }, { "epoch": 3.8507462686567164, "grad_norm": 0.23769889055431628, "learning_rate": 1.6609700114156368e-06, "loss": 0.3710617423057556, "num_tokens": 466213047.0, "step": 516 }, { "epoch": 3.8582089552238807, "grad_norm": 0.23396843344896867, "learning_rate": 1.6487502709655591e-06, "loss": 0.382940411567688, "num_tokens": 467245768.0, "step": 517 }, { "epoch": 3.8656716417910446, "grad_norm": 0.23909686285484746, "learning_rate": 1.6366357634044406e-06, "loss": 0.3723403215408325, "num_tokens": 468129089.0, "step": 518 }, { "epoch": 3.873134328358209, "grad_norm": 0.2688981703218909, "learning_rate": 1.6246268197533046e-06, "loss": 0.3829047381877899, "num_tokens": 468938058.0, "step": 519 }, { "epoch": 3.8805970149253732, "grad_norm": 0.25519957310879615, "learning_rate": 1.6127237681487096e-06, "loss": 0.39446866512298584, "num_tokens": 469847619.0, "step": 520 }, { "epoch": 3.888059701492537, "grad_norm": 0.2486366229197261, "learning_rate": 1.6009269338337832e-06, "loss": 0.3983200788497925, "num_tokens": 470791148.0, "step": 521 }, { "epoch": 3.8955223880597014, "grad_norm": 0.24830658428540756, "learning_rate": 1.5892366391493363e-06, "loss": 0.38877153396606445, "num_tokens": 471735636.0, "step": 522 }, { "epoch": 3.9029850746268657, "grad_norm": 0.24923977507707654, "learning_rate": 1.5776532035250513e-06, "loss": 0.37799936532974243, "num_tokens": 472685312.0, "step": 523 }, { "epoch": 3.91044776119403, "grad_norm": 0.23192614084158372, "learning_rate": 1.5661769434707585e-06, "loss": 0.36345481872558594, "num_tokens": 473551908.0, "step": 524 }, { "epoch": 3.917910447761194, "grad_norm": 0.2552136498693883, "learning_rate": 1.5548081725677843e-06, "loss": 0.38905611634254456, "num_tokens": 474411763.0, "step": 525 }, { "epoch": 3.925373134328358, "grad_norm": 0.24222236291728852, "learning_rate": 1.543547201460384e-06, "loss": 0.39437806606292725, "num_tokens": 475386853.0, "step": 526 }, { "epoch": 3.9328358208955225, "grad_norm": 0.24678201558159044, "learning_rate": 1.5323943378472547e-06, "loss": 0.38338255882263184, "num_tokens": 476308351.0, "step": 527 }, { "epoch": 3.9402985074626864, "grad_norm": 0.24156858852006619, "learning_rate": 1.5213498864731266e-06, "loss": 0.3475341796875, "num_tokens": 477113932.0, "step": 528 }, { "epoch": 3.9477611940298507, "grad_norm": 0.2450649252841632, "learning_rate": 1.510414149120436e-06, "loss": 0.3621699810028076, "num_tokens": 477978986.0, "step": 529 }, { "epoch": 3.955223880597015, "grad_norm": 0.2615934671849586, "learning_rate": 1.4995874246010778e-06, "loss": 0.39790230989456177, "num_tokens": 478804801.0, "step": 530 }, { "epoch": 3.9626865671641793, "grad_norm": 0.23841246285008993, "learning_rate": 1.4888700087482447e-06, "loss": 0.36489465832710266, "num_tokens": 479744154.0, "step": 531 }, { "epoch": 3.970149253731343, "grad_norm": 0.23884234571084306, "learning_rate": 1.4782621944083395e-06, "loss": 0.3676777482032776, "num_tokens": 480672910.0, "step": 532 }, { "epoch": 3.9776119402985075, "grad_norm": 0.24521642019046497, "learning_rate": 1.4677642714329772e-06, "loss": 0.36571812629699707, "num_tokens": 481542586.0, "step": 533 }, { "epoch": 3.9850746268656714, "grad_norm": 0.2490357512875355, "learning_rate": 1.45737652667106e-06, "loss": 0.3776237964630127, "num_tokens": 482388483.0, "step": 534 }, { "epoch": 3.9925373134328357, "grad_norm": 0.26895614724288625, "learning_rate": 1.4470992439609447e-06, "loss": 0.36370331048965454, "num_tokens": 483130281.0, "step": 535 }, { "epoch": 4.0, "grad_norm": 0.23598448132329167, "learning_rate": 1.4369327041226832e-06, "loss": 0.3770376443862915, "num_tokens": 484157211.0, "step": 536 }, { "epoch": 4.007462686567164, "grad_norm": 0.2696832935027054, "learning_rate": 1.4268771849503507e-06, "loss": 0.3495013117790222, "num_tokens": 484950425.0, "step": 537 }, { "epoch": 4.014925373134329, "grad_norm": 0.2523061504872546, "learning_rate": 1.416932961204457e-06, "loss": 0.35033246874809265, "num_tokens": 485897373.0, "step": 538 }, { "epoch": 4.022388059701493, "grad_norm": 0.24871017609979634, "learning_rate": 1.4071003046044324e-06, "loss": 0.3654225468635559, "num_tokens": 486751466.0, "step": 539 }, { "epoch": 4.029850746268656, "grad_norm": 0.23941923046022578, "learning_rate": 1.3973794838212124e-06, "loss": 0.36163097620010376, "num_tokens": 487741373.0, "step": 540 }, { "epoch": 4.037313432835821, "grad_norm": 0.2662894021736037, "learning_rate": 1.3877707644698895e-06, "loss": 0.3875274062156677, "num_tokens": 488582397.0, "step": 541 }, { "epoch": 4.044776119402985, "grad_norm": 0.2747015512526315, "learning_rate": 1.3782744091024586e-06, "loss": 0.3777075409889221, "num_tokens": 489319854.0, "step": 542 }, { "epoch": 4.052238805970149, "grad_norm": 0.2525490812172139, "learning_rate": 1.3688906772006393e-06, "loss": 0.36404550075531006, "num_tokens": 490257709.0, "step": 543 }, { "epoch": 4.059701492537314, "grad_norm": 0.24847635532681875, "learning_rate": 1.359619825168792e-06, "loss": 0.36995524168014526, "num_tokens": 491153491.0, "step": 544 }, { "epoch": 4.067164179104478, "grad_norm": 0.23960840738937653, "learning_rate": 1.3504621063269058e-06, "loss": 0.36562579870224, "num_tokens": 492103168.0, "step": 545 }, { "epoch": 4.074626865671641, "grad_norm": 0.25453231573026114, "learning_rate": 1.3414177709036802e-06, "loss": 0.36385661363601685, "num_tokens": 493050344.0, "step": 546 }, { "epoch": 4.082089552238806, "grad_norm": 0.2409618977265192, "learning_rate": 1.3324870660296869e-06, "loss": 0.34029990434646606, "num_tokens": 493993937.0, "step": 547 }, { "epoch": 4.08955223880597, "grad_norm": 0.23732030399559195, "learning_rate": 1.3236702357306157e-06, "loss": 0.37044817209243774, "num_tokens": 494995752.0, "step": 548 }, { "epoch": 4.097014925373134, "grad_norm": 0.27145957936067777, "learning_rate": 1.3149675209206086e-06, "loss": 0.36308181285858154, "num_tokens": 495757177.0, "step": 549 }, { "epoch": 4.104477611940299, "grad_norm": 0.2833034141091316, "learning_rate": 1.3063791593956758e-06, "loss": 0.37331539392471313, "num_tokens": 496689668.0, "step": 550 }, { "epoch": 4.111940298507463, "grad_norm": 0.240823460931463, "learning_rate": 1.2979053858271995e-06, "loss": 0.36020007729530334, "num_tokens": 497565858.0, "step": 551 }, { "epoch": 4.119402985074627, "grad_norm": 0.2594604561644764, "learning_rate": 1.2895464317555206e-06, "loss": 0.3884323239326477, "num_tokens": 498385563.0, "step": 552 }, { "epoch": 4.126865671641791, "grad_norm": 0.23073517132438157, "learning_rate": 1.2813025255836104e-06, "loss": 0.349163293838501, "num_tokens": 499323100.0, "step": 553 }, { "epoch": 4.134328358208955, "grad_norm": 0.2603679958630556, "learning_rate": 1.2731738925708328e-06, "loss": 0.36741840839385986, "num_tokens": 500196622.0, "step": 554 }, { "epoch": 4.141791044776119, "grad_norm": 0.24326119145979633, "learning_rate": 1.2651607548267873e-06, "loss": 0.3810882568359375, "num_tokens": 501224710.0, "step": 555 }, { "epoch": 4.149253731343284, "grad_norm": 0.22934377798425087, "learning_rate": 1.257263331305241e-06, "loss": 0.37762486934661865, "num_tokens": 502305655.0, "step": 556 }, { "epoch": 4.156716417910448, "grad_norm": 0.2399419262393838, "learning_rate": 1.249481837798144e-06, "loss": 0.360861212015152, "num_tokens": 503186087.0, "step": 557 }, { "epoch": 4.164179104477612, "grad_norm": 0.2356017748084062, "learning_rate": 1.2418164869297353e-06, "loss": 0.36369866132736206, "num_tokens": 504097376.0, "step": 558 }, { "epoch": 4.1716417910447765, "grad_norm": 0.239368624704367, "learning_rate": 1.2342674881507327e-06, "loss": 0.36475175619125366, "num_tokens": 505048926.0, "step": 559 }, { "epoch": 4.17910447761194, "grad_norm": 0.24555194813944806, "learning_rate": 1.2268350477326073e-06, "loss": 0.3852774500846863, "num_tokens": 505967694.0, "step": 560 }, { "epoch": 4.186567164179104, "grad_norm": 0.24385261062576613, "learning_rate": 1.2195193687619505e-06, "loss": 0.3750133812427521, "num_tokens": 506924348.0, "step": 561 }, { "epoch": 4.1940298507462686, "grad_norm": 0.24733441550806298, "learning_rate": 1.2123206511349212e-06, "loss": 0.36548683047294617, "num_tokens": 507837247.0, "step": 562 }, { "epoch": 4.201492537313433, "grad_norm": 0.2626516276894915, "learning_rate": 1.2052390915517881e-06, "loss": 0.36941125988960266, "num_tokens": 508615951.0, "step": 563 }, { "epoch": 4.208955223880597, "grad_norm": 0.24609691004441409, "learning_rate": 1.1982748835115512e-06, "loss": 0.3862428665161133, "num_tokens": 509598473.0, "step": 564 }, { "epoch": 4.2164179104477615, "grad_norm": 0.24842515895556683, "learning_rate": 1.1914282173066574e-06, "loss": 0.38270822167396545, "num_tokens": 510499495.0, "step": 565 }, { "epoch": 4.223880597014926, "grad_norm": 0.2407337171765148, "learning_rate": 1.1846992800177979e-06, "loss": 0.3664012551307678, "num_tokens": 511393216.0, "step": 566 }, { "epoch": 4.231343283582089, "grad_norm": 0.2442416141258047, "learning_rate": 1.1780882555087988e-06, "loss": 0.3886314034461975, "num_tokens": 512343363.0, "step": 567 }, { "epoch": 4.2388059701492535, "grad_norm": 0.2577619381883818, "learning_rate": 1.1715953244215964e-06, "loss": 0.3437773585319519, "num_tokens": 513127609.0, "step": 568 }, { "epoch": 4.246268656716418, "grad_norm": 0.25087871697950354, "learning_rate": 1.165220664171302e-06, "loss": 0.3734786808490753, "num_tokens": 514033936.0, "step": 569 }, { "epoch": 4.253731343283582, "grad_norm": 0.2392856334846873, "learning_rate": 1.1589644489413516e-06, "loss": 0.35015231370925903, "num_tokens": 514934044.0, "step": 570 }, { "epoch": 4.2611940298507465, "grad_norm": 0.23533059380991045, "learning_rate": 1.1528268496787498e-06, "loss": 0.3818935453891754, "num_tokens": 515909265.0, "step": 571 }, { "epoch": 4.268656716417911, "grad_norm": 0.28002873497751246, "learning_rate": 1.1468080340893958e-06, "loss": 0.3613874316215515, "num_tokens": 516712628.0, "step": 572 }, { "epoch": 4.276119402985074, "grad_norm": 0.26573428139291055, "learning_rate": 1.1409081666335035e-06, "loss": 0.40466490387916565, "num_tokens": 517664539.0, "step": 573 }, { "epoch": 4.2835820895522385, "grad_norm": 0.2622221544713941, "learning_rate": 1.1351274085211068e-06, "loss": 0.36875689029693604, "num_tokens": 518492097.0, "step": 574 }, { "epoch": 4.291044776119403, "grad_norm": 0.8295997519231081, "learning_rate": 1.1294659177076523e-06, "loss": 0.343036413192749, "num_tokens": 519432536.0, "step": 575 }, { "epoch": 4.298507462686567, "grad_norm": 0.26477934459538893, "learning_rate": 1.1239238488896875e-06, "loss": 0.39276033639907837, "num_tokens": 520276253.0, "step": 576 }, { "epoch": 4.3059701492537314, "grad_norm": 0.2751291575165678, "learning_rate": 1.118501353500631e-06, "loss": 0.36554020643234253, "num_tokens": 521085557.0, "step": 577 }, { "epoch": 4.313432835820896, "grad_norm": 0.26704770077542006, "learning_rate": 1.1131985797066364e-06, "loss": 0.39840590953826904, "num_tokens": 521915761.0, "step": 578 }, { "epoch": 4.32089552238806, "grad_norm": 0.267325084112826, "learning_rate": 1.1080156724025409e-06, "loss": 0.3594783842563629, "num_tokens": 522783342.0, "step": 579 }, { "epoch": 4.3283582089552235, "grad_norm": 0.23810536176661679, "learning_rate": 1.1029527732079084e-06, "loss": 0.37440672516822815, "num_tokens": 523807264.0, "step": 580 }, { "epoch": 4.335820895522388, "grad_norm": 0.27369911060242186, "learning_rate": 1.0980100204631604e-06, "loss": 0.40351587533950806, "num_tokens": 524601938.0, "step": 581 }, { "epoch": 4.343283582089552, "grad_norm": 0.23536111609123755, "learning_rate": 1.0931875492257946e-06, "loss": 0.33745962381362915, "num_tokens": 525537212.0, "step": 582 }, { "epoch": 4.350746268656716, "grad_norm": 0.2600131581237491, "learning_rate": 1.088485491266694e-06, "loss": 0.38494178652763367, "num_tokens": 526347121.0, "step": 583 }, { "epoch": 4.358208955223881, "grad_norm": 0.23219951832538527, "learning_rate": 1.0839039750665292e-06, "loss": 0.35427361726760864, "num_tokens": 527281437.0, "step": 584 }, { "epoch": 4.365671641791045, "grad_norm": 0.2489391057817072, "learning_rate": 1.079443125812243e-06, "loss": 0.3624609708786011, "num_tokens": 528208071.0, "step": 585 }, { "epoch": 4.373134328358209, "grad_norm": 0.2539695897127002, "learning_rate": 1.0751030653936356e-06, "loss": 0.3747778534889221, "num_tokens": 529032089.0, "step": 586 }, { "epoch": 4.380597014925373, "grad_norm": 0.2499880144186626, "learning_rate": 1.0708839124000287e-06, "loss": 0.38273054361343384, "num_tokens": 529947287.0, "step": 587 }, { "epoch": 4.388059701492537, "grad_norm": 0.2506974248310357, "learning_rate": 1.0667857821170282e-06, "loss": 0.3470362424850464, "num_tokens": 530728896.0, "step": 588 }, { "epoch": 4.395522388059701, "grad_norm": 0.24506418459436066, "learning_rate": 1.0628087865233737e-06, "loss": 0.35882338881492615, "num_tokens": 531620091.0, "step": 589 }, { "epoch": 4.402985074626866, "grad_norm": 0.24329483114740325, "learning_rate": 1.058953034287877e-06, "loss": 0.37174564599990845, "num_tokens": 532460579.0, "step": 590 }, { "epoch": 4.41044776119403, "grad_norm": 0.23831984993388738, "learning_rate": 1.0552186307664567e-06, "loss": 0.363148033618927, "num_tokens": 533351390.0, "step": 591 }, { "epoch": 4.417910447761194, "grad_norm": 0.26162136426743393, "learning_rate": 1.0516056779992543e-06, "loss": 0.38013726472854614, "num_tokens": 534195605.0, "step": 592 }, { "epoch": 4.425373134328359, "grad_norm": 0.2635745464481523, "learning_rate": 1.0481142747078494e-06, "loss": 0.3700369596481323, "num_tokens": 535033541.0, "step": 593 }, { "epoch": 4.432835820895522, "grad_norm": 0.25007207032778783, "learning_rate": 1.0447445162925614e-06, "loss": 0.3790166974067688, "num_tokens": 535964895.0, "step": 594 }, { "epoch": 4.440298507462686, "grad_norm": 0.22799545701890034, "learning_rate": 1.0414964948298436e-06, "loss": 0.36508986353874207, "num_tokens": 536941184.0, "step": 595 }, { "epoch": 4.447761194029851, "grad_norm": 0.23265306394886567, "learning_rate": 1.0383702990697657e-06, "loss": 0.3546326160430908, "num_tokens": 537896596.0, "step": 596 }, { "epoch": 4.455223880597015, "grad_norm": 0.2452826212608677, "learning_rate": 1.0353660144335892e-06, "loss": 0.3647281229496002, "num_tokens": 538748931.0, "step": 597 }, { "epoch": 4.462686567164179, "grad_norm": 0.24623855227956742, "learning_rate": 1.0324837230114332e-06, "loss": 0.3664322793483734, "num_tokens": 539622406.0, "step": 598 }, { "epoch": 4.470149253731344, "grad_norm": 0.24476867667376634, "learning_rate": 1.0297235035600337e-06, "loss": 0.35626494884490967, "num_tokens": 540561688.0, "step": 599 }, { "epoch": 4.477611940298507, "grad_norm": 0.22411638197357536, "learning_rate": 1.0270854315005874e-06, "loss": 0.3493247628211975, "num_tokens": 541498885.0, "step": 600 }, { "epoch": 4.485074626865671, "grad_norm": 0.23854702147816884, "learning_rate": 1.024569578916695e-06, "loss": 0.36460673809051514, "num_tokens": 542468798.0, "step": 601 }, { "epoch": 4.492537313432836, "grad_norm": 0.24473776240066009, "learning_rate": 1.0221760145523876e-06, "loss": 0.3664558529853821, "num_tokens": 543354992.0, "step": 602 }, { "epoch": 4.5, "grad_norm": 0.3484100772975978, "learning_rate": 1.0199048038102528e-06, "loss": 0.3781493902206421, "num_tokens": 544264190.0, "step": 603 }, { "epoch": 4.507462686567164, "grad_norm": 0.23041088788536823, "learning_rate": 1.0177560087496425e-06, "loss": 0.36557939648628235, "num_tokens": 545199765.0, "step": 604 }, { "epoch": 4.514925373134329, "grad_norm": 0.26397201636028744, "learning_rate": 1.0157296880849826e-06, "loss": 0.39719897508621216, "num_tokens": 546061065.0, "step": 605 }, { "epoch": 4.522388059701493, "grad_norm": 0.2510378043077616, "learning_rate": 1.0138258971841642e-06, "loss": 0.3602595925331116, "num_tokens": 546928816.0, "step": 606 }, { "epoch": 4.529850746268656, "grad_norm": 0.25217406420558186, "learning_rate": 1.0120446880670326e-06, "loss": 0.3766353130340576, "num_tokens": 547847934.0, "step": 607 }, { "epoch": 4.537313432835821, "grad_norm": 0.23959568238841403, "learning_rate": 1.010386109403967e-06, "loss": 0.3650025725364685, "num_tokens": 548766636.0, "step": 608 }, { "epoch": 4.544776119402985, "grad_norm": 0.2377901920772251, "learning_rate": 1.008850206514547e-06, "loss": 0.3625343143939972, "num_tokens": 549661389.0, "step": 609 }, { "epoch": 4.552238805970149, "grad_norm": 0.26122470845807755, "learning_rate": 1.0074370213663202e-06, "loss": 0.3682940602302551, "num_tokens": 550430887.0, "step": 610 }, { "epoch": 4.559701492537314, "grad_norm": 0.2481365703161649, "learning_rate": 1.0061465925736478e-06, "loss": 0.36531317234039307, "num_tokens": 551293916.0, "step": 611 }, { "epoch": 4.567164179104478, "grad_norm": 0.23719670021949013, "learning_rate": 1.004978955396657e-06, "loss": 0.3669975996017456, "num_tokens": 552281926.0, "step": 612 }, { "epoch": 4.574626865671641, "grad_norm": 0.25803252973725255, "learning_rate": 1.0039341417402715e-06, "loss": 0.37066352367401123, "num_tokens": 553148975.0, "step": 613 }, { "epoch": 4.582089552238806, "grad_norm": 0.2476936983459798, "learning_rate": 1.0030121801533442e-06, "loss": 0.3824441134929657, "num_tokens": 554068576.0, "step": 614 }, { "epoch": 4.58955223880597, "grad_norm": 0.2489594826146839, "learning_rate": 1.002213095827875e-06, "loss": 0.3596557378768921, "num_tokens": 554855138.0, "step": 615 }, { "epoch": 4.597014925373134, "grad_norm": 0.2550266059020853, "learning_rate": 1.0015369105983218e-06, "loss": 0.34850555658340454, "num_tokens": 555783649.0, "step": 616 }, { "epoch": 4.604477611940299, "grad_norm": 0.28933444541800885, "learning_rate": 1.0009836429410053e-06, "loss": 0.3593859076499939, "num_tokens": 556756059.0, "step": 617 }, { "epoch": 4.611940298507463, "grad_norm": 0.24100103005251267, "learning_rate": 1.0005533079736037e-06, "loss": 0.34157663583755493, "num_tokens": 557624997.0, "step": 618 }, { "epoch": 4.619402985074627, "grad_norm": 0.2434497947580223, "learning_rate": 1.00024591745474e-06, "loss": 0.35940393805503845, "num_tokens": 558551462.0, "step": 619 }, { "epoch": 4.6268656716417915, "grad_norm": 0.2334659825308566, "learning_rate": 1.0000614797836587e-06, "loss": 0.3954239785671234, "num_tokens": 559571713.0, "step": 620 }, { "epoch": 4.6268656716417915, "step": 620, "total_flos": 829937030004736.0, "train_loss": 0.4202386662844689, "train_runtime": 18585.0074, "train_samples_per_second": 1.068, "train_steps_per_second": 0.033 } ], "logging_steps": 1, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 829937030004736.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }