{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2076, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007228044813877846, "grad_norm": 2.43762469291687, "learning_rate": 3.846153846153847e-07, "loss": 0.2014, "step": 5 }, { "epoch": 0.014456089627755691, "grad_norm": 0.7071607112884521, "learning_rate": 8.653846153846154e-07, "loss": 0.1441, "step": 10 }, { "epoch": 0.02168413444163354, "grad_norm": 0.25739526748657227, "learning_rate": 1.3461538461538462e-06, "loss": 0.0935, "step": 15 }, { "epoch": 0.028912179255511383, "grad_norm": 0.1548013836145401, "learning_rate": 1.826923076923077e-06, "loss": 0.0841, "step": 20 }, { "epoch": 0.03614022406938923, "grad_norm": 0.1367483139038086, "learning_rate": 2.307692307692308e-06, "loss": 0.0844, "step": 25 }, { "epoch": 0.04336826888326708, "grad_norm": 0.012530342675745487, "learning_rate": 2.7884615384615386e-06, "loss": 0.0809, "step": 30 }, { "epoch": 0.05059631369714492, "grad_norm": 0.049705736339092255, "learning_rate": 3.2692307692307696e-06, "loss": 0.0819, "step": 35 }, { "epoch": 0.057824358511022765, "grad_norm": 0.051718585193157196, "learning_rate": 3.7500000000000005e-06, "loss": 0.0803, "step": 40 }, { "epoch": 0.06505240332490062, "grad_norm": 0.01605582982301712, "learning_rate": 4.230769230769231e-06, "loss": 0.0813, "step": 45 }, { "epoch": 0.07228044813877846, "grad_norm": 0.06171448528766632, "learning_rate": 4.711538461538462e-06, "loss": 0.0801, "step": 50 }, { "epoch": 0.0795084929526563, "grad_norm": 0.05586954951286316, "learning_rate": 5.192307692307693e-06, "loss": 0.0811, "step": 55 }, { "epoch": 0.08673653776653416, "grad_norm": 0.028248045593500137, "learning_rate": 5.6730769230769235e-06, "loss": 0.0806, "step": 60 }, { "epoch": 0.093964582580412, "grad_norm": 0.04312776029109955, "learning_rate": 6.153846153846155e-06, "loss": 0.0808, "step": 65 }, { "epoch": 0.10119262739428984, "grad_norm": 0.10684467852115631, "learning_rate": 6.6346153846153846e-06, "loss": 0.0808, "step": 70 }, { "epoch": 0.10842067220816769, "grad_norm": 0.07184753566980362, "learning_rate": 7.115384615384616e-06, "loss": 0.0805, "step": 75 }, { "epoch": 0.11564871702204553, "grad_norm": 0.09277470409870148, "learning_rate": 7.5961538461538465e-06, "loss": 0.0807, "step": 80 }, { "epoch": 0.12287676183592339, "grad_norm": 0.02109931781888008, "learning_rate": 8.076923076923077e-06, "loss": 0.0809, "step": 85 }, { "epoch": 0.13010480664980123, "grad_norm": 0.03073902055621147, "learning_rate": 8.557692307692308e-06, "loss": 0.0812, "step": 90 }, { "epoch": 0.13733285146367907, "grad_norm": 0.06802671402692795, "learning_rate": 9.03846153846154e-06, "loss": 0.08, "step": 95 }, { "epoch": 0.14456089627755692, "grad_norm": 0.03209488093852997, "learning_rate": 9.51923076923077e-06, "loss": 0.08, "step": 100 }, { "epoch": 0.15178894109143476, "grad_norm": 0.07769843190908432, "learning_rate": 1e-05, "loss": 0.0817, "step": 105 }, { "epoch": 0.1590169859053126, "grad_norm": 0.06089721992611885, "learning_rate": 1.0480769230769232e-05, "loss": 0.0805, "step": 110 }, { "epoch": 0.16624503071919045, "grad_norm": 0.021150365471839905, "learning_rate": 1.0961538461538464e-05, "loss": 0.0797, "step": 115 }, { "epoch": 0.17347307553306832, "grad_norm": 0.035523343831300735, "learning_rate": 1.1442307692307693e-05, "loss": 0.0808, "step": 120 }, { "epoch": 0.18070112034694616, "grad_norm": 0.019112691283226013, "learning_rate": 1.1923076923076925e-05, "loss": 0.0805, "step": 125 }, { "epoch": 0.187929165160824, "grad_norm": 0.23690395057201385, "learning_rate": 1.2403846153846156e-05, "loss": 0.0836, "step": 130 }, { "epoch": 0.19515720997470185, "grad_norm": 0.08766212314367294, "learning_rate": 1.2884615384615386e-05, "loss": 0.0802, "step": 135 }, { "epoch": 0.2023852547885797, "grad_norm": 0.04037011042237282, "learning_rate": 1.3365384615384615e-05, "loss": 0.0811, "step": 140 }, { "epoch": 0.20961329960245753, "grad_norm": 0.15161241590976715, "learning_rate": 1.3846153846153847e-05, "loss": 0.0829, "step": 145 }, { "epoch": 0.21684134441633537, "grad_norm": 0.09708157926797867, "learning_rate": 1.4326923076923078e-05, "loss": 0.0816, "step": 150 }, { "epoch": 0.22406938923021322, "grad_norm": 0.09547246992588043, "learning_rate": 1.480769230769231e-05, "loss": 0.0801, "step": 155 }, { "epoch": 0.23129743404409106, "grad_norm": 0.015538723208010197, "learning_rate": 1.528846153846154e-05, "loss": 0.0813, "step": 160 }, { "epoch": 0.23852547885796893, "grad_norm": 0.053264446556568146, "learning_rate": 1.576923076923077e-05, "loss": 0.0811, "step": 165 }, { "epoch": 0.24575352367184677, "grad_norm": 0.0034629153087735176, "learning_rate": 1.6250000000000002e-05, "loss": 0.081, "step": 170 }, { "epoch": 0.2529815684857246, "grad_norm": 0.06591220200061798, "learning_rate": 1.673076923076923e-05, "loss": 0.0793, "step": 175 }, { "epoch": 0.26020961329960246, "grad_norm": 0.02226085402071476, "learning_rate": 1.7211538461538465e-05, "loss": 0.0811, "step": 180 }, { "epoch": 0.2674376581134803, "grad_norm": 0.031285785138607025, "learning_rate": 1.7692307692307694e-05, "loss": 0.0819, "step": 185 }, { "epoch": 0.27466570292735815, "grad_norm": 0.014217695221304893, "learning_rate": 1.8173076923076924e-05, "loss": 0.0797, "step": 190 }, { "epoch": 0.281893747741236, "grad_norm": 0.061094146221876144, "learning_rate": 1.8653846153846157e-05, "loss": 0.0792, "step": 195 }, { "epoch": 0.28912179255511383, "grad_norm": 0.1535295695066452, "learning_rate": 1.9134615384615387e-05, "loss": 0.0948, "step": 200 }, { "epoch": 0.2963498373689917, "grad_norm": 0.15612953901290894, "learning_rate": 1.9615384615384617e-05, "loss": 0.0895, "step": 205 }, { "epoch": 0.3035778821828695, "grad_norm": 0.015299557708203793, "learning_rate": 1.999998585783488e-05, "loss": 0.0818, "step": 210 }, { "epoch": 0.31080592699674736, "grad_norm": 0.07040851563215256, "learning_rate": 1.9999490886255767e-05, "loss": 0.0812, "step": 215 }, { "epoch": 0.3180339718106252, "grad_norm": 0.008683345280587673, "learning_rate": 1.999828884642042e-05, "loss": 0.0814, "step": 220 }, { "epoch": 0.32526201662450305, "grad_norm": 0.10729002952575684, "learning_rate": 1.9996379823325586e-05, "loss": 0.0806, "step": 225 }, { "epoch": 0.3324900614383809, "grad_norm": 0.07564987987279892, "learning_rate": 1.9993763951959107e-05, "loss": 0.0815, "step": 230 }, { "epoch": 0.3397181062522588, "grad_norm": 0.10431886464357376, "learning_rate": 1.9990441417290358e-05, "loss": 0.0846, "step": 235 }, { "epoch": 0.34694615106613663, "grad_norm": 0.11196550726890564, "learning_rate": 1.9986412454257178e-05, "loss": 0.0849, "step": 240 }, { "epoch": 0.3541741958800145, "grad_norm": 0.04622909799218178, "learning_rate": 1.998167734774926e-05, "loss": 0.0821, "step": 245 }, { "epoch": 0.3614022406938923, "grad_norm": 0.06048440560698509, "learning_rate": 1.9976236432588002e-05, "loss": 0.0808, "step": 250 }, { "epoch": 0.36863028550777016, "grad_norm": 0.01802447997033596, "learning_rate": 1.997009009350283e-05, "loss": 0.0805, "step": 255 }, { "epoch": 0.375858330321648, "grad_norm": 0.04636721312999725, "learning_rate": 1.996323876510399e-05, "loss": 0.081, "step": 260 }, { "epoch": 0.38308637513552585, "grad_norm": 0.062323447316884995, "learning_rate": 1.9955682931851835e-05, "loss": 0.083, "step": 265 }, { "epoch": 0.3903144199494037, "grad_norm": 0.0894196629524231, "learning_rate": 1.994742312802255e-05, "loss": 0.0815, "step": 270 }, { "epoch": 0.39754246476328153, "grad_norm": 0.0037153863813728094, "learning_rate": 1.993845993767038e-05, "loss": 0.0801, "step": 275 }, { "epoch": 0.4047705095771594, "grad_norm": 0.14254964888095856, "learning_rate": 1.9928793994586323e-05, "loss": 0.0857, "step": 280 }, { "epoch": 0.4119985543910372, "grad_norm": 0.08120843023061752, "learning_rate": 1.9918425982253335e-05, "loss": 0.0822, "step": 285 }, { "epoch": 0.41922659920491506, "grad_norm": 0.029054520651698112, "learning_rate": 1.9907356633797978e-05, "loss": 0.0813, "step": 290 }, { "epoch": 0.4264546440187929, "grad_norm": 0.03537634015083313, "learning_rate": 1.9895586731938593e-05, "loss": 0.0812, "step": 295 }, { "epoch": 0.43368268883267075, "grad_norm": 0.015365133993327618, "learning_rate": 1.9883117108929947e-05, "loss": 0.0825, "step": 300 }, { "epoch": 0.4409107336465486, "grad_norm": 0.029979810118675232, "learning_rate": 1.986994864650439e-05, "loss": 0.0821, "step": 305 }, { "epoch": 0.44813877846042643, "grad_norm": 0.021777283400297165, "learning_rate": 1.9856082275809508e-05, "loss": 0.0817, "step": 310 }, { "epoch": 0.4553668232743043, "grad_norm": 0.005012670066207647, "learning_rate": 1.9841518977342274e-05, "loss": 0.081, "step": 315 }, { "epoch": 0.4625948680881821, "grad_norm": 0.07129844278097153, "learning_rate": 1.9826259780879716e-05, "loss": 0.081, "step": 320 }, { "epoch": 0.46982291290206, "grad_norm": 0.07923093438148499, "learning_rate": 1.981030576540612e-05, "loss": 0.0806, "step": 325 }, { "epoch": 0.47705095771593786, "grad_norm": 0.04692668095231056, "learning_rate": 1.9793658059036697e-05, "loss": 0.0799, "step": 330 }, { "epoch": 0.4842790025298157, "grad_norm": 0.04923313483595848, "learning_rate": 1.977631783893786e-05, "loss": 0.0813, "step": 335 }, { "epoch": 0.49150704734369355, "grad_norm": 0.04896867647767067, "learning_rate": 1.975828633124394e-05, "loss": 0.0805, "step": 340 }, { "epoch": 0.4987350921575714, "grad_norm": 0.004160281270742416, "learning_rate": 1.9739564810970534e-05, "loss": 0.0804, "step": 345 }, { "epoch": 0.5059631369714492, "grad_norm": 0.07328899949789047, "learning_rate": 1.9720154601924295e-05, "loss": 0.0802, "step": 350 }, { "epoch": 0.5131911817853271, "grad_norm": 0.007687863428145647, "learning_rate": 1.9700057076609377e-05, "loss": 0.08, "step": 355 }, { "epoch": 0.5204192265992049, "grad_norm": 0.011853563599288464, "learning_rate": 1.967927365613034e-05, "loss": 0.0802, "step": 360 }, { "epoch": 0.5276472714130828, "grad_norm": 0.007507723290473223, "learning_rate": 1.96578058100917e-05, "loss": 0.0799, "step": 365 }, { "epoch": 0.5348753162269606, "grad_norm": 0.0316060446202755, "learning_rate": 1.963565505649398e-05, "loss": 0.081, "step": 370 }, { "epoch": 0.5421033610408384, "grad_norm": 0.03697923943400383, "learning_rate": 1.961282296162639e-05, "loss": 0.0799, "step": 375 }, { "epoch": 0.5493314058547163, "grad_norm": 0.007007018197327852, "learning_rate": 1.9589311139956086e-05, "loss": 0.0855, "step": 380 }, { "epoch": 0.5565594506685941, "grad_norm": 0.024093549698591232, "learning_rate": 1.956512125401398e-05, "loss": 0.0809, "step": 385 }, { "epoch": 0.563787495482472, "grad_norm": 0.007132918573915958, "learning_rate": 1.9540255014277198e-05, "loss": 0.0796, "step": 390 }, { "epoch": 0.5710155402963498, "grad_norm": 0.055107131600379944, "learning_rate": 1.9514714179048138e-05, "loss": 0.0792, "step": 395 }, { "epoch": 0.5782435851102277, "grad_norm": 0.036154747009277344, "learning_rate": 1.9488500554330126e-05, "loss": 0.0791, "step": 400 }, { "epoch": 0.5854716299241055, "grad_norm": 0.02283984236419201, "learning_rate": 1.946161599369973e-05, "loss": 0.0794, "step": 405 }, { "epoch": 0.5926996747379834, "grad_norm": 0.08648855239152908, "learning_rate": 1.9434062398175667e-05, "loss": 0.0776, "step": 410 }, { "epoch": 0.5999277195518612, "grad_norm": 0.042574405670166016, "learning_rate": 1.9405841716084403e-05, "loss": 0.0772, "step": 415 }, { "epoch": 0.607155764365739, "grad_norm": 0.035125792026519775, "learning_rate": 1.937695594292238e-05, "loss": 0.0765, "step": 420 }, { "epoch": 0.6143838091796169, "grad_norm": 0.05990980565547943, "learning_rate": 1.9347407121214917e-05, "loss": 0.075, "step": 425 }, { "epoch": 0.6216118539934947, "grad_norm": 0.13196605443954468, "learning_rate": 1.9317197340371764e-05, "loss": 0.0706, "step": 430 }, { "epoch": 0.6288398988073726, "grad_norm": 0.04624694585800171, "learning_rate": 1.9286328736539385e-05, "loss": 0.0827, "step": 435 }, { "epoch": 0.6360679436212504, "grad_norm": 0.03560846298933029, "learning_rate": 1.9254803492449894e-05, "loss": 0.0815, "step": 440 }, { "epoch": 0.6432959884351283, "grad_norm": 0.0264581311494112, "learning_rate": 1.922262383726672e-05, "loss": 0.0816, "step": 445 }, { "epoch": 0.6505240332490061, "grad_norm": 0.013737207278609276, "learning_rate": 1.9189792046426972e-05, "loss": 0.0795, "step": 450 }, { "epoch": 0.6577520780628839, "grad_norm": 0.008679666556417942, "learning_rate": 1.9156310441480557e-05, "loss": 0.0795, "step": 455 }, { "epoch": 0.6649801228767618, "grad_norm": 0.011995796114206314, "learning_rate": 1.912218138992601e-05, "loss": 0.0772, "step": 460 }, { "epoch": 0.6722081676906397, "grad_norm": 0.025770675390958786, "learning_rate": 1.9087407305043085e-05, "loss": 0.0732, "step": 465 }, { "epoch": 0.6794362125045176, "grad_norm": 0.11803118139505386, "learning_rate": 1.9051990645722133e-05, "loss": 0.0694, "step": 470 }, { "epoch": 0.6866642573183954, "grad_norm": 0.0760372206568718, "learning_rate": 1.9015933916290202e-05, "loss": 0.0676, "step": 475 }, { "epoch": 0.6938923021322733, "grad_norm": 0.03851527348160744, "learning_rate": 1.8979239666333975e-05, "loss": 0.0635, "step": 480 }, { "epoch": 0.7011203469461511, "grad_norm": 0.08836951106786728, "learning_rate": 1.8941910490519483e-05, "loss": 0.0636, "step": 485 }, { "epoch": 0.708348391760029, "grad_norm": 0.06829584389925003, "learning_rate": 1.8903949028408636e-05, "loss": 0.0648, "step": 490 }, { "epoch": 0.7155764365739068, "grad_norm": 0.02876531518995762, "learning_rate": 1.8865357964272576e-05, "loss": 0.062, "step": 495 }, { "epoch": 0.7228044813877846, "grad_norm": 0.039769161492586136, "learning_rate": 1.8826140026901873e-05, "loss": 0.0607, "step": 500 }, { "epoch": 0.7228044813877846, "eval_loss": 0.07328393310308456, "eval_runtime": 1144.2594, "eval_samples_per_second": 56.111, "eval_steps_per_second": 1.754, "step": 500 }, { "epoch": 0.7300325262016625, "grad_norm": 0.04059358313679695, "learning_rate": 1.878629798941357e-05, "loss": 0.0631, "step": 505 }, { "epoch": 0.7372605710155403, "grad_norm": 0.11944068223237991, "learning_rate": 1.8745834669055085e-05, "loss": 0.064, "step": 510 }, { "epoch": 0.7444886158294182, "grad_norm": 0.04337216168642044, "learning_rate": 1.8704752927005034e-05, "loss": 0.0618, "step": 515 }, { "epoch": 0.751716660643296, "grad_norm": 0.04100070148706436, "learning_rate": 1.8663055668170873e-05, "loss": 0.0613, "step": 520 }, { "epoch": 0.7589447054571739, "grad_norm": 0.0634031817317009, "learning_rate": 1.8620745840983522e-05, "loss": 0.0589, "step": 525 }, { "epoch": 0.7661727502710517, "grad_norm": 0.08053874224424362, "learning_rate": 1.857782643718887e-05, "loss": 0.0578, "step": 530 }, { "epoch": 0.7734007950849295, "grad_norm": 0.037648145109415054, "learning_rate": 1.8534300491636225e-05, "loss": 0.0561, "step": 535 }, { "epoch": 0.7806288398988074, "grad_norm": 0.07604615390300751, "learning_rate": 1.849017108206372e-05, "loss": 0.0615, "step": 540 }, { "epoch": 0.7878568847126852, "grad_norm": 0.07877160608768463, "learning_rate": 1.844544132888068e-05, "loss": 0.0613, "step": 545 }, { "epoch": 0.7950849295265631, "grad_norm": 0.09688904136419296, "learning_rate": 1.8400114394947003e-05, "loss": 0.0546, "step": 550 }, { "epoch": 0.8023129743404409, "grad_norm": 0.0808292031288147, "learning_rate": 1.8354193485349468e-05, "loss": 0.0521, "step": 555 }, { "epoch": 0.8095410191543188, "grad_norm": 0.05046294629573822, "learning_rate": 1.830768184717514e-05, "loss": 0.0581, "step": 560 }, { "epoch": 0.8167690639681966, "grad_norm": 0.045729391276836395, "learning_rate": 1.8260582769281747e-05, "loss": 0.0596, "step": 565 }, { "epoch": 0.8239971087820744, "grad_norm": 0.06313765794038773, "learning_rate": 1.821289958206513e-05, "loss": 0.0588, "step": 570 }, { "epoch": 0.8312251535959523, "grad_norm": 0.04902196675539017, "learning_rate": 1.8164635657223755e-05, "loss": 0.0584, "step": 575 }, { "epoch": 0.8384531984098301, "grad_norm": 0.08288609981536865, "learning_rate": 1.8115794407520287e-05, "loss": 0.0558, "step": 580 }, { "epoch": 0.845681243223708, "grad_norm": 0.04202403128147125, "learning_rate": 1.8066379286540278e-05, "loss": 0.0524, "step": 585 }, { "epoch": 0.8529092880375858, "grad_norm": 0.046127066016197205, "learning_rate": 1.8016393788447964e-05, "loss": 0.0549, "step": 590 }, { "epoch": 0.8601373328514637, "grad_norm": 0.04576544463634491, "learning_rate": 1.7965841447739185e-05, "loss": 0.0491, "step": 595 }, { "epoch": 0.8673653776653415, "grad_norm": 0.03939468041062355, "learning_rate": 1.7914725838991472e-05, "loss": 0.0543, "step": 600 }, { "epoch": 0.8745934224792193, "grad_norm": 0.03499499708414078, "learning_rate": 1.7863050576611267e-05, "loss": 0.051, "step": 605 }, { "epoch": 0.8818214672930972, "grad_norm": 0.04491008073091507, "learning_rate": 1.781081931457837e-05, "loss": 0.0553, "step": 610 }, { "epoch": 0.889049512106975, "grad_norm": 0.06234387680888176, "learning_rate": 1.7758035746187553e-05, "loss": 0.0517, "step": 615 }, { "epoch": 0.8962775569208529, "grad_norm": 0.04003310948610306, "learning_rate": 1.770470360378739e-05, "loss": 0.0545, "step": 620 }, { "epoch": 0.9035056017347307, "grad_norm": 0.08948640525341034, "learning_rate": 1.7650826658516375e-05, "loss": 0.0548, "step": 625 }, { "epoch": 0.9107336465486086, "grad_norm": 0.04328719154000282, "learning_rate": 1.7596408720036232e-05, "loss": 0.0551, "step": 630 }, { "epoch": 0.9179616913624864, "grad_norm": 0.03560628369450569, "learning_rate": 1.754145363626256e-05, "loss": 0.05, "step": 635 }, { "epoch": 0.9251897361763642, "grad_norm": 0.04258381202816963, "learning_rate": 1.748596529309271e-05, "loss": 0.0503, "step": 640 }, { "epoch": 0.9324177809902421, "grad_norm": 0.07129397243261337, "learning_rate": 1.742994761413105e-05, "loss": 0.0488, "step": 645 }, { "epoch": 0.93964582580412, "grad_norm": 0.03454764559864998, "learning_rate": 1.73734045604115e-05, "loss": 0.0553, "step": 650 }, { "epoch": 0.9468738706179979, "grad_norm": 0.053935691714286804, "learning_rate": 1.731634013011745e-05, "loss": 0.0499, "step": 655 }, { "epoch": 0.9541019154318757, "grad_norm": 0.08872876316308975, "learning_rate": 1.7258758358299053e-05, "loss": 0.0521, "step": 660 }, { "epoch": 0.9613299602457536, "grad_norm": 0.057639699429273605, "learning_rate": 1.7200663316587897e-05, "loss": 0.0505, "step": 665 }, { "epoch": 0.9685580050596314, "grad_norm": 0.08045148104429245, "learning_rate": 1.7142059112909107e-05, "loss": 0.0537, "step": 670 }, { "epoch": 0.9757860498735093, "grad_norm": 0.05597732216119766, "learning_rate": 1.708294989119087e-05, "loss": 0.0466, "step": 675 }, { "epoch": 0.9830140946873871, "grad_norm": 0.07576154917478561, "learning_rate": 1.7023339831071408e-05, "loss": 0.0527, "step": 680 }, { "epoch": 0.9902421395012649, "grad_norm": 0.03821377828717232, "learning_rate": 1.696323314760344e-05, "loss": 0.0483, "step": 685 }, { "epoch": 0.9974701843151428, "grad_norm": 0.07789347320795059, "learning_rate": 1.690263409095614e-05, "loss": 0.0483, "step": 690 }, { "epoch": 1.0043368268883268, "grad_norm": 0.03913086652755737, "learning_rate": 1.6841546946114586e-05, "loss": 0.0417, "step": 695 }, { "epoch": 1.0115648717022045, "grad_norm": 0.054379936307668686, "learning_rate": 1.6779976032576792e-05, "loss": 0.0336, "step": 700 }, { "epoch": 1.0187929165160825, "grad_norm": 0.08715476840734482, "learning_rate": 1.6717925704048256e-05, "loss": 0.0416, "step": 705 }, { "epoch": 1.0260209613299602, "grad_norm": 0.0766800120472908, "learning_rate": 1.6655400348134122e-05, "loss": 0.0404, "step": 710 }, { "epoch": 1.0332490061438382, "grad_norm": 0.06571623682975769, "learning_rate": 1.659240438602891e-05, "loss": 0.0431, "step": 715 }, { "epoch": 1.040477050957716, "grad_norm": 0.09106060862541199, "learning_rate": 1.6528942272203912e-05, "loss": 0.0419, "step": 720 }, { "epoch": 1.0477050957715939, "grad_norm": 0.0675068348646164, "learning_rate": 1.6465018494092213e-05, "loss": 0.0401, "step": 725 }, { "epoch": 1.0549331405854716, "grad_norm": 0.06592784821987152, "learning_rate": 1.6400637571771354e-05, "loss": 0.0383, "step": 730 }, { "epoch": 1.0621611853993496, "grad_norm": 0.08610466867685318, "learning_rate": 1.633580405764376e-05, "loss": 0.0355, "step": 735 }, { "epoch": 1.0693892302132273, "grad_norm": 0.09420937299728394, "learning_rate": 1.6270522536114813e-05, "loss": 0.0393, "step": 740 }, { "epoch": 1.0766172750271052, "grad_norm": 0.066034696996212, "learning_rate": 1.6204797623268675e-05, "loss": 0.0353, "step": 745 }, { "epoch": 1.083845319840983, "grad_norm": 0.05469588562846184, "learning_rate": 1.6138633966541905e-05, "loss": 0.0395, "step": 750 }, { "epoch": 1.091073364654861, "grad_norm": 0.05333936959505081, "learning_rate": 1.6072036244394836e-05, "loss": 0.0409, "step": 755 }, { "epoch": 1.0983014094687387, "grad_norm": 0.06300196051597595, "learning_rate": 1.600500916598074e-05, "loss": 0.0382, "step": 760 }, { "epoch": 1.1055294542826166, "grad_norm": 0.08063532412052155, "learning_rate": 1.5937557470812852e-05, "loss": 0.0369, "step": 765 }, { "epoch": 1.1127574990964944, "grad_norm": 0.07369716465473175, "learning_rate": 1.5869685928429253e-05, "loss": 0.0393, "step": 770 }, { "epoch": 1.1199855439103723, "grad_norm": 0.058140210807323456, "learning_rate": 1.5801399338055584e-05, "loss": 0.0352, "step": 775 }, { "epoch": 1.12721358872425, "grad_norm": 0.060627613216638565, "learning_rate": 1.5732702528265716e-05, "loss": 0.0381, "step": 780 }, { "epoch": 1.134441633538128, "grad_norm": 0.07465813308954239, "learning_rate": 1.5663600356640306e-05, "loss": 0.0367, "step": 785 }, { "epoch": 1.1416696783520057, "grad_norm": 0.047345198690891266, "learning_rate": 1.5594097709423316e-05, "loss": 0.0389, "step": 790 }, { "epoch": 1.1488977231658837, "grad_norm": 0.06834863871335983, "learning_rate": 1.552419950117651e-05, "loss": 0.0381, "step": 795 }, { "epoch": 1.1561257679797614, "grad_norm": 0.047312233597040176, "learning_rate": 1.545391067443194e-05, "loss": 0.0343, "step": 800 }, { "epoch": 1.1633538127936394, "grad_norm": 0.06944846361875534, "learning_rate": 1.538323619934247e-05, "loss": 0.0389, "step": 805 }, { "epoch": 1.170581857607517, "grad_norm": 0.04954347014427185, "learning_rate": 1.5312181073330295e-05, "loss": 0.0366, "step": 810 }, { "epoch": 1.177809902421395, "grad_norm": 0.06755795329809189, "learning_rate": 1.524075032073363e-05, "loss": 0.0373, "step": 815 }, { "epoch": 1.1850379472352728, "grad_norm": 0.08281169086694717, "learning_rate": 1.5168948992451382e-05, "loss": 0.0356, "step": 820 }, { "epoch": 1.1922659920491507, "grad_norm": 0.08935344219207764, "learning_rate": 1.5096782165586037e-05, "loss": 0.0365, "step": 825 }, { "epoch": 1.1994940368630285, "grad_norm": 0.04098968952894211, "learning_rate": 1.5024254943084629e-05, "loss": 0.0381, "step": 830 }, { "epoch": 1.2067220816769064, "grad_norm": 0.05055451765656471, "learning_rate": 1.495137245337794e-05, "loss": 0.0367, "step": 835 }, { "epoch": 1.2139501264907842, "grad_norm": 0.06718173623085022, "learning_rate": 1.487813985001782e-05, "loss": 0.0356, "step": 840 }, { "epoch": 1.221178171304662, "grad_norm": 0.05843829736113548, "learning_rate": 1.480456231131283e-05, "loss": 0.0352, "step": 845 }, { "epoch": 1.2284062161185398, "grad_norm": 0.052432768046855927, "learning_rate": 1.4730645039962044e-05, "loss": 0.0364, "step": 850 }, { "epoch": 1.2356342609324178, "grad_norm": 0.05346972495317459, "learning_rate": 1.4656393262687172e-05, "loss": 0.0368, "step": 855 }, { "epoch": 1.2428623057462955, "grad_norm": 0.06525395065546036, "learning_rate": 1.4581812229862993e-05, "loss": 0.0338, "step": 860 }, { "epoch": 1.2500903505601735, "grad_norm": 0.07090573757886887, "learning_rate": 1.4506907215146075e-05, "loss": 0.0328, "step": 865 }, { "epoch": 1.2573183953740514, "grad_norm": 0.05279651656746864, "learning_rate": 1.443168351510189e-05, "loss": 0.0348, "step": 870 }, { "epoch": 1.2645464401879292, "grad_norm": 0.05886390060186386, "learning_rate": 1.4356146448830277e-05, "loss": 0.0341, "step": 875 }, { "epoch": 1.271774485001807, "grad_norm": 0.0537516325712204, "learning_rate": 1.4280301357589349e-05, "loss": 0.0379, "step": 880 }, { "epoch": 1.2790025298156849, "grad_norm": 0.049141135066747665, "learning_rate": 1.4204153604417775e-05, "loss": 0.0339, "step": 885 }, { "epoch": 1.2862305746295628, "grad_norm": 0.05724327638745308, "learning_rate": 1.4127708573755599e-05, "loss": 0.0317, "step": 890 }, { "epoch": 1.2934586194434405, "grad_norm": 0.05847681313753128, "learning_rate": 1.4050971671063464e-05, "loss": 0.0341, "step": 895 }, { "epoch": 1.3006866642573183, "grad_norm": 0.04777985066175461, "learning_rate": 1.3973948322440427e-05, "loss": 0.0388, "step": 900 }, { "epoch": 1.3079147090711962, "grad_norm": 0.062013089656829834, "learning_rate": 1.3896643974240245e-05, "loss": 0.0309, "step": 905 }, { "epoch": 1.3151427538850742, "grad_norm": 0.08561990410089493, "learning_rate": 1.3819064092686278e-05, "loss": 0.0327, "step": 910 }, { "epoch": 1.322370798698952, "grad_norm": 0.08605846017599106, "learning_rate": 1.3741214163484968e-05, "loss": 0.0345, "step": 915 }, { "epoch": 1.3295988435128296, "grad_norm": 0.06043161824345589, "learning_rate": 1.3663099691437945e-05, "loss": 0.0336, "step": 920 }, { "epoch": 1.3368268883267076, "grad_norm": 0.04175262525677681, "learning_rate": 1.3584726200052767e-05, "loss": 0.038, "step": 925 }, { "epoch": 1.3440549331405856, "grad_norm": 0.04762093350291252, "learning_rate": 1.3506099231152366e-05, "loss": 0.0346, "step": 930 }, { "epoch": 1.3512829779544633, "grad_norm": 0.06360676139593124, "learning_rate": 1.3427224344483178e-05, "loss": 0.0296, "step": 935 }, { "epoch": 1.3585110227683412, "grad_norm": 0.06761486828327179, "learning_rate": 1.3348107117322004e-05, "loss": 0.0309, "step": 940 }, { "epoch": 1.365739067582219, "grad_norm": 0.06754028797149658, "learning_rate": 1.3268753144081652e-05, "loss": 0.028, "step": 945 }, { "epoch": 1.372967112396097, "grad_norm": 0.06639332324266434, "learning_rate": 1.3189168035915337e-05, "loss": 0.0331, "step": 950 }, { "epoch": 1.3801951572099747, "grad_norm": 0.05263343080878258, "learning_rate": 1.3109357420319933e-05, "loss": 0.031, "step": 955 }, { "epoch": 1.3874232020238526, "grad_norm": 0.07213468849658966, "learning_rate": 1.3029326940738032e-05, "loss": 0.0338, "step": 960 }, { "epoch": 1.3946512468377303, "grad_norm": 0.05976350978016853, "learning_rate": 1.2949082256158904e-05, "loss": 0.0313, "step": 965 }, { "epoch": 1.4018792916516083, "grad_norm": 0.054479606449604034, "learning_rate": 1.286862904071835e-05, "loss": 0.0324, "step": 970 }, { "epoch": 1.409107336465486, "grad_norm": 0.07411843538284302, "learning_rate": 1.2787972983297472e-05, "loss": 0.0312, "step": 975 }, { "epoch": 1.416335381279364, "grad_norm": 0.05356777831912041, "learning_rate": 1.2707119787120417e-05, "loss": 0.0347, "step": 980 }, { "epoch": 1.4235634260932417, "grad_norm": 0.05905517190694809, "learning_rate": 1.26260751693511e-05, "loss": 0.0317, "step": 985 }, { "epoch": 1.4307914709071197, "grad_norm": 0.07836019992828369, "learning_rate": 1.254484486068893e-05, "loss": 0.0316, "step": 990 }, { "epoch": 1.4380195157209974, "grad_norm": 0.06900329887866974, "learning_rate": 1.24634346049636e-05, "loss": 0.0324, "step": 995 }, { "epoch": 1.4452475605348754, "grad_norm": 0.05929545313119888, "learning_rate": 1.2381850158728952e-05, "loss": 0.029, "step": 1000 }, { "epoch": 1.4452475605348754, "eval_loss": 0.08190815895795822, "eval_runtime": 1141.9961, "eval_samples_per_second": 56.223, "eval_steps_per_second": 1.757, "step": 1000 }, { "epoch": 1.452475605348753, "grad_norm": 0.0604124590754509, "learning_rate": 1.2300097290855887e-05, "loss": 0.0285, "step": 1005 }, { "epoch": 1.459703650162631, "grad_norm": 0.06895657628774643, "learning_rate": 1.2218181782124496e-05, "loss": 0.0316, "step": 1010 }, { "epoch": 1.466931694976509, "grad_norm": 0.048645876348018646, "learning_rate": 1.2136109424815258e-05, "loss": 0.0306, "step": 1015 }, { "epoch": 1.4741597397903867, "grad_norm": 0.06193140521645546, "learning_rate": 1.205388602229949e-05, "loss": 0.03, "step": 1020 }, { "epoch": 1.4813877846042645, "grad_norm": 0.07050759345293045, "learning_rate": 1.1971517388628972e-05, "loss": 0.0341, "step": 1025 }, { "epoch": 1.4886158294181424, "grad_norm": 0.05533516779541969, "learning_rate": 1.1889009348124857e-05, "loss": 0.0303, "step": 1030 }, { "epoch": 1.4958438742320204, "grad_norm": 0.04415017366409302, "learning_rate": 1.180636773496579e-05, "loss": 0.0276, "step": 1035 }, { "epoch": 1.503071919045898, "grad_norm": 0.059612423181533813, "learning_rate": 1.1723598392775415e-05, "loss": 0.0273, "step": 1040 }, { "epoch": 1.5102999638597758, "grad_norm": 0.06513796001672745, "learning_rate": 1.1640707174209147e-05, "loss": 0.0299, "step": 1045 }, { "epoch": 1.5175280086736538, "grad_norm": 0.056087836623191833, "learning_rate": 1.1557699940540321e-05, "loss": 0.0308, "step": 1050 }, { "epoch": 1.5247560534875317, "grad_norm": 0.06388755887746811, "learning_rate": 1.1474582561245767e-05, "loss": 0.0278, "step": 1055 }, { "epoch": 1.5319840983014095, "grad_norm": 0.06793609261512756, "learning_rate": 1.1391360913590736e-05, "loss": 0.0342, "step": 1060 }, { "epoch": 1.5392121431152872, "grad_norm": 0.04267344996333122, "learning_rate": 1.1308040882213363e-05, "loss": 0.03, "step": 1065 }, { "epoch": 1.5464401879291652, "grad_norm": 0.06034848093986511, "learning_rate": 1.122462835870852e-05, "loss": 0.0279, "step": 1070 }, { "epoch": 1.553668232743043, "grad_norm": 0.06860997527837753, "learning_rate": 1.1141129241211246e-05, "loss": 0.0263, "step": 1075 }, { "epoch": 1.5608962775569208, "grad_norm": 0.04508688300848007, "learning_rate": 1.1057549433979675e-05, "loss": 0.0365, "step": 1080 }, { "epoch": 1.5681243223707986, "grad_norm": 0.04600263386964798, "learning_rate": 1.0973894846977548e-05, "loss": 0.0286, "step": 1085 }, { "epoch": 1.5753523671846765, "grad_norm": 0.05820371210575104, "learning_rate": 1.089017139545631e-05, "loss": 0.0264, "step": 1090 }, { "epoch": 1.5825804119985545, "grad_norm": 0.0669277012348175, "learning_rate": 1.0806384999536857e-05, "loss": 0.028, "step": 1095 }, { "epoch": 1.5898084568124322, "grad_norm": 0.05904907360672951, "learning_rate": 1.0722541583790898e-05, "loss": 0.0247, "step": 1100 }, { "epoch": 1.59703650162631, "grad_norm": 0.05929577723145485, "learning_rate": 1.0638647076822041e-05, "loss": 0.0305, "step": 1105 }, { "epoch": 1.604264546440188, "grad_norm": 0.058572858572006226, "learning_rate": 1.0554707410846585e-05, "loss": 0.0294, "step": 1110 }, { "epoch": 1.6114925912540659, "grad_norm": 0.05682854354381561, "learning_rate": 1.0470728521274028e-05, "loss": 0.028, "step": 1115 }, { "epoch": 1.6187206360679436, "grad_norm": 0.0703597441315651, "learning_rate": 1.0386716346287398e-05, "loss": 0.0278, "step": 1120 }, { "epoch": 1.6259486808818213, "grad_norm": 0.07455068826675415, "learning_rate": 1.030267682642334e-05, "loss": 0.0312, "step": 1125 }, { "epoch": 1.6331767256956993, "grad_norm": 0.06019241735339165, "learning_rate": 1.0218615904152067e-05, "loss": 0.027, "step": 1130 }, { "epoch": 1.6404047705095772, "grad_norm": 0.05638565123081207, "learning_rate": 1.0134539523457172e-05, "loss": 0.0301, "step": 1135 }, { "epoch": 1.647632815323455, "grad_norm": 0.06251167505979538, "learning_rate": 1.0050453629415317e-05, "loss": 0.027, "step": 1140 }, { "epoch": 1.6548608601373327, "grad_norm": 0.08058342337608337, "learning_rate": 9.966364167775851e-06, "loss": 0.0307, "step": 1145 }, { "epoch": 1.6620889049512106, "grad_norm": 0.05652245879173279, "learning_rate": 9.882277084540399e-06, "loss": 0.0267, "step": 1150 }, { "epoch": 1.6693169497650886, "grad_norm": 0.06871891021728516, "learning_rate": 9.798198325542399e-06, "loss": 0.0255, "step": 1155 }, { "epoch": 1.6765449945789666, "grad_norm": 0.07430125027894974, "learning_rate": 9.714133836026687e-06, "loss": 0.0277, "step": 1160 }, { "epoch": 1.6837730393928443, "grad_norm": 0.058816712349653244, "learning_rate": 9.630089560229088e-06, "loss": 0.0248, "step": 1165 }, { "epoch": 1.691001084206722, "grad_norm": 0.06506705284118652, "learning_rate": 9.546071440956115e-06, "loss": 0.0298, "step": 1170 }, { "epoch": 1.6982291290206, "grad_norm": 0.06538432091474533, "learning_rate": 9.46208541916474e-06, "loss": 0.0308, "step": 1175 }, { "epoch": 1.705457173834478, "grad_norm": 0.057376306504011154, "learning_rate": 9.378137433542305e-06, "loss": 0.0293, "step": 1180 }, { "epoch": 1.7126852186483557, "grad_norm": 0.04726172983646393, "learning_rate": 9.294233420086604e-06, "loss": 0.0271, "step": 1185 }, { "epoch": 1.7199132634622334, "grad_norm": 0.05788370966911316, "learning_rate": 9.210379311686129e-06, "loss": 0.0293, "step": 1190 }, { "epoch": 1.7271413082761113, "grad_norm": 0.04595355689525604, "learning_rate": 9.12658103770058e-06, "loss": 0.0289, "step": 1195 }, { "epoch": 1.7343693530899893, "grad_norm": 0.06266051530838013, "learning_rate": 9.042844523541572e-06, "loss": 0.0286, "step": 1200 }, { "epoch": 1.741597397903867, "grad_norm": 0.049365997314453125, "learning_rate": 8.95917569025366e-06, "loss": 0.0275, "step": 1205 }, { "epoch": 1.7488254427177448, "grad_norm": 0.056487612426280975, "learning_rate": 8.875580454095651e-06, "loss": 0.0239, "step": 1210 }, { "epoch": 1.7560534875316227, "grad_norm": 0.04812345653772354, "learning_rate": 8.792064726122275e-06, "loss": 0.0262, "step": 1215 }, { "epoch": 1.7632815323455007, "grad_norm": 0.06868524849414825, "learning_rate": 8.708634411766195e-06, "loss": 0.0277, "step": 1220 }, { "epoch": 1.7705095771593784, "grad_norm": 0.07294084876775742, "learning_rate": 8.625295410420451e-06, "loss": 0.0235, "step": 1225 }, { "epoch": 1.7777376219732561, "grad_norm": 0.05644133314490318, "learning_rate": 8.542053615021291e-06, "loss": 0.0271, "step": 1230 }, { "epoch": 1.784965666787134, "grad_norm": 0.059861283749341965, "learning_rate": 8.4589149116315e-06, "loss": 0.025, "step": 1235 }, { "epoch": 1.792193711601012, "grad_norm": 0.06358060985803604, "learning_rate": 8.375885179024175e-06, "loss": 0.0294, "step": 1240 }, { "epoch": 1.7994217564148898, "grad_norm": 0.03532201051712036, "learning_rate": 8.292970288267043e-06, "loss": 0.0239, "step": 1245 }, { "epoch": 1.8066498012287675, "grad_norm": 0.047285765409469604, "learning_rate": 8.21017610230732e-06, "loss": 0.0312, "step": 1250 }, { "epoch": 1.8138778460426455, "grad_norm": 0.044171951711177826, "learning_rate": 8.12750847555713e-06, "loss": 0.0288, "step": 1255 }, { "epoch": 1.8211058908565234, "grad_norm": 0.05230150744318962, "learning_rate": 8.044973253479544e-06, "loss": 0.0242, "step": 1260 }, { "epoch": 1.8283339356704011, "grad_norm": 0.04772350192070007, "learning_rate": 7.96257627217524e-06, "loss": 0.0292, "step": 1265 }, { "epoch": 1.8355619804842789, "grad_norm": 0.04245223104953766, "learning_rate": 7.880323357969838e-06, "loss": 0.0239, "step": 1270 }, { "epoch": 1.8427900252981568, "grad_norm": 0.05859874188899994, "learning_rate": 7.798220327001898e-06, "loss": 0.0245, "step": 1275 }, { "epoch": 1.8500180701120348, "grad_norm": 0.06144941225647926, "learning_rate": 7.716272984811688e-06, "loss": 0.0261, "step": 1280 }, { "epoch": 1.8572461149259125, "grad_norm": 0.03744060546159744, "learning_rate": 7.634487125930649e-06, "loss": 0.0259, "step": 1285 }, { "epoch": 1.8644741597397902, "grad_norm": 0.06158106401562691, "learning_rate": 7.55286853347167e-06, "loss": 0.0237, "step": 1290 }, { "epoch": 1.8717022045536682, "grad_norm": 0.05013835057616234, "learning_rate": 7.471422978720162e-06, "loss": 0.0244, "step": 1295 }, { "epoch": 1.8789302493675462, "grad_norm": 0.06363669037818909, "learning_rate": 7.3901562207259555e-06, "loss": 0.0245, "step": 1300 }, { "epoch": 1.8861582941814239, "grad_norm": 0.05522134155035019, "learning_rate": 7.309074005896103e-06, "loss": 0.0216, "step": 1305 }, { "epoch": 1.8933863389953016, "grad_norm": 0.06466201692819595, "learning_rate": 7.228182067588518e-06, "loss": 0.0278, "step": 1310 }, { "epoch": 1.9006143838091796, "grad_norm": 0.047263894230127335, "learning_rate": 7.1474861257065866e-06, "loss": 0.0258, "step": 1315 }, { "epoch": 1.9078424286230575, "grad_norm": 0.051960770040750504, "learning_rate": 7.066991886294702e-06, "loss": 0.0227, "step": 1320 }, { "epoch": 1.9150704734369353, "grad_norm": 0.06168799102306366, "learning_rate": 6.9867050411347955e-06, "loss": 0.0333, "step": 1325 }, { "epoch": 1.922298518250813, "grad_norm": 0.03766432777047157, "learning_rate": 6.906631267343849e-06, "loss": 0.0235, "step": 1330 }, { "epoch": 1.929526563064691, "grad_norm": 0.04953250661492348, "learning_rate": 6.826776226972489e-06, "loss": 0.0247, "step": 1335 }, { "epoch": 1.936754607878569, "grad_norm": 0.047898851335048676, "learning_rate": 6.747145566604605e-06, "loss": 0.0281, "step": 1340 }, { "epoch": 1.9439826526924469, "grad_norm": 0.062446679919958115, "learning_rate": 6.667744916958085e-06, "loss": 0.0242, "step": 1345 }, { "epoch": 1.9512106975063246, "grad_norm": 0.050179507583379745, "learning_rate": 6.588579892486657e-06, "loss": 0.0254, "step": 1350 }, { "epoch": 1.9584387423202023, "grad_norm": 0.052683789283037186, "learning_rate": 6.5096560909828855e-06, "loss": 0.0206, "step": 1355 }, { "epoch": 1.9656667871340803, "grad_norm": 0.06998462975025177, "learning_rate": 6.430979093182372e-06, "loss": 0.0223, "step": 1360 }, { "epoch": 1.9728948319479582, "grad_norm": 0.07918884605169296, "learning_rate": 6.352554462369112e-06, "loss": 0.0281, "step": 1365 }, { "epoch": 1.980122876761836, "grad_norm": 0.06278680264949799, "learning_rate": 6.274387743982127e-06, "loss": 0.0234, "step": 1370 }, { "epoch": 1.9873509215757137, "grad_norm": 0.04667511582374573, "learning_rate": 6.196484465223343e-06, "loss": 0.0218, "step": 1375 }, { "epoch": 1.9945789663895916, "grad_norm": 0.05683530122041702, "learning_rate": 6.1188501346667536e-06, "loss": 0.0267, "step": 1380 }, { "epoch": 2.0014456089627757, "grad_norm": 0.021900292485952377, "learning_rate": 6.04149024186891e-06, "loss": 0.0196, "step": 1385 }, { "epoch": 2.0086736537766536, "grad_norm": 0.014646291732788086, "learning_rate": 5.964410256980762e-06, "loss": 0.0085, "step": 1390 }, { "epoch": 2.015901698590531, "grad_norm": 0.018468832597136497, "learning_rate": 5.887615630360836e-06, "loss": 0.0064, "step": 1395 }, { "epoch": 2.023129743404409, "grad_norm": 0.02347305603325367, "learning_rate": 5.811111792189873e-06, "loss": 0.0058, "step": 1400 }, { "epoch": 2.030357788218287, "grad_norm": 0.022464651614427567, "learning_rate": 5.734904152086829e-06, "loss": 0.0052, "step": 1405 }, { "epoch": 2.037585833032165, "grad_norm": 0.027769049629569054, "learning_rate": 5.658998098726361e-06, "loss": 0.0054, "step": 1410 }, { "epoch": 2.0448138778460425, "grad_norm": 0.03556771203875542, "learning_rate": 5.583398999457812e-06, "loss": 0.0043, "step": 1415 }, { "epoch": 2.0520419226599205, "grad_norm": 0.030191823840141296, "learning_rate": 5.508112199925659e-06, "loss": 0.0041, "step": 1420 }, { "epoch": 2.0592699674737984, "grad_norm": 0.03760818764567375, "learning_rate": 5.433143023691547e-06, "loss": 0.0038, "step": 1425 }, { "epoch": 2.0664980122876764, "grad_norm": 0.014797261916100979, "learning_rate": 5.358496771857831e-06, "loss": 0.0044, "step": 1430 }, { "epoch": 2.073726057101554, "grad_norm": 0.06623335927724838, "learning_rate": 5.284178722692743e-06, "loss": 0.0052, "step": 1435 }, { "epoch": 2.080954101915432, "grad_norm": 0.016557743772864342, "learning_rate": 5.2101941312571724e-06, "loss": 0.0037, "step": 1440 }, { "epoch": 2.08818214672931, "grad_norm": 0.03200926259160042, "learning_rate": 5.136548229033065e-06, "loss": 0.0047, "step": 1445 }, { "epoch": 2.0954101915431878, "grad_norm": 0.03867388516664505, "learning_rate": 5.063246223553509e-06, "loss": 0.0052, "step": 1450 }, { "epoch": 2.1026382363570653, "grad_norm": 0.009015249088406563, "learning_rate": 4.990293298034505e-06, "loss": 0.0045, "step": 1455 }, { "epoch": 2.109866281170943, "grad_norm": 0.01907913200557232, "learning_rate": 4.917694611008477e-06, "loss": 0.0041, "step": 1460 }, { "epoch": 2.117094325984821, "grad_norm": 0.02901625819504261, "learning_rate": 4.845455295959468e-06, "loss": 0.0051, "step": 1465 }, { "epoch": 2.124322370798699, "grad_norm": 0.03763509541749954, "learning_rate": 4.773580460960195e-06, "loss": 0.0065, "step": 1470 }, { "epoch": 2.1315504156125766, "grad_norm": 0.02262153849005699, "learning_rate": 4.702075188310826e-06, "loss": 0.0052, "step": 1475 }, { "epoch": 2.1387784604264546, "grad_norm": 0.02351069077849388, "learning_rate": 4.6309445341796286e-06, "loss": 0.0048, "step": 1480 }, { "epoch": 2.1460065052403325, "grad_norm": 0.009482895024120808, "learning_rate": 4.5601935282454255e-06, "loss": 0.0035, "step": 1485 }, { "epoch": 2.1532345500542105, "grad_norm": 0.039236586540937424, "learning_rate": 4.489827173341957e-06, "loss": 0.0046, "step": 1490 }, { "epoch": 2.160462594868088, "grad_norm": 0.029299462214112282, "learning_rate": 4.419850445104126e-06, "loss": 0.0066, "step": 1495 }, { "epoch": 2.167690639681966, "grad_norm": 0.038081999868154526, "learning_rate": 4.350268291616166e-06, "loss": 0.0058, "step": 1500 }, { "epoch": 2.167690639681966, "eval_loss": 0.15236619114875793, "eval_runtime": 1142.5896, "eval_samples_per_second": 56.193, "eval_steps_per_second": 1.757, "step": 1500 }, { "epoch": 2.174918684495844, "grad_norm": 0.05242437124252319, "learning_rate": 4.281085633061764e-06, "loss": 0.0047, "step": 1505 }, { "epoch": 2.182146729309722, "grad_norm": 0.04368291050195694, "learning_rate": 4.212307361376146e-06, "loss": 0.0078, "step": 1510 }, { "epoch": 2.1893747741235994, "grad_norm": 0.033737700432538986, "learning_rate": 4.1439383399001865e-06, "loss": 0.0043, "step": 1515 }, { "epoch": 2.1966028189374773, "grad_norm": 0.05659673735499382, "learning_rate": 4.075983403036479e-06, "loss": 0.0045, "step": 1520 }, { "epoch": 2.2038308637513553, "grad_norm": 0.0330926850438118, "learning_rate": 4.0084473559075335e-06, "loss": 0.0073, "step": 1525 }, { "epoch": 2.2110589085652332, "grad_norm": 0.01673804223537445, "learning_rate": 3.941334974015981e-06, "loss": 0.006, "step": 1530 }, { "epoch": 2.2182869533791107, "grad_norm": 0.013828652910888195, "learning_rate": 3.874651002906915e-06, "loss": 0.0047, "step": 1535 }, { "epoch": 2.2255149981929887, "grad_norm": 0.02410770393908024, "learning_rate": 3.8084001578323093e-06, "loss": 0.0049, "step": 1540 }, { "epoch": 2.2327430430068667, "grad_norm": 0.04236437752842903, "learning_rate": 3.7425871234176134e-06, "loss": 0.0069, "step": 1545 }, { "epoch": 2.2399710878207446, "grad_norm": 0.017214614897966385, "learning_rate": 3.6772165533305024e-06, "loss": 0.0034, "step": 1550 }, { "epoch": 2.2471991326346226, "grad_norm": 0.007290941663086414, "learning_rate": 3.6122930699518057e-06, "loss": 0.0045, "step": 1555 }, { "epoch": 2.2544271774485, "grad_norm": 0.02728499099612236, "learning_rate": 3.5478212640486652e-06, "loss": 0.0049, "step": 1560 }, { "epoch": 2.261655222262378, "grad_norm": 0.023531029000878334, "learning_rate": 3.483805694449913e-06, "loss": 0.0034, "step": 1565 }, { "epoch": 2.268883267076256, "grad_norm": 0.01806485652923584, "learning_rate": 3.420250887723722e-06, "loss": 0.0044, "step": 1570 }, { "epoch": 2.2761113118901335, "grad_norm": 0.022033169865608215, "learning_rate": 3.357161337857523e-06, "loss": 0.0037, "step": 1575 }, { "epoch": 2.2833393567040114, "grad_norm": 0.03150279447436333, "learning_rate": 3.2945415059402363e-06, "loss": 0.0051, "step": 1580 }, { "epoch": 2.2905674015178894, "grad_norm": 0.01539881806820631, "learning_rate": 3.232395819846824e-06, "loss": 0.0036, "step": 1585 }, { "epoch": 2.2977954463317674, "grad_norm": 0.022633062675595284, "learning_rate": 3.170728673925206e-06, "loss": 0.003, "step": 1590 }, { "epoch": 2.3050234911456453, "grad_norm": 0.07029638439416885, "learning_rate": 3.1095444286855112e-06, "loss": 0.0056, "step": 1595 }, { "epoch": 2.312251535959523, "grad_norm": 0.02109723724424839, "learning_rate": 3.04884741049176e-06, "loss": 0.0047, "step": 1600 }, { "epoch": 2.3194795807734008, "grad_norm": 0.03571590408682823, "learning_rate": 2.9886419112559396e-06, "loss": 0.005, "step": 1605 }, { "epoch": 2.3267076255872787, "grad_norm": 0.047896191477775574, "learning_rate": 2.9289321881345257e-06, "loss": 0.0065, "step": 1610 }, { "epoch": 2.3339356704011567, "grad_norm": 0.018771937116980553, "learning_rate": 2.86972246322745e-06, "loss": 0.0029, "step": 1615 }, { "epoch": 2.341163715215034, "grad_norm": 0.013248492032289505, "learning_rate": 2.8110169232795615e-06, "loss": 0.0029, "step": 1620 }, { "epoch": 2.348391760028912, "grad_norm": 0.0571102574467659, "learning_rate": 2.752819719384573e-06, "loss": 0.0059, "step": 1625 }, { "epoch": 2.35561980484279, "grad_norm": 0.037497229874134064, "learning_rate": 2.6951349666915404e-06, "loss": 0.0046, "step": 1630 }, { "epoch": 2.362847849656668, "grad_norm": 0.050996676087379456, "learning_rate": 2.637966744113877e-06, "loss": 0.005, "step": 1635 }, { "epoch": 2.3700758944705456, "grad_norm": 0.017970601096749306, "learning_rate": 2.581319094040927e-06, "loss": 0.0046, "step": 1640 }, { "epoch": 2.3773039392844235, "grad_norm": 0.012019157409667969, "learning_rate": 2.5251960220521422e-06, "loss": 0.0036, "step": 1645 }, { "epoch": 2.3845319840983015, "grad_norm": 0.04049897938966751, "learning_rate": 2.4696014966338267e-06, "loss": 0.0043, "step": 1650 }, { "epoch": 2.3917600289121794, "grad_norm": 0.011841571889817715, "learning_rate": 2.4145394488985307e-06, "loss": 0.0031, "step": 1655 }, { "epoch": 2.398988073726057, "grad_norm": 0.05339455232024193, "learning_rate": 2.360013772307086e-06, "loss": 0.0039, "step": 1660 }, { "epoch": 2.406216118539935, "grad_norm": 0.017674589529633522, "learning_rate": 2.3060283223932876e-06, "loss": 0.0041, "step": 1665 }, { "epoch": 2.413444163353813, "grad_norm": 0.036407146602869034, "learning_rate": 2.252586916491275e-06, "loss": 0.0062, "step": 1670 }, { "epoch": 2.420672208167691, "grad_norm": 0.05213891342282295, "learning_rate": 2.1996933334656044e-06, "loss": 0.0048, "step": 1675 }, { "epoch": 2.4279002529815683, "grad_norm": 0.027244996279478073, "learning_rate": 2.1473513134440425e-06, "loss": 0.0064, "step": 1680 }, { "epoch": 2.4351282977954463, "grad_norm": 0.020630542188882828, "learning_rate": 2.0955645575531e-06, "loss": 0.005, "step": 1685 }, { "epoch": 2.442356342609324, "grad_norm": 0.01828751713037491, "learning_rate": 2.0443367276563277e-06, "loss": 0.0056, "step": 1690 }, { "epoch": 2.449584387423202, "grad_norm": 0.07830678671598434, "learning_rate": 1.9936714460953743e-06, "loss": 0.0058, "step": 1695 }, { "epoch": 2.4568124322370797, "grad_norm": 0.04249007627367973, "learning_rate": 1.9435722954338675e-06, "loss": 0.0038, "step": 1700 }, { "epoch": 2.4640404770509576, "grad_norm": 0.04109486937522888, "learning_rate": 1.8940428182040715e-06, "loss": 0.0042, "step": 1705 }, { "epoch": 2.4712685218648356, "grad_norm": 0.011558642610907555, "learning_rate": 1.8450865166564003e-06, "loss": 0.0029, "step": 1710 }, { "epoch": 2.4784965666787135, "grad_norm": 0.0303326603025198, "learning_rate": 1.7967068525117658e-06, "loss": 0.0041, "step": 1715 }, { "epoch": 2.485724611492591, "grad_norm": 0.016660748049616814, "learning_rate": 1.7489072467168166e-06, "loss": 0.0039, "step": 1720 }, { "epoch": 2.492952656306469, "grad_norm": 0.044775962829589844, "learning_rate": 1.7016910792020191e-06, "loss": 0.0062, "step": 1725 }, { "epoch": 2.500180701120347, "grad_norm": 0.018128257244825363, "learning_rate": 1.6550616886426718e-06, "loss": 0.0033, "step": 1730 }, { "epoch": 2.507408745934225, "grad_norm": 0.04440128430724144, "learning_rate": 1.609022372222827e-06, "loss": 0.005, "step": 1735 }, { "epoch": 2.514636790748103, "grad_norm": 0.013020209036767483, "learning_rate": 1.5635763854021424e-06, "loss": 0.004, "step": 1740 }, { "epoch": 2.5218648355619804, "grad_norm": 0.04560156539082527, "learning_rate": 1.5187269416856875e-06, "loss": 0.0044, "step": 1745 }, { "epoch": 2.5290928803758583, "grad_norm": 0.020239338278770447, "learning_rate": 1.474477212396712e-06, "loss": 0.003, "step": 1750 }, { "epoch": 2.5363209251897363, "grad_norm": 0.020898908376693726, "learning_rate": 1.4308303264524115e-06, "loss": 0.0046, "step": 1755 }, { "epoch": 2.543548970003614, "grad_norm": 0.01411470677703619, "learning_rate": 1.3877893701426637e-06, "loss": 0.0037, "step": 1760 }, { "epoch": 2.5507770148174918, "grad_norm": 0.027346884831786156, "learning_rate": 1.3453573869118097e-06, "loss": 0.0056, "step": 1765 }, { "epoch": 2.5580050596313697, "grad_norm": 0.017516661435365677, "learning_rate": 1.3035373771434356e-06, "loss": 0.0063, "step": 1770 }, { "epoch": 2.5652331044452477, "grad_norm": 0.01695055328309536, "learning_rate": 1.2623322979482355e-06, "loss": 0.0049, "step": 1775 }, { "epoch": 2.5724611492591256, "grad_norm": 0.03533555567264557, "learning_rate": 1.2217450629548955e-06, "loss": 0.0042, "step": 1780 }, { "epoch": 2.579689194073003, "grad_norm": 0.012438401579856873, "learning_rate": 1.181778542104075e-06, "loss": 0.0025, "step": 1785 }, { "epoch": 2.586917238886881, "grad_norm": 0.02840145118534565, "learning_rate": 1.1424355614454718e-06, "loss": 0.0047, "step": 1790 }, { "epoch": 2.594145283700759, "grad_norm": 0.03050726279616356, "learning_rate": 1.1037189029379925e-06, "loss": 0.0078, "step": 1795 }, { "epoch": 2.6013733285146365, "grad_norm": 0.025619490072131157, "learning_rate": 1.0656313042530376e-06, "loss": 0.0051, "step": 1800 }, { "epoch": 2.6086013733285145, "grad_norm": 0.03565088286995888, "learning_rate": 1.028175458580918e-06, "loss": 0.0057, "step": 1805 }, { "epoch": 2.6158294181423924, "grad_norm": 0.048903122544288635, "learning_rate": 9.913540144404254e-07, "loss": 0.0029, "step": 1810 }, { "epoch": 2.6230574629562704, "grad_norm": 0.06714732199907303, "learning_rate": 9.551695754915447e-07, "loss": 0.0058, "step": 1815 }, { "epoch": 2.6302855077701484, "grad_norm": 0.03730113059282303, "learning_rate": 9.196247003513537e-07, "loss": 0.0056, "step": 1820 }, { "epoch": 2.637513552584026, "grad_norm": 0.008724790997803211, "learning_rate": 8.84721902413097e-07, "loss": 0.0042, "step": 1825 }, { "epoch": 2.644741597397904, "grad_norm": 0.03197433799505234, "learning_rate": 8.50463649668477e-07, "loss": 0.0043, "step": 1830 }, { "epoch": 2.6519696422117818, "grad_norm": 0.05495726689696312, "learning_rate": 8.168523645331216e-07, "loss": 0.0047, "step": 1835 }, { "epoch": 2.6591976870256593, "grad_norm": 0.01701589673757553, "learning_rate": 7.838904236753087e-07, "loss": 0.0041, "step": 1840 }, { "epoch": 2.6664257318395372, "grad_norm": 0.02677042968571186, "learning_rate": 7.515801578479032e-07, "loss": 0.0065, "step": 1845 }, { "epoch": 2.673653776653415, "grad_norm": 0.014987285248935223, "learning_rate": 7.199238517235541e-07, "loss": 0.003, "step": 1850 }, { "epoch": 2.680881821467293, "grad_norm": 0.011919076554477215, "learning_rate": 6.889237437331398e-07, "loss": 0.0036, "step": 1855 }, { "epoch": 2.688109866281171, "grad_norm": 0.05691038444638252, "learning_rate": 6.585820259074882e-07, "loss": 0.005, "step": 1860 }, { "epoch": 2.6953379110950486, "grad_norm": 0.021997489035129547, "learning_rate": 6.289008437223798e-07, "loss": 0.0061, "step": 1865 }, { "epoch": 2.7025659559089266, "grad_norm": 0.04470158740878105, "learning_rate": 5.998822959468409e-07, "loss": 0.0044, "step": 1870 }, { "epoch": 2.7097940007228045, "grad_norm": 0.023458922281861305, "learning_rate": 5.715284344947358e-07, "loss": 0.0052, "step": 1875 }, { "epoch": 2.7170220455366825, "grad_norm": 0.007212420925498009, "learning_rate": 5.438412642796686e-07, "loss": 0.004, "step": 1880 }, { "epoch": 2.7242500903505604, "grad_norm": 0.021170541644096375, "learning_rate": 5.168227430732353e-07, "loss": 0.0046, "step": 1885 }, { "epoch": 2.731478135164438, "grad_norm": 0.04506688937544823, "learning_rate": 4.904747813665656e-07, "loss": 0.005, "step": 1890 }, { "epoch": 2.738706179978316, "grad_norm": 0.03043074533343315, "learning_rate": 4.6479924223524655e-07, "loss": 0.0056, "step": 1895 }, { "epoch": 2.745934224792194, "grad_norm": 0.06903711706399918, "learning_rate": 4.39797941207577e-07, "loss": 0.004, "step": 1900 }, { "epoch": 2.7531622696060714, "grad_norm": 0.008001566864550114, "learning_rate": 4.1547264613619243e-07, "loss": 0.0052, "step": 1905 }, { "epoch": 2.7603903144199493, "grad_norm": 0.016265859827399254, "learning_rate": 3.9182507707305915e-07, "loss": 0.0055, "step": 1910 }, { "epoch": 2.7676183592338273, "grad_norm": 0.019273990765213966, "learning_rate": 3.6885690614785197e-07, "loss": 0.0043, "step": 1915 }, { "epoch": 2.774846404047705, "grad_norm": 0.051555391401052475, "learning_rate": 3.4656975744970846e-07, "loss": 0.0046, "step": 1920 }, { "epoch": 2.782074448861583, "grad_norm": 0.010696332901716232, "learning_rate": 3.249652069124032e-07, "loss": 0.0028, "step": 1925 }, { "epoch": 2.7893024936754607, "grad_norm": 0.04233001545071602, "learning_rate": 3.040447822028958e-07, "loss": 0.0048, "step": 1930 }, { "epoch": 2.7965305384893386, "grad_norm": 0.07739260792732239, "learning_rate": 2.838099626133206e-07, "loss": 0.004, "step": 1935 }, { "epoch": 2.8037585833032166, "grad_norm": 0.05327356979250908, "learning_rate": 2.642621789563848e-07, "loss": 0.0069, "step": 1940 }, { "epoch": 2.810986628117094, "grad_norm": 0.027605120092630386, "learning_rate": 2.4540281346418946e-07, "loss": 0.0037, "step": 1945 }, { "epoch": 2.818214672930972, "grad_norm": 0.019115762785077095, "learning_rate": 2.2723319969049307e-07, "loss": 0.0036, "step": 1950 }, { "epoch": 2.82544271774485, "grad_norm": 0.02181391790509224, "learning_rate": 2.0975462241642042e-07, "loss": 0.0024, "step": 1955 }, { "epoch": 2.832670762558728, "grad_norm": 0.009364648722112179, "learning_rate": 1.9296831755960753e-07, "loss": 0.0052, "step": 1960 }, { "epoch": 2.839898807372606, "grad_norm": 0.01776730641722679, "learning_rate": 1.76875472086816e-07, "loss": 0.0037, "step": 1965 }, { "epoch": 2.8471268521864834, "grad_norm": 0.022552713751792908, "learning_rate": 1.6147722392999887e-07, "loss": 0.0038, "step": 1970 }, { "epoch": 2.8543548970003614, "grad_norm": 0.05256934091448784, "learning_rate": 1.467746619058341e-07, "loss": 0.0049, "step": 1975 }, { "epoch": 2.8615829418142393, "grad_norm": 0.01704435609281063, "learning_rate": 1.327688256387416e-07, "loss": 0.0042, "step": 1980 }, { "epoch": 2.868810986628117, "grad_norm": 0.008595878258347511, "learning_rate": 1.1946070548736532e-07, "loss": 0.0035, "step": 1985 }, { "epoch": 2.876039031441995, "grad_norm": 0.025025706738233566, "learning_rate": 1.0685124247454159e-07, "loss": 0.0038, "step": 1990 }, { "epoch": 2.8832670762558728, "grad_norm": 0.018959928303956985, "learning_rate": 9.494132822077007e-08, "loss": 0.0043, "step": 1995 }, { "epoch": 2.8904951210697507, "grad_norm": 0.01306887436658144, "learning_rate": 8.373180488115529e-08, "loss": 0.005, "step": 2000 }, { "epoch": 2.8904951210697507, "eval_loss": 0.1443248987197876, "eval_runtime": 1199.7992, "eval_samples_per_second": 53.514, "eval_steps_per_second": 1.673, "step": 2000 }, { "epoch": 2.8977231658836287, "grad_norm": 0.07084480673074722, "learning_rate": 7.322346508586209e-08, "loss": 0.0043, "step": 2005 }, { "epoch": 2.904951210697506, "grad_norm": 0.04223432019352913, "learning_rate": 6.341705188407043e-08, "loss": 0.0048, "step": 2010 }, { "epoch": 2.912179255511384, "grad_norm": 0.019037162885069847, "learning_rate": 5.431325869143189e-08, "loss": 0.0054, "step": 2015 }, { "epoch": 2.919407300325262, "grad_norm": 0.014710099436342716, "learning_rate": 4.5912729241036624e-08, "loss": 0.0038, "step": 2020 }, { "epoch": 2.9266353451391396, "grad_norm": 0.012474890798330307, "learning_rate": 3.821605753789648e-08, "loss": 0.0038, "step": 2025 }, { "epoch": 2.933863389953018, "grad_norm": 0.01217850111424923, "learning_rate": 3.122378781694524e-08, "loss": 0.0028, "step": 2030 }, { "epoch": 2.9410914347668955, "grad_norm": 0.0497884601354599, "learning_rate": 2.493641450454942e-08, "loss": 0.0041, "step": 2035 }, { "epoch": 2.9483194795807735, "grad_norm": 0.013583734631538391, "learning_rate": 1.93543821835529e-08, "loss": 0.0037, "step": 2040 }, { "epoch": 2.9555475243946514, "grad_norm": 0.046894483268260956, "learning_rate": 1.4478085561835387e-08, "loss": 0.0041, "step": 2045 }, { "epoch": 2.962775569208529, "grad_norm": 0.011021456681191921, "learning_rate": 1.0307869444406981e-08, "loss": 0.0043, "step": 2050 }, { "epoch": 2.970003614022407, "grad_norm": 0.008113077841699123, "learning_rate": 6.844028709024342e-09, "loss": 0.0053, "step": 2055 }, { "epoch": 2.977231658836285, "grad_norm": 0.02427099458873272, "learning_rate": 4.086808285338472e-09, "loss": 0.0043, "step": 2060 }, { "epoch": 2.9844597036501628, "grad_norm": 0.0245045255869627, "learning_rate": 2.0364031375819104e-09, "loss": 0.0034, "step": 2065 }, { "epoch": 2.9916877484640407, "grad_norm": 0.016123216599225998, "learning_rate": 6.929582507719801e-10, "loss": 0.0055, "step": 2070 }, { "epoch": 2.9989157932779182, "grad_norm": 0.04469776526093483, "learning_rate": 5.6568620471209035e-11, "loss": 0.0047, "step": 2075 }, { "epoch": 3.0, "step": 2076, "total_flos": 6.274047864041636e+18, "train_loss": 0.03664169063040653, "train_runtime": 42810.6565, "train_samples_per_second": 6.203, "train_steps_per_second": 0.048 } ], "logging_steps": 5, "max_steps": 2076, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.274047864041636e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }