{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5314, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000940910801656003, "grad_norm": 60.01997358091019, "learning_rate": 2.8195488721804507e-06, "loss": 3.7648, "step": 5 }, { "epoch": 0.001881821603312006, "grad_norm": 43.13189358709948, "learning_rate": 5.6390977443609015e-06, "loss": 3.7027, "step": 10 }, { "epoch": 0.002822732404968009, "grad_norm": 21.06725791656804, "learning_rate": 8.458646616541352e-06, "loss": 2.9202, "step": 15 }, { "epoch": 0.003763643206624012, "grad_norm": 10.467981092560883, "learning_rate": 1.1278195488721803e-05, "loss": 2.2189, "step": 20 }, { "epoch": 0.004704554008280015, "grad_norm": 4.304956998410812, "learning_rate": 1.4097744360902254e-05, "loss": 1.7602, "step": 25 }, { "epoch": 0.005645464809936018, "grad_norm": 2.039731988186442, "learning_rate": 1.6917293233082704e-05, "loss": 1.421, "step": 30 }, { "epoch": 0.006586375611592021, "grad_norm": 1.142378863342944, "learning_rate": 1.9736842105263155e-05, "loss": 1.2312, "step": 35 }, { "epoch": 0.007527286413248024, "grad_norm": 0.560089452128016, "learning_rate": 2.2556390977443606e-05, "loss": 1.1372, "step": 40 }, { "epoch": 0.008468197214904027, "grad_norm": 0.5228484190309799, "learning_rate": 2.5375939849624057e-05, "loss": 1.0764, "step": 45 }, { "epoch": 0.00940910801656003, "grad_norm": 0.34335801536002225, "learning_rate": 2.8195488721804508e-05, "loss": 1.0306, "step": 50 }, { "epoch": 0.010350018818216034, "grad_norm": 0.3215605332639539, "learning_rate": 3.101503759398496e-05, "loss": 0.9845, "step": 55 }, { "epoch": 0.011290929619872036, "grad_norm": 0.22796155046045471, "learning_rate": 3.383458646616541e-05, "loss": 1.0076, "step": 60 }, { "epoch": 0.012231840421528039, "grad_norm": 0.19722227209725182, "learning_rate": 3.665413533834586e-05, "loss": 1.0149, "step": 65 }, { "epoch": 0.013172751223184042, "grad_norm": 0.21801488393560356, "learning_rate": 3.947368421052631e-05, "loss": 0.9928, "step": 70 }, { "epoch": 0.014113662024840046, "grad_norm": 0.2160945691538292, "learning_rate": 4.2293233082706764e-05, "loss": 0.9793, "step": 75 }, { "epoch": 0.015054572826496047, "grad_norm": 0.1709423275538542, "learning_rate": 4.511278195488721e-05, "loss": 0.9415, "step": 80 }, { "epoch": 0.015995483628152053, "grad_norm": 0.1984231147448857, "learning_rate": 4.7932330827067666e-05, "loss": 0.9456, "step": 85 }, { "epoch": 0.016936394429808054, "grad_norm": 0.16064378379367358, "learning_rate": 5.0751879699248114e-05, "loss": 0.937, "step": 90 }, { "epoch": 0.017877305231464056, "grad_norm": 0.17183543893075945, "learning_rate": 5.357142857142857e-05, "loss": 0.939, "step": 95 }, { "epoch": 0.01881821603312006, "grad_norm": 0.13904432551213514, "learning_rate": 5.6390977443609016e-05, "loss": 0.9358, "step": 100 }, { "epoch": 0.019759126834776063, "grad_norm": 0.14843412643116974, "learning_rate": 5.921052631578947e-05, "loss": 0.9221, "step": 105 }, { "epoch": 0.020700037636432068, "grad_norm": 0.14289835041826568, "learning_rate": 6.203007518796992e-05, "loss": 0.9335, "step": 110 }, { "epoch": 0.02164094843808807, "grad_norm": 0.12123282065253636, "learning_rate": 6.484962406015037e-05, "loss": 0.9428, "step": 115 }, { "epoch": 0.02258185923974407, "grad_norm": 0.12416935651368655, "learning_rate": 6.766917293233081e-05, "loss": 0.9183, "step": 120 }, { "epoch": 0.023522770041400076, "grad_norm": 0.09188631695179955, "learning_rate": 7.048872180451127e-05, "loss": 0.9055, "step": 125 }, { "epoch": 0.024463680843056078, "grad_norm": 0.0876665353586765, "learning_rate": 7.330827067669172e-05, "loss": 0.9103, "step": 130 }, { "epoch": 0.025404591644712083, "grad_norm": 0.08243508531454458, "learning_rate": 7.612781954887218e-05, "loss": 0.8813, "step": 135 }, { "epoch": 0.026345502446368085, "grad_norm": 0.09264651144613481, "learning_rate": 7.894736842105262e-05, "loss": 0.9165, "step": 140 }, { "epoch": 0.027286413248024086, "grad_norm": 0.10132147434306207, "learning_rate": 8.176691729323307e-05, "loss": 0.9094, "step": 145 }, { "epoch": 0.02822732404968009, "grad_norm": 0.07777446691897923, "learning_rate": 8.458646616541353e-05, "loss": 0.919, "step": 150 }, { "epoch": 0.029168234851336093, "grad_norm": 0.0877254827678949, "learning_rate": 8.740601503759398e-05, "loss": 0.9092, "step": 155 }, { "epoch": 0.030109145652992095, "grad_norm": 0.0698681608512868, "learning_rate": 9.022556390977442e-05, "loss": 0.8901, "step": 160 }, { "epoch": 0.0310500564546481, "grad_norm": 0.07865064250292489, "learning_rate": 9.304511278195488e-05, "loss": 0.9177, "step": 165 }, { "epoch": 0.031990967256304105, "grad_norm": 0.07217084698029017, "learning_rate": 9.586466165413533e-05, "loss": 0.8612, "step": 170 }, { "epoch": 0.03293187805796011, "grad_norm": 0.07821397459573423, "learning_rate": 9.868421052631579e-05, "loss": 0.9028, "step": 175 }, { "epoch": 0.03387278885961611, "grad_norm": 0.07354195280117189, "learning_rate": 0.00010150375939849623, "loss": 0.8853, "step": 180 }, { "epoch": 0.03481369966127211, "grad_norm": 0.07326370701811867, "learning_rate": 0.00010432330827067668, "loss": 0.8976, "step": 185 }, { "epoch": 0.03575461046292811, "grad_norm": 0.0771216114229083, "learning_rate": 0.00010714285714285714, "loss": 0.8974, "step": 190 }, { "epoch": 0.03669552126458412, "grad_norm": 0.07423933567725373, "learning_rate": 0.00010996240601503759, "loss": 0.8983, "step": 195 }, { "epoch": 0.03763643206624012, "grad_norm": 0.07359448396484719, "learning_rate": 0.00011278195488721803, "loss": 0.8855, "step": 200 }, { "epoch": 0.038577342867896124, "grad_norm": 0.0705322988260988, "learning_rate": 0.00011560150375939849, "loss": 0.8902, "step": 205 }, { "epoch": 0.039518253669552125, "grad_norm": 0.07553017144819835, "learning_rate": 0.00011842105263157894, "loss": 0.8979, "step": 210 }, { "epoch": 0.04045916447120813, "grad_norm": 0.06639469040581934, "learning_rate": 0.0001212406015037594, "loss": 0.8726, "step": 215 }, { "epoch": 0.041400075272864136, "grad_norm": 0.07164418665916035, "learning_rate": 0.00012406015037593984, "loss": 0.8805, "step": 220 }, { "epoch": 0.04234098607452014, "grad_norm": 0.08024783310853623, "learning_rate": 0.00012687969924812028, "loss": 0.8846, "step": 225 }, { "epoch": 0.04328189687617614, "grad_norm": 0.06967695982721508, "learning_rate": 0.00012969924812030075, "loss": 0.8562, "step": 230 }, { "epoch": 0.04422280767783214, "grad_norm": 0.06791598862412057, "learning_rate": 0.0001325187969924812, "loss": 0.8981, "step": 235 }, { "epoch": 0.04516371847948814, "grad_norm": 0.06926389506888998, "learning_rate": 0.00013533834586466163, "loss": 0.8889, "step": 240 }, { "epoch": 0.04610462928114415, "grad_norm": 0.07683826514393188, "learning_rate": 0.0001381578947368421, "loss": 0.8801, "step": 245 }, { "epoch": 0.04704554008280015, "grad_norm": 0.08813591407993811, "learning_rate": 0.00014097744360902254, "loss": 0.8885, "step": 250 }, { "epoch": 0.047986450884456154, "grad_norm": 0.07234142660651707, "learning_rate": 0.000143796992481203, "loss": 0.8517, "step": 255 }, { "epoch": 0.048927361686112156, "grad_norm": 0.0648457622804495, "learning_rate": 0.00014661654135338345, "loss": 0.8912, "step": 260 }, { "epoch": 0.04986827248776816, "grad_norm": 0.07356170701724246, "learning_rate": 0.0001494360902255639, "loss": 0.9047, "step": 265 }, { "epoch": 0.050809183289424166, "grad_norm": 0.06615172685965298, "learning_rate": 0.00015225563909774436, "loss": 0.8807, "step": 270 }, { "epoch": 0.05175009409108017, "grad_norm": 0.06082657075881043, "learning_rate": 0.0001550751879699248, "loss": 0.8834, "step": 275 }, { "epoch": 0.05269100489273617, "grad_norm": 0.06342890619901809, "learning_rate": 0.00015789473684210524, "loss": 0.864, "step": 280 }, { "epoch": 0.05363191569439217, "grad_norm": 0.06319688554101384, "learning_rate": 0.0001607142857142857, "loss": 0.8684, "step": 285 }, { "epoch": 0.05457282649604817, "grad_norm": 0.06334243363623335, "learning_rate": 0.00016353383458646615, "loss": 0.8909, "step": 290 }, { "epoch": 0.055513737297704174, "grad_norm": 0.06333186578028213, "learning_rate": 0.00016635338345864662, "loss": 0.9081, "step": 295 }, { "epoch": 0.05645464809936018, "grad_norm": 0.07330638979979831, "learning_rate": 0.00016917293233082706, "loss": 0.8736, "step": 300 }, { "epoch": 0.057395558901016185, "grad_norm": 0.061476218135212986, "learning_rate": 0.00017199248120300752, "loss": 0.8572, "step": 305 }, { "epoch": 0.058336469702672186, "grad_norm": 0.06885650230162942, "learning_rate": 0.00017481203007518797, "loss": 0.8825, "step": 310 }, { "epoch": 0.05927738050432819, "grad_norm": 0.06201718392161657, "learning_rate": 0.00017763157894736838, "loss": 0.8791, "step": 315 }, { "epoch": 0.06021829130598419, "grad_norm": 0.05952975722354683, "learning_rate": 0.00018045112781954885, "loss": 0.8722, "step": 320 }, { "epoch": 0.0611592021076402, "grad_norm": 0.0845033558899056, "learning_rate": 0.0001832706766917293, "loss": 0.8779, "step": 325 }, { "epoch": 0.0621001129092962, "grad_norm": 0.0696024287202025, "learning_rate": 0.00018609022556390976, "loss": 0.8589, "step": 330 }, { "epoch": 0.0630410237109522, "grad_norm": 0.06743400038358317, "learning_rate": 0.00018890977443609022, "loss": 0.8943, "step": 335 }, { "epoch": 0.06398193451260821, "grad_norm": 0.06448702771901722, "learning_rate": 0.00019172932330827067, "loss": 0.8706, "step": 340 }, { "epoch": 0.0649228453142642, "grad_norm": 0.06648697936479611, "learning_rate": 0.00019454887218045113, "loss": 0.8715, "step": 345 }, { "epoch": 0.06586375611592021, "grad_norm": 0.07133760531354241, "learning_rate": 0.00019736842105263157, "loss": 0.8909, "step": 350 }, { "epoch": 0.06680466691757621, "grad_norm": 0.07400726358762413, "learning_rate": 0.000200187969924812, "loss": 0.8829, "step": 355 }, { "epoch": 0.06774557771923222, "grad_norm": 0.066751649475411, "learning_rate": 0.00020300751879699246, "loss": 0.8602, "step": 360 }, { "epoch": 0.06868648852088823, "grad_norm": 0.07170376738210353, "learning_rate": 0.0002058270676691729, "loss": 0.8519, "step": 365 }, { "epoch": 0.06962739932254422, "grad_norm": 0.06344374065815822, "learning_rate": 0.00020864661654135337, "loss": 0.8373, "step": 370 }, { "epoch": 0.07056831012420023, "grad_norm": 0.06254663540702332, "learning_rate": 0.0002114661654135338, "loss": 0.9122, "step": 375 }, { "epoch": 0.07150922092585622, "grad_norm": 0.06630769532444106, "learning_rate": 0.00021428571428571427, "loss": 0.8654, "step": 380 }, { "epoch": 0.07245013172751223, "grad_norm": 0.07154196055342273, "learning_rate": 0.00021710526315789472, "loss": 0.8698, "step": 385 }, { "epoch": 0.07339104252916824, "grad_norm": 0.06473991172822582, "learning_rate": 0.00021992481203007518, "loss": 0.8558, "step": 390 }, { "epoch": 0.07433195333082424, "grad_norm": 0.0641271642923791, "learning_rate": 0.0002227443609022556, "loss": 0.8825, "step": 395 }, { "epoch": 0.07527286413248024, "grad_norm": 0.07171043288638893, "learning_rate": 0.00022556390977443607, "loss": 0.8715, "step": 400 }, { "epoch": 0.07621377493413624, "grad_norm": 0.07945386130978546, "learning_rate": 0.0002283834586466165, "loss": 0.8718, "step": 405 }, { "epoch": 0.07715468573579225, "grad_norm": 0.06502361339008453, "learning_rate": 0.00023120300751879697, "loss": 0.8945, "step": 410 }, { "epoch": 0.07809559653744826, "grad_norm": 0.07560856132420715, "learning_rate": 0.00023402255639097742, "loss": 0.8527, "step": 415 }, { "epoch": 0.07903650733910425, "grad_norm": 0.06652060140595817, "learning_rate": 0.00023684210526315788, "loss": 0.8757, "step": 420 }, { "epoch": 0.07997741814076026, "grad_norm": 0.0634870168645207, "learning_rate": 0.00023966165413533832, "loss": 0.8795, "step": 425 }, { "epoch": 0.08091832894241625, "grad_norm": 0.06045441546519768, "learning_rate": 0.0002424812030075188, "loss": 0.8696, "step": 430 }, { "epoch": 0.08185923974407226, "grad_norm": 0.061655818665132714, "learning_rate": 0.00024530075187969923, "loss": 0.8558, "step": 435 }, { "epoch": 0.08280015054572827, "grad_norm": 0.06182009827579582, "learning_rate": 0.0002481203007518797, "loss": 0.8547, "step": 440 }, { "epoch": 0.08374106134738427, "grad_norm": 0.0639219310698945, "learning_rate": 0.0002509398496240601, "loss": 0.8752, "step": 445 }, { "epoch": 0.08468197214904027, "grad_norm": 0.05045686701916079, "learning_rate": 0.00025375939849624056, "loss": 0.8377, "step": 450 }, { "epoch": 0.08562288295069627, "grad_norm": 0.06119130976338784, "learning_rate": 0.00025657894736842105, "loss": 0.8795, "step": 455 }, { "epoch": 0.08656379375235228, "grad_norm": 0.05851515205626573, "learning_rate": 0.0002593984962406015, "loss": 0.8466, "step": 460 }, { "epoch": 0.08750470455400829, "grad_norm": 0.05891589185564019, "learning_rate": 0.00026221804511278193, "loss": 0.8508, "step": 465 }, { "epoch": 0.08844561535566428, "grad_norm": 0.06568583634178668, "learning_rate": 0.0002650375939849624, "loss": 0.8887, "step": 470 }, { "epoch": 0.08938652615732029, "grad_norm": 0.06559964245515083, "learning_rate": 0.00026785714285714287, "loss": 0.8517, "step": 475 }, { "epoch": 0.09032743695897628, "grad_norm": 0.06489995786075839, "learning_rate": 0.00027067669172932326, "loss": 0.876, "step": 480 }, { "epoch": 0.09126834776063229, "grad_norm": 0.059994926099869166, "learning_rate": 0.0002734962406015037, "loss": 0.8905, "step": 485 }, { "epoch": 0.0922092585622883, "grad_norm": 0.05813814908142138, "learning_rate": 0.0002763157894736842, "loss": 0.8935, "step": 490 }, { "epoch": 0.0931501693639443, "grad_norm": 0.06773262538564226, "learning_rate": 0.00027913533834586463, "loss": 0.8724, "step": 495 }, { "epoch": 0.0940910801656003, "grad_norm": 0.06172481466819506, "learning_rate": 0.0002819548872180451, "loss": 0.8488, "step": 500 }, { "epoch": 0.0950319909672563, "grad_norm": 0.05892455966797863, "learning_rate": 0.00028477443609022557, "loss": 0.8969, "step": 505 }, { "epoch": 0.09597290176891231, "grad_norm": 0.07375087726120111, "learning_rate": 0.000287593984962406, "loss": 0.8699, "step": 510 }, { "epoch": 0.09691381257056832, "grad_norm": 0.06156991814488305, "learning_rate": 0.00029041353383458645, "loss": 0.871, "step": 515 }, { "epoch": 0.09785472337222431, "grad_norm": 0.062492373867649643, "learning_rate": 0.0002932330827067669, "loss": 0.8804, "step": 520 }, { "epoch": 0.09879563417388032, "grad_norm": 0.05942165284065285, "learning_rate": 0.00029605263157894733, "loss": 0.8812, "step": 525 }, { "epoch": 0.09973654497553631, "grad_norm": 0.059197976767297204, "learning_rate": 0.0002988721804511278, "loss": 0.888, "step": 530 }, { "epoch": 0.10067745577719232, "grad_norm": 0.060788308115763814, "learning_rate": 0.00029999970867065386, "loss": 0.8777, "step": 535 }, { "epoch": 0.10161836657884833, "grad_norm": 0.06549563064661894, "learning_rate": 0.0002999979283287479, "loss": 0.8973, "step": 540 }, { "epoch": 0.10255927738050433, "grad_norm": 0.05637984655649405, "learning_rate": 0.0002999945295137593, "loss": 0.864, "step": 545 }, { "epoch": 0.10350018818216034, "grad_norm": 0.05419448738098367, "learning_rate": 0.00029998951226236113, "loss": 0.8528, "step": 550 }, { "epoch": 0.10444109898381633, "grad_norm": 0.055413842575868566, "learning_rate": 0.0002999828766286894, "loss": 0.8721, "step": 555 }, { "epoch": 0.10538200978547234, "grad_norm": 0.05124313574780934, "learning_rate": 0.0002999746226843424, "loss": 0.8867, "step": 560 }, { "epoch": 0.10632292058712833, "grad_norm": 0.054834397457305846, "learning_rate": 0.0002999647505183799, "loss": 0.861, "step": 565 }, { "epoch": 0.10726383138878434, "grad_norm": 0.05532784610616103, "learning_rate": 0.00029995326023732235, "loss": 0.896, "step": 570 }, { "epoch": 0.10820474219044035, "grad_norm": 0.056119336353656256, "learning_rate": 0.00029994015196514945, "loss": 0.8922, "step": 575 }, { "epoch": 0.10914565299209635, "grad_norm": 0.054803911827884036, "learning_rate": 0.00029992542584329914, "loss": 0.8748, "step": 580 }, { "epoch": 0.11008656379375235, "grad_norm": 0.0575432175173428, "learning_rate": 0.00029990908203066574, "loss": 0.8804, "step": 585 }, { "epoch": 0.11102747459540835, "grad_norm": 0.0522611031206057, "learning_rate": 0.00029989112070359853, "loss": 0.8619, "step": 590 }, { "epoch": 0.11196838539706436, "grad_norm": 0.053009150267980906, "learning_rate": 0.0002998715420558997, "loss": 0.8849, "step": 595 }, { "epoch": 0.11290929619872037, "grad_norm": 0.05254302852123155, "learning_rate": 0.0002998503462988222, "loss": 0.8939, "step": 600 }, { "epoch": 0.11385020700037636, "grad_norm": 0.05929685311259276, "learning_rate": 0.0002998275336610677, "loss": 0.8651, "step": 605 }, { "epoch": 0.11479111780203237, "grad_norm": 0.055640971519342844, "learning_rate": 0.0002998031043887838, "loss": 0.9132, "step": 610 }, { "epoch": 0.11573202860368836, "grad_norm": 0.06515281749731798, "learning_rate": 0.0002997770587455616, "loss": 0.8854, "step": 615 }, { "epoch": 0.11667293940534437, "grad_norm": 0.05226198007608227, "learning_rate": 0.00029974939701243284, "loss": 0.8936, "step": 620 }, { "epoch": 0.11761385020700038, "grad_norm": 0.05246222413311401, "learning_rate": 0.00029972011948786677, "loss": 0.8709, "step": 625 }, { "epoch": 0.11855476100865638, "grad_norm": 0.05531148888103947, "learning_rate": 0.0002996892264877669, "loss": 0.8652, "step": 630 }, { "epoch": 0.11949567181031238, "grad_norm": 0.05499488496867662, "learning_rate": 0.00029965671834546794, "loss": 0.8678, "step": 635 }, { "epoch": 0.12043658261196838, "grad_norm": 0.05755758230328177, "learning_rate": 0.0002996225954117316, "loss": 0.8574, "step": 640 }, { "epoch": 0.12137749341362439, "grad_norm": 0.05120044581677358, "learning_rate": 0.0002995868580547434, "loss": 0.8647, "step": 645 }, { "epoch": 0.1223184042152804, "grad_norm": 0.05183325596536476, "learning_rate": 0.00029954950666010827, "loss": 0.8572, "step": 650 }, { "epoch": 0.12325931501693639, "grad_norm": 0.05295080137171756, "learning_rate": 0.0002995105416308466, "loss": 0.8766, "step": 655 }, { "epoch": 0.1242002258185924, "grad_norm": 0.05489170605693284, "learning_rate": 0.0002994699633873899, "loss": 0.8791, "step": 660 }, { "epoch": 0.1251411366202484, "grad_norm": 0.05376647275139867, "learning_rate": 0.00029942777236757626, "loss": 0.8876, "step": 665 }, { "epoch": 0.1260820474219044, "grad_norm": 0.05253119730747103, "learning_rate": 0.0002993839690266454, "loss": 0.8817, "step": 670 }, { "epoch": 0.1270229582235604, "grad_norm": 0.0517706614738255, "learning_rate": 0.00029933855383723406, "loss": 0.8762, "step": 675 }, { "epoch": 0.12796386902521642, "grad_norm": 0.05981220672493737, "learning_rate": 0.00029929152728937067, "loss": 0.8721, "step": 680 }, { "epoch": 0.1289047798268724, "grad_norm": 0.052639270170391564, "learning_rate": 0.00029924288989047047, "loss": 0.8645, "step": 685 }, { "epoch": 0.1298456906285284, "grad_norm": 0.04870706752581317, "learning_rate": 0.0002991926421653293, "loss": 0.8406, "step": 690 }, { "epoch": 0.13078660143018442, "grad_norm": 0.051293819330064536, "learning_rate": 0.00029914078465611866, "loss": 0.8571, "step": 695 }, { "epoch": 0.13172751223184043, "grad_norm": 0.05202337778091494, "learning_rate": 0.0002990873179223796, "loss": 0.8337, "step": 700 }, { "epoch": 0.13266842303349644, "grad_norm": 0.05886832645070351, "learning_rate": 0.00029903224254101637, "loss": 0.8948, "step": 705 }, { "epoch": 0.13360933383515242, "grad_norm": 0.05784259384372831, "learning_rate": 0.00029897555910629077, "loss": 0.8893, "step": 710 }, { "epoch": 0.13455024463680842, "grad_norm": 0.049720758505550806, "learning_rate": 0.0002989172682298153, "loss": 0.8561, "step": 715 }, { "epoch": 0.13549115543846443, "grad_norm": 0.057567182371576854, "learning_rate": 0.00029885737054054673, "loss": 0.853, "step": 720 }, { "epoch": 0.13643206624012044, "grad_norm": 0.05157693817638803, "learning_rate": 0.00029879586668477936, "loss": 0.8359, "step": 725 }, { "epoch": 0.13737297704177645, "grad_norm": 0.0533272964269685, "learning_rate": 0.0002987327573261379, "loss": 0.908, "step": 730 }, { "epoch": 0.13831388784343243, "grad_norm": 0.05233796692330438, "learning_rate": 0.00029866804314557043, "loss": 0.8819, "step": 735 }, { "epoch": 0.13925479864508844, "grad_norm": 0.0573014910728429, "learning_rate": 0.0002986017248413409, "loss": 0.8973, "step": 740 }, { "epoch": 0.14019570944674445, "grad_norm": 0.052849475942016416, "learning_rate": 0.00029853380312902186, "loss": 0.8419, "step": 745 }, { "epoch": 0.14113662024840046, "grad_norm": 0.06011994209298975, "learning_rate": 0.0002984642787414865, "loss": 0.8804, "step": 750 }, { "epoch": 0.14207753105005647, "grad_norm": 0.05499188180020471, "learning_rate": 0.00029839315242890087, "loss": 0.8634, "step": 755 }, { "epoch": 0.14301844185171245, "grad_norm": 0.05499133991676239, "learning_rate": 0.00029832042495871576, "loss": 0.8583, "step": 760 }, { "epoch": 0.14395935265336846, "grad_norm": 0.05426291762052608, "learning_rate": 0.00029824609711565824, "loss": 0.8725, "step": 765 }, { "epoch": 0.14490026345502446, "grad_norm": 0.047322871594277255, "learning_rate": 0.0002981701697017236, "loss": 0.8561, "step": 770 }, { "epoch": 0.14584117425668047, "grad_norm": 0.05466867627510976, "learning_rate": 0.0002980926435361662, "loss": 0.8737, "step": 775 }, { "epoch": 0.14678208505833648, "grad_norm": 0.051544041730181685, "learning_rate": 0.0002980135194554911, "loss": 0.8562, "step": 780 }, { "epoch": 0.14772299585999246, "grad_norm": 0.0512691349349204, "learning_rate": 0.00029793279831344475, "loss": 0.8512, "step": 785 }, { "epoch": 0.14866390666164847, "grad_norm": 0.04756716736330136, "learning_rate": 0.0002978504809810057, "loss": 0.8612, "step": 790 }, { "epoch": 0.14960481746330448, "grad_norm": 0.04882280569803268, "learning_rate": 0.00029776656834637553, "loss": 0.8514, "step": 795 }, { "epoch": 0.1505457282649605, "grad_norm": 0.054783783095015, "learning_rate": 0.00029768106131496905, "loss": 0.8796, "step": 800 }, { "epoch": 0.1514866390666165, "grad_norm": 0.0531473951393396, "learning_rate": 0.0002975939608094045, "loss": 0.8565, "step": 805 }, { "epoch": 0.15242754986827248, "grad_norm": 0.05185385237557039, "learning_rate": 0.00029750526776949364, "loss": 0.8371, "step": 810 }, { "epoch": 0.15336846066992849, "grad_norm": 0.0505734130462192, "learning_rate": 0.00029741498315223174, "loss": 0.8626, "step": 815 }, { "epoch": 0.1543093714715845, "grad_norm": 0.055795428961054876, "learning_rate": 0.000297323107931787, "loss": 0.8642, "step": 820 }, { "epoch": 0.1552502822732405, "grad_norm": 0.048819133510848926, "learning_rate": 0.0002972296430994903, "loss": 0.8873, "step": 825 }, { "epoch": 0.1561911930748965, "grad_norm": 0.05322784832445891, "learning_rate": 0.00029713458966382434, "loss": 0.8578, "step": 830 }, { "epoch": 0.1571321038765525, "grad_norm": 0.05681519640146074, "learning_rate": 0.00029703794865041283, "loss": 0.8791, "step": 835 }, { "epoch": 0.1580730146782085, "grad_norm": 0.05999945592046972, "learning_rate": 0.0002969397211020093, "loss": 0.849, "step": 840 }, { "epoch": 0.1590139254798645, "grad_norm": 0.04571088658610958, "learning_rate": 0.00029683990807848596, "loss": 0.8483, "step": 845 }, { "epoch": 0.15995483628152052, "grad_norm": 0.06089448054645934, "learning_rate": 0.00029673851065682244, "loss": 0.863, "step": 850 }, { "epoch": 0.16089574708317653, "grad_norm": 0.06061988663918843, "learning_rate": 0.00029663552993109375, "loss": 0.8794, "step": 855 }, { "epoch": 0.1618366578848325, "grad_norm": 0.05182284732026814, "learning_rate": 0.0002965309670124588, "loss": 0.8607, "step": 860 }, { "epoch": 0.16277756868648852, "grad_norm": 0.04524287407933527, "learning_rate": 0.0002964248230291483, "loss": 0.844, "step": 865 }, { "epoch": 0.16371847948814452, "grad_norm": 0.051529772912372, "learning_rate": 0.0002963170991264526, "loss": 0.851, "step": 870 }, { "epoch": 0.16465939028980053, "grad_norm": 0.04587024085305175, "learning_rate": 0.0002962077964667093, "loss": 0.8595, "step": 875 }, { "epoch": 0.16560030109145654, "grad_norm": 0.05060611998562655, "learning_rate": 0.0002960969162292908, "loss": 0.8677, "step": 880 }, { "epoch": 0.16654121189311252, "grad_norm": 0.04499302822555919, "learning_rate": 0.00029598445961059156, "loss": 0.8271, "step": 885 }, { "epoch": 0.16748212269476853, "grad_norm": 0.05129556847823224, "learning_rate": 0.000295870427824015, "loss": 0.8881, "step": 890 }, { "epoch": 0.16842303349642454, "grad_norm": 0.04871115226254147, "learning_rate": 0.00029575482209996055, "loss": 0.8336, "step": 895 }, { "epoch": 0.16936394429808055, "grad_norm": 0.0566108024774645, "learning_rate": 0.0002956376436858106, "loss": 0.8494, "step": 900 }, { "epoch": 0.17030485509973656, "grad_norm": 0.048845770084260345, "learning_rate": 0.00029551889384591665, "loss": 0.8986, "step": 905 }, { "epoch": 0.17124576590139254, "grad_norm": 0.048327818597270665, "learning_rate": 0.0002953985738615858, "loss": 0.8693, "step": 910 }, { "epoch": 0.17218667670304855, "grad_norm": 0.05247614713322588, "learning_rate": 0.000295276685031067, "loss": 0.8485, "step": 915 }, { "epoch": 0.17312758750470456, "grad_norm": 0.047710655547393406, "learning_rate": 0.0002951532286695371, "loss": 0.8581, "step": 920 }, { "epoch": 0.17406849830636056, "grad_norm": 0.048026071717255266, "learning_rate": 0.0002950282061090864, "loss": 0.8417, "step": 925 }, { "epoch": 0.17500940910801657, "grad_norm": 0.047035746855751244, "learning_rate": 0.0002949016186987046, "loss": 0.877, "step": 930 }, { "epoch": 0.17595031990967255, "grad_norm": 0.046963321258380666, "learning_rate": 0.00029477346780426605, "loss": 0.8792, "step": 935 }, { "epoch": 0.17689123071132856, "grad_norm": 0.04969923101676293, "learning_rate": 0.0002946437548085148, "loss": 0.8598, "step": 940 }, { "epoch": 0.17783214151298457, "grad_norm": 0.04621552171294473, "learning_rate": 0.0002945124811110504, "loss": 0.8335, "step": 945 }, { "epoch": 0.17877305231464058, "grad_norm": 0.051617110249976034, "learning_rate": 0.0002943796481283118, "loss": 0.8719, "step": 950 }, { "epoch": 0.1797139631162966, "grad_norm": 0.043058880781635334, "learning_rate": 0.000294245257293563, "loss": 0.8558, "step": 955 }, { "epoch": 0.18065487391795257, "grad_norm": 0.05388724186288067, "learning_rate": 0.00029410931005687696, "loss": 0.8519, "step": 960 }, { "epoch": 0.18159578471960858, "grad_norm": 0.04857900819806878, "learning_rate": 0.00029397180788512026, "loss": 0.8527, "step": 965 }, { "epoch": 0.18253669552126459, "grad_norm": 0.05424964878908578, "learning_rate": 0.0002938327522619371, "loss": 0.8568, "step": 970 }, { "epoch": 0.1834776063229206, "grad_norm": 0.05227365290335698, "learning_rate": 0.0002936921446877334, "loss": 0.8633, "step": 975 }, { "epoch": 0.1844185171245766, "grad_norm": 0.051325801180247675, "learning_rate": 0.0002935499866796607, "loss": 0.8848, "step": 980 }, { "epoch": 0.18535942792623258, "grad_norm": 0.05574668624448867, "learning_rate": 0.00029340627977159957, "loss": 0.8387, "step": 985 }, { "epoch": 0.1863003387278886, "grad_norm": 0.05634617811405129, "learning_rate": 0.0002932610255141431, "loss": 0.8611, "step": 990 }, { "epoch": 0.1872412495295446, "grad_norm": 0.04490025753044342, "learning_rate": 0.0002931142254745804, "loss": 0.8696, "step": 995 }, { "epoch": 0.1881821603312006, "grad_norm": 0.04554083227376874, "learning_rate": 0.0002929658812368794, "loss": 0.8518, "step": 1000 }, { "epoch": 0.18912307113285662, "grad_norm": 0.04781802172474072, "learning_rate": 0.0002928159944016698, "loss": 0.8371, "step": 1005 }, { "epoch": 0.1900639819345126, "grad_norm": 0.0489899852900461, "learning_rate": 0.00029266456658622617, "loss": 0.8826, "step": 1010 }, { "epoch": 0.1910048927361686, "grad_norm": 0.047291616554045394, "learning_rate": 0.0002925115994244499, "loss": 0.8193, "step": 1015 }, { "epoch": 0.19194580353782462, "grad_norm": 0.04800411582285608, "learning_rate": 0.000292357094566852, "loss": 0.8566, "step": 1020 }, { "epoch": 0.19288671433948062, "grad_norm": 0.05017688668028036, "learning_rate": 0.00029220105368053535, "loss": 0.8278, "step": 1025 }, { "epoch": 0.19382762514113663, "grad_norm": 0.05255812021463712, "learning_rate": 0.0002920434784491762, "loss": 0.8518, "step": 1030 }, { "epoch": 0.19476853594279261, "grad_norm": 0.04644990001608543, "learning_rate": 0.00029188437057300654, "loss": 0.8295, "step": 1035 }, { "epoch": 0.19570944674444862, "grad_norm": 0.051889984533039224, "learning_rate": 0.00029172373176879554, "loss": 0.8511, "step": 1040 }, { "epoch": 0.19665035754610463, "grad_norm": 0.05006046094748636, "learning_rate": 0.000291561563769831, "loss": 0.8699, "step": 1045 }, { "epoch": 0.19759126834776064, "grad_norm": 0.04471198696865864, "learning_rate": 0.00029139786832590075, "loss": 0.8204, "step": 1050 }, { "epoch": 0.19853217914941665, "grad_norm": 0.046864689834249275, "learning_rate": 0.00029123264720327355, "loss": 0.8482, "step": 1055 }, { "epoch": 0.19947308995107263, "grad_norm": 0.05135087246231078, "learning_rate": 0.0002910659021846803, "loss": 0.8483, "step": 1060 }, { "epoch": 0.20041400075272864, "grad_norm": 0.04785073870168607, "learning_rate": 0.00029089763506929476, "loss": 0.8503, "step": 1065 }, { "epoch": 0.20135491155438465, "grad_norm": 0.05489000507522068, "learning_rate": 0.0002907278476727139, "loss": 0.8273, "step": 1070 }, { "epoch": 0.20229582235604066, "grad_norm": 0.04772968129164812, "learning_rate": 0.0002905565418269386, "loss": 0.869, "step": 1075 }, { "epoch": 0.20323673315769666, "grad_norm": 0.0468360392868491, "learning_rate": 0.0002903837193803537, "loss": 0.8593, "step": 1080 }, { "epoch": 0.20417764395935264, "grad_norm": 0.05048292893225177, "learning_rate": 0.00029020938219770815, "loss": 0.8509, "step": 1085 }, { "epoch": 0.20511855476100865, "grad_norm": 0.050674095814671125, "learning_rate": 0.0002900335321600949, "loss": 0.8687, "step": 1090 }, { "epoch": 0.20605946556266466, "grad_norm": 0.04788733400998382, "learning_rate": 0.00028985617116493044, "loss": 0.8472, "step": 1095 }, { "epoch": 0.20700037636432067, "grad_norm": 0.04582231610895905, "learning_rate": 0.0002896773011259345, "loss": 0.8471, "step": 1100 }, { "epoch": 0.20794128716597668, "grad_norm": 0.0443686344112356, "learning_rate": 0.0002894969239731094, "loss": 0.8372, "step": 1105 }, { "epoch": 0.20888219796763266, "grad_norm": 0.06909605924612522, "learning_rate": 0.00028931504165271915, "loss": 0.8449, "step": 1110 }, { "epoch": 0.20982310876928867, "grad_norm": 0.053698108677639904, "learning_rate": 0.0002891316561272684, "loss": 0.8488, "step": 1115 }, { "epoch": 0.21076401957094468, "grad_norm": 0.04652295229508992, "learning_rate": 0.0002889467693754814, "loss": 0.8185, "step": 1120 }, { "epoch": 0.21170493037260069, "grad_norm": 0.048539262759809874, "learning_rate": 0.0002887603833922806, "loss": 0.8384, "step": 1125 }, { "epoch": 0.21264584117425667, "grad_norm": 0.04776277603446453, "learning_rate": 0.00028857250018876504, "loss": 0.8314, "step": 1130 }, { "epoch": 0.21358675197591268, "grad_norm": 0.04923483629023402, "learning_rate": 0.0002883831217921889, "loss": 0.8522, "step": 1135 }, { "epoch": 0.21452766277756868, "grad_norm": 0.04258834829093326, "learning_rate": 0.00028819225024593915, "loss": 0.8478, "step": 1140 }, { "epoch": 0.2154685735792247, "grad_norm": 0.04700949749104464, "learning_rate": 0.00028799988760951404, "loss": 0.8493, "step": 1145 }, { "epoch": 0.2164094843808807, "grad_norm": 0.05210570592265317, "learning_rate": 0.00028780603595850054, "loss": 0.8432, "step": 1150 }, { "epoch": 0.21735039518253668, "grad_norm": 0.04967685491585681, "learning_rate": 0.0002876106973845521, "loss": 0.8752, "step": 1155 }, { "epoch": 0.2182913059841927, "grad_norm": 0.04829576891783422, "learning_rate": 0.00028741387399536597, "loss": 0.8335, "step": 1160 }, { "epoch": 0.2192322167858487, "grad_norm": 0.04740474919455731, "learning_rate": 0.00028721556791466056, "loss": 0.8192, "step": 1165 }, { "epoch": 0.2201731275875047, "grad_norm": 0.048080208780232926, "learning_rate": 0.0002870157812821525, "loss": 0.8368, "step": 1170 }, { "epoch": 0.22111403838916072, "grad_norm": 0.044197737136251934, "learning_rate": 0.0002868145162535333, "loss": 0.8462, "step": 1175 }, { "epoch": 0.2220549491908167, "grad_norm": 0.0469168238676528, "learning_rate": 0.0002866117750004466, "loss": 0.8455, "step": 1180 }, { "epoch": 0.2229958599924727, "grad_norm": 0.058277013619873835, "learning_rate": 0.00028640755971046436, "loss": 0.8337, "step": 1185 }, { "epoch": 0.22393677079412871, "grad_norm": 0.04233910712545471, "learning_rate": 0.00028620187258706335, "loss": 0.8564, "step": 1190 }, { "epoch": 0.22487768159578472, "grad_norm": 0.04697864084139272, "learning_rate": 0.00028599471584960136, "loss": 0.824, "step": 1195 }, { "epoch": 0.22581859239744073, "grad_norm": 0.04788326739922656, "learning_rate": 0.0002857860917332933, "loss": 0.8129, "step": 1200 }, { "epoch": 0.2267595031990967, "grad_norm": 0.04394998686275356, "learning_rate": 0.0002855760024891869, "loss": 0.8271, "step": 1205 }, { "epoch": 0.22770041400075272, "grad_norm": 0.04704392654736644, "learning_rate": 0.0002853644503841389, "loss": 0.844, "step": 1210 }, { "epoch": 0.22864132480240873, "grad_norm": 0.05122535529682313, "learning_rate": 0.0002851514377007901, "loss": 0.8494, "step": 1215 }, { "epoch": 0.22958223560406474, "grad_norm": 0.0502260796215441, "learning_rate": 0.00028493696673754067, "loss": 0.8433, "step": 1220 }, { "epoch": 0.23052314640572075, "grad_norm": 0.043494324671621835, "learning_rate": 0.0002847210398085259, "loss": 0.8443, "step": 1225 }, { "epoch": 0.23146405720737673, "grad_norm": 0.05049651139793639, "learning_rate": 0.00028450365924359073, "loss": 0.8042, "step": 1230 }, { "epoch": 0.23240496800903274, "grad_norm": 0.05610756684714545, "learning_rate": 0.000284284827388265, "loss": 0.8256, "step": 1235 }, { "epoch": 0.23334587881068874, "grad_norm": 0.047968557913656215, "learning_rate": 0.00028406454660373753, "loss": 0.8365, "step": 1240 }, { "epoch": 0.23428678961234475, "grad_norm": 0.042261765294726174, "learning_rate": 0.0002838428192668315, "loss": 0.855, "step": 1245 }, { "epoch": 0.23522770041400076, "grad_norm": 0.05021581504261803, "learning_rate": 0.00028361964776997794, "loss": 0.8191, "step": 1250 }, { "epoch": 0.23616861121565674, "grad_norm": 0.05030219142064272, "learning_rate": 0.00028339503452119063, "loss": 0.8325, "step": 1255 }, { "epoch": 0.23710952201731275, "grad_norm": 0.04874399457655963, "learning_rate": 0.0002831689819440397, "loss": 0.8423, "step": 1260 }, { "epoch": 0.23805043281896876, "grad_norm": 0.04744286047223305, "learning_rate": 0.00028294149247762545, "loss": 0.859, "step": 1265 }, { "epoch": 0.23899134362062477, "grad_norm": 0.04338685105449176, "learning_rate": 0.00028271256857655244, "loss": 0.8065, "step": 1270 }, { "epoch": 0.23993225442228078, "grad_norm": 0.047153540286283435, "learning_rate": 0.0002824822127109026, "loss": 0.85, "step": 1275 }, { "epoch": 0.24087316522393676, "grad_norm": 0.050020086051956694, "learning_rate": 0.0002822504273662086, "loss": 0.8611, "step": 1280 }, { "epoch": 0.24181407602559277, "grad_norm": 0.04356866414442012, "learning_rate": 0.0002820172150434274, "loss": 0.8375, "step": 1285 }, { "epoch": 0.24275498682724878, "grad_norm": 0.04859564580518782, "learning_rate": 0.0002817825782589127, "loss": 0.851, "step": 1290 }, { "epoch": 0.24369589762890478, "grad_norm": 0.04736842332804661, "learning_rate": 0.0002815465195443884, "loss": 0.8232, "step": 1295 }, { "epoch": 0.2446368084305608, "grad_norm": 0.04546754699344671, "learning_rate": 0.0002813090414469208, "loss": 0.802, "step": 1300 }, { "epoch": 0.24557771923221677, "grad_norm": 0.04668790834139243, "learning_rate": 0.0002810701465288913, "loss": 0.8164, "step": 1305 }, { "epoch": 0.24651863003387278, "grad_norm": 0.04806767256497892, "learning_rate": 0.0002808298373679688, "loss": 0.844, "step": 1310 }, { "epoch": 0.2474595408355288, "grad_norm": 0.04430706443035298, "learning_rate": 0.00028058811655708193, "loss": 0.8402, "step": 1315 }, { "epoch": 0.2484004516371848, "grad_norm": 0.046991071291030626, "learning_rate": 0.00028034498670439085, "loss": 0.8165, "step": 1320 }, { "epoch": 0.2493413624388408, "grad_norm": 0.043876851948307184, "learning_rate": 0.00028010045043325925, "loss": 0.8373, "step": 1325 }, { "epoch": 0.2502822732404968, "grad_norm": 0.04464800453485912, "learning_rate": 0.000279854510382226, "loss": 0.8491, "step": 1330 }, { "epoch": 0.2512231840421528, "grad_norm": 0.05208965977018001, "learning_rate": 0.0002796071692049769, "loss": 0.8519, "step": 1335 }, { "epoch": 0.2521640948438088, "grad_norm": 0.0464137513178153, "learning_rate": 0.00027935842957031563, "loss": 0.8362, "step": 1340 }, { "epoch": 0.2531050056454648, "grad_norm": 0.048796355664881524, "learning_rate": 0.00027910829416213527, "loss": 0.8505, "step": 1345 }, { "epoch": 0.2540459164471208, "grad_norm": 0.048709351068813245, "learning_rate": 0.0002788567656793893, "loss": 0.8544, "step": 1350 }, { "epoch": 0.2549868272487768, "grad_norm": 0.045892021177835134, "learning_rate": 0.00027860384683606236, "loss": 0.8271, "step": 1355 }, { "epoch": 0.25592773805043284, "grad_norm": 0.05265800000922237, "learning_rate": 0.00027834954036114114, "loss": 0.885, "step": 1360 }, { "epoch": 0.2568686488520888, "grad_norm": 0.04551430213945619, "learning_rate": 0.00027809384899858474, "loss": 0.8225, "step": 1365 }, { "epoch": 0.2578095596537448, "grad_norm": 0.04793102535540072, "learning_rate": 0.00027783677550729515, "loss": 0.8339, "step": 1370 }, { "epoch": 0.25875047045540084, "grad_norm": 0.04205095375417017, "learning_rate": 0.0002775783226610875, "loss": 0.808, "step": 1375 }, { "epoch": 0.2596913812570568, "grad_norm": 0.047025422686677955, "learning_rate": 0.00027731849324866026, "loss": 0.8525, "step": 1380 }, { "epoch": 0.26063229205871286, "grad_norm": 0.043984335241227406, "learning_rate": 0.00027705729007356476, "loss": 0.8241, "step": 1385 }, { "epoch": 0.26157320286036884, "grad_norm": 0.04490799345162232, "learning_rate": 0.00027679471595417536, "loss": 0.843, "step": 1390 }, { "epoch": 0.2625141136620248, "grad_norm": 0.050208847690443474, "learning_rate": 0.00027653077372365886, "loss": 0.8233, "step": 1395 }, { "epoch": 0.26345502446368085, "grad_norm": 0.04338736887570603, "learning_rate": 0.00027626546622994374, "loss": 0.8177, "step": 1400 }, { "epoch": 0.26439593526533683, "grad_norm": 0.043029065881549575, "learning_rate": 0.00027599879633568994, "loss": 0.8201, "step": 1405 }, { "epoch": 0.26533684606699287, "grad_norm": 0.04528220902743319, "learning_rate": 0.0002757307669182575, "loss": 0.8182, "step": 1410 }, { "epoch": 0.26627775686864885, "grad_norm": 0.04189572516578376, "learning_rate": 0.0002754613808696756, "loss": 0.8093, "step": 1415 }, { "epoch": 0.26721866767030483, "grad_norm": 0.050018443121759355, "learning_rate": 0.00027519064109661153, "loss": 0.8408, "step": 1420 }, { "epoch": 0.26815957847196087, "grad_norm": 0.04477497802076473, "learning_rate": 0.00027491855052033925, "loss": 0.8037, "step": 1425 }, { "epoch": 0.26910048927361685, "grad_norm": 0.04392548330312801, "learning_rate": 0.00027464511207670773, "loss": 0.8507, "step": 1430 }, { "epoch": 0.2700414000752729, "grad_norm": 0.049777060488396405, "learning_rate": 0.0002743703287161095, "loss": 0.7992, "step": 1435 }, { "epoch": 0.27098231087692887, "grad_norm": 0.042140230036610965, "learning_rate": 0.00027409420340344866, "loss": 0.8167, "step": 1440 }, { "epoch": 0.27192322167858485, "grad_norm": 0.05002427399165945, "learning_rate": 0.00027381673911810897, "loss": 0.8121, "step": 1445 }, { "epoch": 0.2728641324802409, "grad_norm": 0.04163684703416787, "learning_rate": 0.00027353793885392155, "loss": 0.8244, "step": 1450 }, { "epoch": 0.27380504328189686, "grad_norm": 0.044059819695211666, "learning_rate": 0.00027325780561913277, "loss": 0.8164, "step": 1455 }, { "epoch": 0.2747459540835529, "grad_norm": 0.04563812665390404, "learning_rate": 0.00027297634243637176, "loss": 0.8366, "step": 1460 }, { "epoch": 0.2756868648852089, "grad_norm": 0.04827848887354726, "learning_rate": 0.00027269355234261773, "loss": 0.8281, "step": 1465 }, { "epoch": 0.27662777568686486, "grad_norm": 0.04999760434208785, "learning_rate": 0.000272409438389167, "loss": 0.8467, "step": 1470 }, { "epoch": 0.2775686864885209, "grad_norm": 0.05162297961569586, "learning_rate": 0.00027212400364160075, "loss": 0.8487, "step": 1475 }, { "epoch": 0.2785095972901769, "grad_norm": 0.04646244942560704, "learning_rate": 0.000271837251179751, "loss": 0.8084, "step": 1480 }, { "epoch": 0.2794505080918329, "grad_norm": 0.05039420456957765, "learning_rate": 0.0002715491840976682, "loss": 0.8471, "step": 1485 }, { "epoch": 0.2803914188934889, "grad_norm": 0.042544806912826326, "learning_rate": 0.00027125980550358743, "loss": 0.7844, "step": 1490 }, { "epoch": 0.2813323296951449, "grad_norm": 0.045410393598617226, "learning_rate": 0.0002709691185198948, "loss": 0.8618, "step": 1495 }, { "epoch": 0.2822732404968009, "grad_norm": 0.04318140661358377, "learning_rate": 0.0002706771262830941, "loss": 0.8097, "step": 1500 }, { "epoch": 0.2832141512984569, "grad_norm": 0.03985460964577001, "learning_rate": 0.0002703838319437727, "loss": 0.8328, "step": 1505 }, { "epoch": 0.28415506210011293, "grad_norm": 0.045705481658739236, "learning_rate": 0.0002700892386665675, "loss": 0.8194, "step": 1510 }, { "epoch": 0.2850959729017689, "grad_norm": 0.048028831085993116, "learning_rate": 0.0002697933496301311, "loss": 0.8338, "step": 1515 }, { "epoch": 0.2860368837034249, "grad_norm": 0.04693051612093597, "learning_rate": 0.00026949616802709716, "loss": 0.8425, "step": 1520 }, { "epoch": 0.28697779450508093, "grad_norm": 0.04357129292417143, "learning_rate": 0.0002691976970640461, "loss": 0.8246, "step": 1525 }, { "epoch": 0.2879187053067369, "grad_norm": 0.04698363226395185, "learning_rate": 0.00026889793996147057, "loss": 0.8242, "step": 1530 }, { "epoch": 0.28885961610839295, "grad_norm": 0.046155464725970596, "learning_rate": 0.00026859689995374056, "loss": 0.8137, "step": 1535 }, { "epoch": 0.28980052691004893, "grad_norm": 0.05204306070924651, "learning_rate": 0.0002682945802890686, "loss": 0.8264, "step": 1540 }, { "epoch": 0.2907414377117049, "grad_norm": 0.043611637100207754, "learning_rate": 0.00026799098422947474, "loss": 0.821, "step": 1545 }, { "epoch": 0.29168234851336095, "grad_norm": 0.05503848520813138, "learning_rate": 0.00026768611505075115, "loss": 0.8291, "step": 1550 }, { "epoch": 0.2926232593150169, "grad_norm": 0.05240199900772415, "learning_rate": 0.000267379976042427, "loss": 0.8345, "step": 1555 }, { "epoch": 0.29356417011667296, "grad_norm": 0.04555420445769968, "learning_rate": 0.0002670725705077329, "loss": 0.8361, "step": 1560 }, { "epoch": 0.29450508091832894, "grad_norm": 0.04207672123987573, "learning_rate": 0.0002667639017635651, "loss": 0.8083, "step": 1565 }, { "epoch": 0.2954459917199849, "grad_norm": 0.043154758358048634, "learning_rate": 0.0002664539731404502, "loss": 0.8507, "step": 1570 }, { "epoch": 0.29638690252164096, "grad_norm": 0.048023978799794254, "learning_rate": 0.0002661427879825084, "loss": 0.8311, "step": 1575 }, { "epoch": 0.29732781332329694, "grad_norm": 0.04479903851737126, "learning_rate": 0.0002658303496474182, "loss": 0.8177, "step": 1580 }, { "epoch": 0.298268724124953, "grad_norm": 0.04075712688100381, "learning_rate": 0.0002655166615063797, "loss": 0.8231, "step": 1585 }, { "epoch": 0.29920963492660896, "grad_norm": 0.04516519211583324, "learning_rate": 0.00026520172694407835, "loss": 0.8342, "step": 1590 }, { "epoch": 0.30015054572826494, "grad_norm": 0.04163573857623555, "learning_rate": 0.0002648855493586485, "loss": 0.8217, "step": 1595 }, { "epoch": 0.301091456529921, "grad_norm": 0.04703202421649344, "learning_rate": 0.00026456813216163674, "loss": 0.8375, "step": 1600 }, { "epoch": 0.30203236733157696, "grad_norm": 0.0435527583985287, "learning_rate": 0.0002642494787779649, "loss": 0.7929, "step": 1605 }, { "epoch": 0.302973278133233, "grad_norm": 0.04112659440879095, "learning_rate": 0.0002639295926458934, "loss": 0.8409, "step": 1610 }, { "epoch": 0.303914188934889, "grad_norm": 0.0455623778767916, "learning_rate": 0.0002636084772169838, "loss": 0.8398, "step": 1615 }, { "epoch": 0.30485509973654495, "grad_norm": 0.04205050261795213, "learning_rate": 0.00026328613595606173, "loss": 0.8041, "step": 1620 }, { "epoch": 0.305796010538201, "grad_norm": 0.04850951164010859, "learning_rate": 0.0002629625723411797, "loss": 0.7985, "step": 1625 }, { "epoch": 0.30673692133985697, "grad_norm": 0.046499953964171024, "learning_rate": 0.0002626377898635792, "loss": 0.8404, "step": 1630 }, { "epoch": 0.307677832141513, "grad_norm": 0.05087753755535387, "learning_rate": 0.00026231179202765336, "loss": 0.8247, "step": 1635 }, { "epoch": 0.308618742943169, "grad_norm": 0.05107289807504844, "learning_rate": 0.00026198458235090886, "loss": 0.8373, "step": 1640 }, { "epoch": 0.30955965374482497, "grad_norm": 0.047849970649591435, "learning_rate": 0.00026165616436392815, "loss": 0.8526, "step": 1645 }, { "epoch": 0.310500564546481, "grad_norm": 0.0452846113299356, "learning_rate": 0.00026132654161033133, "loss": 0.8257, "step": 1650 }, { "epoch": 0.311441475348137, "grad_norm": 0.046584198634579535, "learning_rate": 0.00026099571764673786, "loss": 0.8272, "step": 1655 }, { "epoch": 0.312382386149793, "grad_norm": 0.04195956869333814, "learning_rate": 0.00026066369604272835, "loss": 0.8295, "step": 1660 }, { "epoch": 0.313323296951449, "grad_norm": 0.047276897040517386, "learning_rate": 0.00026033048038080563, "loss": 0.851, "step": 1665 }, { "epoch": 0.314264207753105, "grad_norm": 0.04611674028446184, "learning_rate": 0.0002599960742563566, "loss": 0.8269, "step": 1670 }, { "epoch": 0.315205118554761, "grad_norm": 0.04599603285571535, "learning_rate": 0.0002596604812776133, "loss": 0.8253, "step": 1675 }, { "epoch": 0.316146029356417, "grad_norm": 0.04173568003131483, "learning_rate": 0.00025932370506561364, "loss": 0.8123, "step": 1680 }, { "epoch": 0.31708694015807304, "grad_norm": 0.0425349576791169, "learning_rate": 0.0002589857492541627, "loss": 0.8343, "step": 1685 }, { "epoch": 0.318027850959729, "grad_norm": 0.04610564920795079, "learning_rate": 0.0002586466174897934, "loss": 0.8215, "step": 1690 }, { "epoch": 0.318968761761385, "grad_norm": 0.04293233769403105, "learning_rate": 0.00025830631343172727, "loss": 0.7962, "step": 1695 }, { "epoch": 0.31990967256304104, "grad_norm": 0.040726277131415696, "learning_rate": 0.00025796484075183465, "loss": 0.8119, "step": 1700 }, { "epoch": 0.320850583364697, "grad_norm": 0.0444419737096656, "learning_rate": 0.00025762220313459535, "loss": 0.7843, "step": 1705 }, { "epoch": 0.32179149416635305, "grad_norm": 0.048133240264347225, "learning_rate": 0.0002572784042770588, "loss": 0.8307, "step": 1710 }, { "epoch": 0.32273240496800903, "grad_norm": 0.045535597036081774, "learning_rate": 0.0002569334478888044, "loss": 0.8228, "step": 1715 }, { "epoch": 0.323673315769665, "grad_norm": 0.04973249883926558, "learning_rate": 0.0002565873376919008, "loss": 0.8245, "step": 1720 }, { "epoch": 0.32461422657132105, "grad_norm": 0.044317466816763997, "learning_rate": 0.0002562400774208668, "loss": 0.7937, "step": 1725 }, { "epoch": 0.32555513737297703, "grad_norm": 0.04422454108768711, "learning_rate": 0.00025589167082263, "loss": 0.8492, "step": 1730 }, { "epoch": 0.32649604817463307, "grad_norm": 0.057793214248518535, "learning_rate": 0.0002555421216564869, "loss": 0.838, "step": 1735 }, { "epoch": 0.32743695897628905, "grad_norm": 0.050491165924182006, "learning_rate": 0.00025519143369406253, "loss": 0.8136, "step": 1740 }, { "epoch": 0.32837786977794503, "grad_norm": 0.039453461402937505, "learning_rate": 0.00025483961071926924, "loss": 0.833, "step": 1745 }, { "epoch": 0.32931878057960107, "grad_norm": 0.0427804162848213, "learning_rate": 0.00025448665652826627, "loss": 0.8182, "step": 1750 }, { "epoch": 0.33025969138125705, "grad_norm": 0.04278324709181072, "learning_rate": 0.0002541325749294186, "loss": 0.8501, "step": 1755 }, { "epoch": 0.3312006021829131, "grad_norm": 0.04007795533618747, "learning_rate": 0.0002537773697432559, "loss": 0.8122, "step": 1760 }, { "epoch": 0.33214151298456907, "grad_norm": 0.043309227799558356, "learning_rate": 0.0002534210448024313, "loss": 0.8096, "step": 1765 }, { "epoch": 0.33308242378622505, "grad_norm": 0.03941333694707204, "learning_rate": 0.0002530636039516801, "loss": 0.7938, "step": 1770 }, { "epoch": 0.3340233345878811, "grad_norm": 0.10747125716613212, "learning_rate": 0.0002527050510477782, "loss": 0.8307, "step": 1775 }, { "epoch": 0.33496424538953706, "grad_norm": 0.04590988600337092, "learning_rate": 0.00025234538995950047, "loss": 0.8122, "step": 1780 }, { "epoch": 0.3359051561911931, "grad_norm": 0.04172494378663382, "learning_rate": 0.00025198462456757915, "loss": 0.8345, "step": 1785 }, { "epoch": 0.3368460669928491, "grad_norm": 0.045093200155212766, "learning_rate": 0.0002516227587646619, "loss": 0.8347, "step": 1790 }, { "epoch": 0.33778697779450506, "grad_norm": 0.04401063743206211, "learning_rate": 0.0002512597964552696, "loss": 0.8285, "step": 1795 }, { "epoch": 0.3387278885961611, "grad_norm": 0.04213320681777121, "learning_rate": 0.00025089574155575463, "loss": 0.8113, "step": 1800 }, { "epoch": 0.3396687993978171, "grad_norm": 0.041757171559943365, "learning_rate": 0.0002505305979942582, "loss": 0.8252, "step": 1805 }, { "epoch": 0.3406097101994731, "grad_norm": 0.04786009269846987, "learning_rate": 0.00025016436971066837, "loss": 0.8228, "step": 1810 }, { "epoch": 0.3415506210011291, "grad_norm": 0.041726632525803914, "learning_rate": 0.000249797060656577, "loss": 0.824, "step": 1815 }, { "epoch": 0.3424915318027851, "grad_norm": 0.04595397683798158, "learning_rate": 0.00024942867479523764, "loss": 0.8391, "step": 1820 }, { "epoch": 0.3434324426044411, "grad_norm": 0.046502559025392236, "learning_rate": 0.0002490592161015227, "loss": 0.7999, "step": 1825 }, { "epoch": 0.3443733534060971, "grad_norm": 0.042052936652114195, "learning_rate": 0.00024868868856188, "loss": 0.8248, "step": 1830 }, { "epoch": 0.34531426420775313, "grad_norm": 0.05203965598763733, "learning_rate": 0.0002483170961742905, "loss": 0.812, "step": 1835 }, { "epoch": 0.3462551750094091, "grad_norm": 0.044682898482156765, "learning_rate": 0.00024794444294822486, "loss": 0.8477, "step": 1840 }, { "epoch": 0.3471960858110651, "grad_norm": 0.04216691856239276, "learning_rate": 0.0002475707329046, "loss": 0.7979, "step": 1845 }, { "epoch": 0.34813699661272113, "grad_norm": 0.04345355127718824, "learning_rate": 0.0002471959700757358, "loss": 0.8135, "step": 1850 }, { "epoch": 0.3490779074143771, "grad_norm": 0.042957408510867594, "learning_rate": 0.00024682015850531193, "loss": 0.8064, "step": 1855 }, { "epoch": 0.35001881821603315, "grad_norm": 0.04072622346447232, "learning_rate": 0.00024644330224832375, "loss": 0.8111, "step": 1860 }, { "epoch": 0.3509597290176891, "grad_norm": 0.045585863196081676, "learning_rate": 0.0002460654053710388, "loss": 0.8015, "step": 1865 }, { "epoch": 0.3519006398193451, "grad_norm": 0.04805889924747525, "learning_rate": 0.0002456864719509529, "loss": 0.8289, "step": 1870 }, { "epoch": 0.35284155062100114, "grad_norm": 0.040060429859775856, "learning_rate": 0.0002453065060767461, "loss": 0.8073, "step": 1875 }, { "epoch": 0.3537824614226571, "grad_norm": 0.04018127215217534, "learning_rate": 0.0002449255118482386, "loss": 0.798, "step": 1880 }, { "epoch": 0.35472337222431316, "grad_norm": 0.04853936791390769, "learning_rate": 0.0002445434933763466, "loss": 0.8419, "step": 1885 }, { "epoch": 0.35566428302596914, "grad_norm": 0.04367744556341427, "learning_rate": 0.0002441604547830378, "loss": 0.838, "step": 1890 }, { "epoch": 0.3566051938276251, "grad_norm": 0.04205522207764387, "learning_rate": 0.0002437764002012868, "loss": 0.8158, "step": 1895 }, { "epoch": 0.35754610462928116, "grad_norm": 0.04343667094277572, "learning_rate": 0.00024339133377503103, "loss": 0.8208, "step": 1900 }, { "epoch": 0.35848701543093714, "grad_norm": 0.03959475872321939, "learning_rate": 0.0002430052596591255, "loss": 0.814, "step": 1905 }, { "epoch": 0.3594279262325932, "grad_norm": 0.04256159840011562, "learning_rate": 0.00024261818201929813, "loss": 0.8369, "step": 1910 }, { "epoch": 0.36036883703424916, "grad_norm": 0.04672817842045811, "learning_rate": 0.00024223010503210483, "loss": 0.8023, "step": 1915 }, { "epoch": 0.36130974783590514, "grad_norm": 0.04766444087870317, "learning_rate": 0.00024184103288488456, "loss": 0.7827, "step": 1920 }, { "epoch": 0.3622506586375612, "grad_norm": 0.04922825160196635, "learning_rate": 0.0002414509697757139, "loss": 0.8146, "step": 1925 }, { "epoch": 0.36319156943921715, "grad_norm": 0.04305093896719169, "learning_rate": 0.00024105991991336197, "loss": 0.8519, "step": 1930 }, { "epoch": 0.3641324802408732, "grad_norm": 0.0509218525892877, "learning_rate": 0.00024066788751724483, "loss": 0.8323, "step": 1935 }, { "epoch": 0.36507339104252917, "grad_norm": 0.041068187949031935, "learning_rate": 0.00024027487681738016, "loss": 0.8015, "step": 1940 }, { "epoch": 0.36601430184418515, "grad_norm": 0.04156745894982358, "learning_rate": 0.0002398808920543414, "loss": 0.804, "step": 1945 }, { "epoch": 0.3669552126458412, "grad_norm": 0.03811372093805523, "learning_rate": 0.00023948593747921226, "loss": 0.7874, "step": 1950 }, { "epoch": 0.36789612344749717, "grad_norm": 0.04142805534519082, "learning_rate": 0.0002390900173535405, "loss": 0.8635, "step": 1955 }, { "epoch": 0.3688370342491532, "grad_norm": 0.041550575720770686, "learning_rate": 0.00023869313594929222, "loss": 0.813, "step": 1960 }, { "epoch": 0.3697779450508092, "grad_norm": 0.04523950919170376, "learning_rate": 0.00023829529754880574, "loss": 0.8025, "step": 1965 }, { "epoch": 0.37071885585246517, "grad_norm": 0.04169165852779439, "learning_rate": 0.00023789650644474527, "loss": 0.8074, "step": 1970 }, { "epoch": 0.3716597666541212, "grad_norm": 0.04010793628530859, "learning_rate": 0.0002374967669400547, "loss": 0.8093, "step": 1975 }, { "epoch": 0.3726006774557772, "grad_norm": 0.04324010723990065, "learning_rate": 0.00023709608334791113, "loss": 0.8072, "step": 1980 }, { "epoch": 0.3735415882574332, "grad_norm": 0.042840112648990035, "learning_rate": 0.00023669445999167834, "loss": 0.8213, "step": 1985 }, { "epoch": 0.3744824990590892, "grad_norm": 0.04684866437996544, "learning_rate": 0.00023629190120486002, "loss": 0.8238, "step": 1990 }, { "epoch": 0.3754234098607452, "grad_norm": 0.0411724284111032, "learning_rate": 0.00023588841133105338, "loss": 0.8117, "step": 1995 }, { "epoch": 0.3763643206624012, "grad_norm": 0.04450924105296537, "learning_rate": 0.00023548399472390178, "loss": 0.8218, "step": 2000 }, { "epoch": 0.3773052314640572, "grad_norm": 0.04011920691264243, "learning_rate": 0.0002350786557470482, "loss": 0.8244, "step": 2005 }, { "epoch": 0.37824614226571324, "grad_norm": 0.04712123262049659, "learning_rate": 0.00023467239877408773, "loss": 0.8227, "step": 2010 }, { "epoch": 0.3791870530673692, "grad_norm": 0.044712637958609615, "learning_rate": 0.00023426522818852086, "loss": 0.8216, "step": 2015 }, { "epoch": 0.3801279638690252, "grad_norm": 0.04150493938713221, "learning_rate": 0.00023385714838370582, "loss": 0.8148, "step": 2020 }, { "epoch": 0.38106887467068123, "grad_norm": 0.048742142112492115, "learning_rate": 0.0002334481637628112, "loss": 0.8348, "step": 2025 }, { "epoch": 0.3820097854723372, "grad_norm": 0.0415305433820995, "learning_rate": 0.0002330382787387687, "loss": 0.8151, "step": 2030 }, { "epoch": 0.38295069627399325, "grad_norm": 0.0453536583022637, "learning_rate": 0.00023262749773422518, "loss": 0.8157, "step": 2035 }, { "epoch": 0.38389160707564923, "grad_norm": 0.044005851429759116, "learning_rate": 0.00023221582518149526, "loss": 0.7728, "step": 2040 }, { "epoch": 0.3848325178773052, "grad_norm": 0.04867846228714434, "learning_rate": 0.00023180326552251323, "loss": 0.7984, "step": 2045 }, { "epoch": 0.38577342867896125, "grad_norm": 0.04136667249456415, "learning_rate": 0.0002313898232087852, "loss": 0.799, "step": 2050 }, { "epoch": 0.38671433948061723, "grad_norm": 0.0405221793452781, "learning_rate": 0.00023097550270134124, "loss": 0.8146, "step": 2055 }, { "epoch": 0.38765525028227327, "grad_norm": 0.04778677014197557, "learning_rate": 0.000230560308470687, "loss": 0.8383, "step": 2060 }, { "epoch": 0.38859616108392925, "grad_norm": 0.04391158932493037, "learning_rate": 0.00023014424499675555, "loss": 0.8648, "step": 2065 }, { "epoch": 0.38953707188558523, "grad_norm": 0.046018778807677026, "learning_rate": 0.00022972731676885913, "loss": 0.8043, "step": 2070 }, { "epoch": 0.39047798268724127, "grad_norm": 0.04560887374781656, "learning_rate": 0.00022930952828564073, "loss": 0.7997, "step": 2075 }, { "epoch": 0.39141889348889725, "grad_norm": 0.040583749331980695, "learning_rate": 0.00022889088405502522, "loss": 0.8008, "step": 2080 }, { "epoch": 0.3923598042905533, "grad_norm": 0.040731714295921484, "learning_rate": 0.00022847138859417114, "loss": 0.7964, "step": 2085 }, { "epoch": 0.39330071509220926, "grad_norm": 0.0428768624710387, "learning_rate": 0.00022805104642942186, "loss": 0.8211, "step": 2090 }, { "epoch": 0.39424162589386524, "grad_norm": 0.03600894815100002, "learning_rate": 0.00022762986209625644, "loss": 0.8043, "step": 2095 }, { "epoch": 0.3951825366955213, "grad_norm": 0.03886248110556022, "learning_rate": 0.00022720784013924102, "loss": 0.7894, "step": 2100 }, { "epoch": 0.39612344749717726, "grad_norm": 0.04550262987220453, "learning_rate": 0.00022678498511197976, "loss": 0.8123, "step": 2105 }, { "epoch": 0.3970643582988333, "grad_norm": 0.04253985842426856, "learning_rate": 0.0002263613015770655, "loss": 0.8094, "step": 2110 }, { "epoch": 0.3980052691004893, "grad_norm": 0.04139895688562898, "learning_rate": 0.00022593679410603062, "loss": 0.8101, "step": 2115 }, { "epoch": 0.39894617990214526, "grad_norm": 0.04611695349566767, "learning_rate": 0.00022551146727929793, "loss": 0.7969, "step": 2120 }, { "epoch": 0.3998870907038013, "grad_norm": 0.03767361228914821, "learning_rate": 0.00022508532568613087, "loss": 0.777, "step": 2125 }, { "epoch": 0.4008280015054573, "grad_norm": 0.03917014861794376, "learning_rate": 0.0002246583739245843, "loss": 0.8062, "step": 2130 }, { "epoch": 0.4017689123071133, "grad_norm": 0.047039791387090205, "learning_rate": 0.00022423061660145467, "loss": 0.8055, "step": 2135 }, { "epoch": 0.4027098231087693, "grad_norm": 0.03980335720604302, "learning_rate": 0.00022380205833223062, "loss": 0.8114, "step": 2140 }, { "epoch": 0.4036507339104253, "grad_norm": 0.040218404097981254, "learning_rate": 0.00022337270374104268, "loss": 0.7878, "step": 2145 }, { "epoch": 0.4045916447120813, "grad_norm": 0.03770658683678954, "learning_rate": 0.0002229425574606139, "loss": 0.7626, "step": 2150 }, { "epoch": 0.4055325555137373, "grad_norm": 0.044949683935009095, "learning_rate": 0.00022251162413220956, "loss": 0.8187, "step": 2155 }, { "epoch": 0.40647346631539333, "grad_norm": 0.03983952278617522, "learning_rate": 0.0002220799084055872, "loss": 0.7852, "step": 2160 }, { "epoch": 0.4074143771170493, "grad_norm": 0.05174431759947435, "learning_rate": 0.00022164741493894624, "loss": 0.8323, "step": 2165 }, { "epoch": 0.4083552879187053, "grad_norm": 0.04243001886382076, "learning_rate": 0.00022121414839887813, "loss": 0.7876, "step": 2170 }, { "epoch": 0.4092961987203613, "grad_norm": 0.037037966238162086, "learning_rate": 0.00022078011346031572, "loss": 0.7927, "step": 2175 }, { "epoch": 0.4102371095220173, "grad_norm": 0.04184803898066816, "learning_rate": 0.0002203453148064826, "loss": 0.7871, "step": 2180 }, { "epoch": 0.41117802032367334, "grad_norm": 0.0428590620625003, "learning_rate": 0.00021990975712884322, "loss": 0.8127, "step": 2185 }, { "epoch": 0.4121189311253293, "grad_norm": 0.03582098140612815, "learning_rate": 0.0002194734451270515, "loss": 0.7809, "step": 2190 }, { "epoch": 0.4130598419269853, "grad_norm": 0.039671091493742434, "learning_rate": 0.00021903638350890078, "loss": 0.8145, "step": 2195 }, { "epoch": 0.41400075272864134, "grad_norm": 0.043079186328936876, "learning_rate": 0.00021859857699027256, "loss": 0.8163, "step": 2200 }, { "epoch": 0.4149416635302973, "grad_norm": 0.04821222253606962, "learning_rate": 0.00021816003029508587, "loss": 0.8108, "step": 2205 }, { "epoch": 0.41588257433195336, "grad_norm": 0.04643606468121251, "learning_rate": 0.0002177207481552462, "loss": 0.8143, "step": 2210 }, { "epoch": 0.41682348513360934, "grad_norm": 0.05088614638476515, "learning_rate": 0.0002172807353105945, "loss": 0.8093, "step": 2215 }, { "epoch": 0.4177643959352653, "grad_norm": 0.04275749284477858, "learning_rate": 0.00021683999650885598, "loss": 0.8043, "step": 2220 }, { "epoch": 0.41870530673692136, "grad_norm": 0.04036391126072423, "learning_rate": 0.00021639853650558884, "loss": 0.7994, "step": 2225 }, { "epoch": 0.41964621753857734, "grad_norm": 0.0431659963056544, "learning_rate": 0.00021595636006413308, "loss": 0.8197, "step": 2230 }, { "epoch": 0.4205871283402333, "grad_norm": 0.04225940473147677, "learning_rate": 0.00021551347195555916, "loss": 0.7891, "step": 2235 }, { "epoch": 0.42152803914188935, "grad_norm": 0.04116212403602597, "learning_rate": 0.00021506987695861618, "loss": 0.8418, "step": 2240 }, { "epoch": 0.42246894994354534, "grad_norm": 0.04039520834463586, "learning_rate": 0.00021462557985968075, "loss": 0.7952, "step": 2245 }, { "epoch": 0.42340986074520137, "grad_norm": 0.048662820756663734, "learning_rate": 0.000214180585452705, "loss": 0.8019, "step": 2250 }, { "epoch": 0.42435077154685735, "grad_norm": 0.045423020413219564, "learning_rate": 0.00021373489853916499, "loss": 0.7975, "step": 2255 }, { "epoch": 0.42529168234851333, "grad_norm": 0.04640593668762107, "learning_rate": 0.00021328852392800906, "loss": 0.8312, "step": 2260 }, { "epoch": 0.42623259315016937, "grad_norm": 0.04277782133320467, "learning_rate": 0.00021284146643560562, "loss": 0.8104, "step": 2265 }, { "epoch": 0.42717350395182535, "grad_norm": 0.04040549088992285, "learning_rate": 0.00021239373088569142, "loss": 0.7887, "step": 2270 }, { "epoch": 0.4281144147534814, "grad_norm": 0.03889124568442978, "learning_rate": 0.00021194532210931945, "loss": 0.8061, "step": 2275 }, { "epoch": 0.42905532555513737, "grad_norm": 0.043868472214118666, "learning_rate": 0.00021149624494480674, "loss": 0.7947, "step": 2280 }, { "epoch": 0.42999623635679335, "grad_norm": 0.03981679327310523, "learning_rate": 0.00021104650423768218, "loss": 0.7891, "step": 2285 }, { "epoch": 0.4309371471584494, "grad_norm": 0.04037233565136616, "learning_rate": 0.00021059610484063437, "loss": 0.8016, "step": 2290 }, { "epoch": 0.43187805796010537, "grad_norm": 0.041805266956483865, "learning_rate": 0.00021014505161345915, "loss": 0.8037, "step": 2295 }, { "epoch": 0.4328189687617614, "grad_norm": 0.03748382253401212, "learning_rate": 0.00020969334942300702, "loss": 0.8073, "step": 2300 }, { "epoch": 0.4337598795634174, "grad_norm": 0.06578354317476753, "learning_rate": 0.00020924100314313092, "loss": 0.8075, "step": 2305 }, { "epoch": 0.43470079036507336, "grad_norm": 0.04239179218818634, "learning_rate": 0.00020878801765463343, "loss": 0.8027, "step": 2310 }, { "epoch": 0.4356417011667294, "grad_norm": 0.042779310945398136, "learning_rate": 0.00020833439784521423, "loss": 0.8102, "step": 2315 }, { "epoch": 0.4365826119683854, "grad_norm": 0.04206757037096745, "learning_rate": 0.00020788014860941717, "loss": 0.8008, "step": 2320 }, { "epoch": 0.4375235227700414, "grad_norm": 0.04286947710250131, "learning_rate": 0.00020742527484857778, "loss": 0.8038, "step": 2325 }, { "epoch": 0.4384644335716974, "grad_norm": 0.04156846594623706, "learning_rate": 0.00020696978147077006, "loss": 0.7946, "step": 2330 }, { "epoch": 0.4394053443733534, "grad_norm": 0.041413097020774656, "learning_rate": 0.00020651367339075366, "loss": 0.7986, "step": 2335 }, { "epoch": 0.4403462551750094, "grad_norm": 0.04220729277505702, "learning_rate": 0.00020605695552992093, "loss": 0.8294, "step": 2340 }, { "epoch": 0.4412871659766654, "grad_norm": 0.03874549638262173, "learning_rate": 0.00020559963281624376, "loss": 0.7638, "step": 2345 }, { "epoch": 0.44222807677832143, "grad_norm": 0.04109206825097649, "learning_rate": 0.00020514171018422015, "loss": 0.8068, "step": 2350 }, { "epoch": 0.4431689875799774, "grad_norm": 0.043772113393048005, "learning_rate": 0.0002046831925748215, "loss": 0.8087, "step": 2355 }, { "epoch": 0.4441098983816334, "grad_norm": 0.04314130063844934, "learning_rate": 0.00020422408493543878, "loss": 0.8176, "step": 2360 }, { "epoch": 0.44505080918328943, "grad_norm": 0.04227709213575997, "learning_rate": 0.00020376439221982953, "loss": 0.7848, "step": 2365 }, { "epoch": 0.4459917199849454, "grad_norm": 0.047483555049165516, "learning_rate": 0.0002033041193880641, "loss": 0.7906, "step": 2370 }, { "epoch": 0.44693263078660145, "grad_norm": 0.04328503293360676, "learning_rate": 0.00020284327140647238, "loss": 0.8087, "step": 2375 }, { "epoch": 0.44787354158825743, "grad_norm": 0.043229994930452834, "learning_rate": 0.00020238185324759005, "loss": 0.7977, "step": 2380 }, { "epoch": 0.4488144523899134, "grad_norm": 0.04138845464473692, "learning_rate": 0.00020191986989010497, "loss": 0.805, "step": 2385 }, { "epoch": 0.44975536319156945, "grad_norm": 0.042452324658930944, "learning_rate": 0.0002014573263188036, "loss": 0.7953, "step": 2390 }, { "epoch": 0.4506962739932254, "grad_norm": 0.044241588342132626, "learning_rate": 0.0002009942275245169, "loss": 0.8349, "step": 2395 }, { "epoch": 0.45163718479488146, "grad_norm": 0.04179408805570128, "learning_rate": 0.00020053057850406687, "loss": 0.7995, "step": 2400 }, { "epoch": 0.45257809559653744, "grad_norm": 0.048094796140699454, "learning_rate": 0.00020006638426021226, "loss": 0.811, "step": 2405 }, { "epoch": 0.4535190063981934, "grad_norm": 0.041380019778814475, "learning_rate": 0.00019960164980159484, "loss": 0.8142, "step": 2410 }, { "epoch": 0.45445991719984946, "grad_norm": 0.04287459814218423, "learning_rate": 0.0001991363801426853, "loss": 0.7988, "step": 2415 }, { "epoch": 0.45540082800150544, "grad_norm": 0.050121488389194564, "learning_rate": 0.00019867058030372916, "loss": 0.8017, "step": 2420 }, { "epoch": 0.4563417388031615, "grad_norm": 0.04424030813075273, "learning_rate": 0.00019820425531069235, "loss": 0.7994, "step": 2425 }, { "epoch": 0.45728264960481746, "grad_norm": 0.03912669330614894, "learning_rate": 0.0001977374101952075, "loss": 0.8172, "step": 2430 }, { "epoch": 0.45822356040647344, "grad_norm": 0.04228385679812487, "learning_rate": 0.00019727004999451917, "loss": 0.7907, "step": 2435 }, { "epoch": 0.4591644712081295, "grad_norm": 0.04975994860517981, "learning_rate": 0.00019680217975142963, "loss": 0.816, "step": 2440 }, { "epoch": 0.46010538200978546, "grad_norm": 0.044510558468411905, "learning_rate": 0.00019633380451424473, "loss": 0.7835, "step": 2445 }, { "epoch": 0.4610462928114415, "grad_norm": 0.04486213312302885, "learning_rate": 0.00019586492933671885, "loss": 0.8223, "step": 2450 }, { "epoch": 0.4619872036130975, "grad_norm": 0.04563165216828993, "learning_rate": 0.00019539555927800098, "loss": 0.795, "step": 2455 }, { "epoch": 0.46292811441475346, "grad_norm": 0.040840435500794685, "learning_rate": 0.00019492569940257972, "loss": 0.7782, "step": 2460 }, { "epoch": 0.4638690252164095, "grad_norm": 0.04957991037956064, "learning_rate": 0.0001944553547802289, "loss": 0.8075, "step": 2465 }, { "epoch": 0.4648099360180655, "grad_norm": 0.042607119092549925, "learning_rate": 0.00019398453048595268, "loss": 0.8056, "step": 2470 }, { "epoch": 0.4657508468197215, "grad_norm": 0.04192235755301655, "learning_rate": 0.00019351323159993083, "loss": 0.8347, "step": 2475 }, { "epoch": 0.4666917576213775, "grad_norm": 0.03928466435364261, "learning_rate": 0.00019304146320746397, "loss": 0.7899, "step": 2480 }, { "epoch": 0.46763266842303347, "grad_norm": 0.040044034579578336, "learning_rate": 0.00019256923039891877, "loss": 0.7933, "step": 2485 }, { "epoch": 0.4685735792246895, "grad_norm": 0.04312279650442732, "learning_rate": 0.00019209653826967276, "loss": 0.7883, "step": 2490 }, { "epoch": 0.4695144900263455, "grad_norm": 0.04085403683152945, "learning_rate": 0.00019162339192005972, "loss": 0.7579, "step": 2495 }, { "epoch": 0.4704554008280015, "grad_norm": 0.043873630317500224, "learning_rate": 0.00019114979645531437, "loss": 0.7929, "step": 2500 }, { "epoch": 0.4713963116296575, "grad_norm": 0.040154803681733534, "learning_rate": 0.00019067575698551728, "loss": 0.79, "step": 2505 }, { "epoch": 0.4723372224313135, "grad_norm": 0.03684855703789701, "learning_rate": 0.00019020127862554, "loss": 0.7897, "step": 2510 }, { "epoch": 0.4732781332329695, "grad_norm": 0.04422268883476745, "learning_rate": 0.0001897263664949896, "loss": 0.792, "step": 2515 }, { "epoch": 0.4742190440346255, "grad_norm": 0.043857810173637005, "learning_rate": 0.00018925102571815344, "loss": 0.7851, "step": 2520 }, { "epoch": 0.47515995483628154, "grad_norm": 0.042539435296578026, "learning_rate": 0.00018877526142394404, "loss": 0.7886, "step": 2525 }, { "epoch": 0.4761008656379375, "grad_norm": 0.04214932919499374, "learning_rate": 0.00018829907874584376, "loss": 0.7906, "step": 2530 }, { "epoch": 0.4770417764395935, "grad_norm": 0.0384479257214575, "learning_rate": 0.0001878224828218491, "loss": 0.7825, "step": 2535 }, { "epoch": 0.47798268724124954, "grad_norm": 0.04162096424062071, "learning_rate": 0.0001873454787944156, "loss": 0.7957, "step": 2540 }, { "epoch": 0.4789235980429055, "grad_norm": 0.03883485962874196, "learning_rate": 0.0001868680718104023, "loss": 0.7909, "step": 2545 }, { "epoch": 0.47986450884456155, "grad_norm": 0.03749476318685997, "learning_rate": 0.0001863902670210159, "loss": 0.7817, "step": 2550 }, { "epoch": 0.48080541964621754, "grad_norm": 0.03910242002103014, "learning_rate": 0.0001859120695817556, "loss": 0.7817, "step": 2555 }, { "epoch": 0.4817463304478735, "grad_norm": 0.038950913555839455, "learning_rate": 0.0001854334846523572, "loss": 0.7796, "step": 2560 }, { "epoch": 0.48268724124952955, "grad_norm": 0.04324847711598021, "learning_rate": 0.00018495451739673757, "loss": 0.797, "step": 2565 }, { "epoch": 0.48362815205118553, "grad_norm": 0.039000544800866135, "learning_rate": 0.0001844751729829388, "loss": 0.7777, "step": 2570 }, { "epoch": 0.48456906285284157, "grad_norm": 0.040232908364916414, "learning_rate": 0.0001839954565830725, "loss": 0.7885, "step": 2575 }, { "epoch": 0.48550997365449755, "grad_norm": 0.03871796270192426, "learning_rate": 0.00018351537337326404, "loss": 0.8016, "step": 2580 }, { "epoch": 0.48645088445615353, "grad_norm": 0.037364272818028524, "learning_rate": 0.0001830349285335967, "loss": 0.7796, "step": 2585 }, { "epoch": 0.48739179525780957, "grad_norm": 0.03990561450519654, "learning_rate": 0.00018255412724805557, "loss": 0.7863, "step": 2590 }, { "epoch": 0.48833270605946555, "grad_norm": 0.04185146560129521, "learning_rate": 0.00018207297470447206, "loss": 0.8122, "step": 2595 }, { "epoch": 0.4892736168611216, "grad_norm": 0.040063511223070405, "learning_rate": 0.00018159147609446728, "loss": 0.7948, "step": 2600 }, { "epoch": 0.49021452766277757, "grad_norm": 0.044541898554158056, "learning_rate": 0.00018110963661339675, "loss": 0.7987, "step": 2605 }, { "epoch": 0.49115543846443355, "grad_norm": 0.04209970580203627, "learning_rate": 0.00018062746146029374, "loss": 0.7982, "step": 2610 }, { "epoch": 0.4920963492660896, "grad_norm": 0.03954771319272124, "learning_rate": 0.00018014495583781344, "loss": 0.7631, "step": 2615 }, { "epoch": 0.49303726006774556, "grad_norm": 0.043084260115585415, "learning_rate": 0.00017966212495217686, "loss": 0.77, "step": 2620 }, { "epoch": 0.4939781708694016, "grad_norm": 0.045997910488941696, "learning_rate": 0.00017917897401311465, "loss": 0.8029, "step": 2625 }, { "epoch": 0.4949190816710576, "grad_norm": 0.04061225203519024, "learning_rate": 0.0001786955082338106, "loss": 0.7776, "step": 2630 }, { "epoch": 0.49585999247271356, "grad_norm": 0.03975991003304855, "learning_rate": 0.00017821173283084584, "loss": 0.7995, "step": 2635 }, { "epoch": 0.4968009032743696, "grad_norm": 0.04073622084361536, "learning_rate": 0.00017772765302414228, "loss": 0.7881, "step": 2640 }, { "epoch": 0.4977418140760256, "grad_norm": 0.04246319256025743, "learning_rate": 0.0001772432740369062, "loss": 0.7851, "step": 2645 }, { "epoch": 0.4986827248776816, "grad_norm": 0.040200342333617425, "learning_rate": 0.00017675860109557225, "loss": 0.7874, "step": 2650 }, { "epoch": 0.4996236356793376, "grad_norm": 0.04313427466781039, "learning_rate": 0.00017627363942974663, "loss": 0.7662, "step": 2655 }, { "epoch": 0.5005645464809936, "grad_norm": 0.04470911106937541, "learning_rate": 0.00017578839427215102, "loss": 0.777, "step": 2660 }, { "epoch": 0.5015054572826496, "grad_norm": 0.04174530809793399, "learning_rate": 0.00017530287085856583, "loss": 0.7591, "step": 2665 }, { "epoch": 0.5024463680843057, "grad_norm": 0.04344420655267131, "learning_rate": 0.00017481707442777402, "loss": 0.7915, "step": 2670 }, { "epoch": 0.5033872788859616, "grad_norm": 0.04411942787559072, "learning_rate": 0.0001743310102215042, "loss": 0.8101, "step": 2675 }, { "epoch": 0.5043281896876176, "grad_norm": 0.03995728857524497, "learning_rate": 0.00017384468348437447, "loss": 0.7676, "step": 2680 }, { "epoch": 0.5052691004892736, "grad_norm": 0.04144099448309982, "learning_rate": 0.00017335809946383542, "loss": 0.7839, "step": 2685 }, { "epoch": 0.5062100112909296, "grad_norm": 0.03635972182712348, "learning_rate": 0.00017287126341011396, "loss": 0.7652, "step": 2690 }, { "epoch": 0.5071509220925856, "grad_norm": 0.04144440650346267, "learning_rate": 0.00017238418057615611, "loss": 0.7691, "step": 2695 }, { "epoch": 0.5080918328942416, "grad_norm": 0.0388799423795147, "learning_rate": 0.000171896856217571, "loss": 0.7745, "step": 2700 }, { "epoch": 0.5090327436958977, "grad_norm": 0.04098955077677498, "learning_rate": 0.0001714092955925735, "loss": 0.7638, "step": 2705 }, { "epoch": 0.5099736544975536, "grad_norm": 0.05176471641470045, "learning_rate": 0.000170921503961928, "loss": 0.7915, "step": 2710 }, { "epoch": 0.5109145652992096, "grad_norm": 0.03903056061857688, "learning_rate": 0.00017043348658889133, "loss": 0.7582, "step": 2715 }, { "epoch": 0.5118554761008657, "grad_norm": 0.03918100612671428, "learning_rate": 0.00016994524873915613, "loss": 0.7876, "step": 2720 }, { "epoch": 0.5127963869025216, "grad_norm": 0.04124280214103889, "learning_rate": 0.0001694567956807939, "loss": 0.7761, "step": 2725 }, { "epoch": 0.5137372977041776, "grad_norm": 0.042496128838645876, "learning_rate": 0.00016896813268419824, "loss": 0.7563, "step": 2730 }, { "epoch": 0.5146782085058337, "grad_norm": 0.042727195504920386, "learning_rate": 0.00016847926502202814, "loss": 0.7715, "step": 2735 }, { "epoch": 0.5156191193074896, "grad_norm": 0.0453239652902863, "learning_rate": 0.00016799019796915067, "loss": 0.7786, "step": 2740 }, { "epoch": 0.5165600301091456, "grad_norm": 0.045118257069757896, "learning_rate": 0.00016750093680258454, "loss": 0.7824, "step": 2745 }, { "epoch": 0.5175009409108017, "grad_norm": 0.038825006570890906, "learning_rate": 0.00016701148680144277, "loss": 0.7749, "step": 2750 }, { "epoch": 0.5184418517124577, "grad_norm": 0.041921133819207645, "learning_rate": 0.00016652185324687605, "loss": 0.7762, "step": 2755 }, { "epoch": 0.5193827625141136, "grad_norm": 0.04014068407855972, "learning_rate": 0.0001660320414220155, "loss": 0.7965, "step": 2760 }, { "epoch": 0.5203236733157697, "grad_norm": 0.0389817444179057, "learning_rate": 0.0001655420566119158, "loss": 0.7803, "step": 2765 }, { "epoch": 0.5212645841174257, "grad_norm": 0.04084074773574928, "learning_rate": 0.00016505190410349817, "loss": 0.7728, "step": 2770 }, { "epoch": 0.5222054949190816, "grad_norm": 0.03931726740857862, "learning_rate": 0.00016456158918549328, "loss": 0.7673, "step": 2775 }, { "epoch": 0.5231464057207377, "grad_norm": 0.04263330287791737, "learning_rate": 0.00016407111714838407, "loss": 0.7768, "step": 2780 }, { "epoch": 0.5240873165223937, "grad_norm": 0.03868707213111586, "learning_rate": 0.00016358049328434903, "loss": 0.7975, "step": 2785 }, { "epoch": 0.5250282273240496, "grad_norm": 0.04065607527535718, "learning_rate": 0.00016308972288720466, "loss": 0.793, "step": 2790 }, { "epoch": 0.5259691381257057, "grad_norm": 0.03586574957051923, "learning_rate": 0.00016259881125234863, "loss": 0.7704, "step": 2795 }, { "epoch": 0.5269100489273617, "grad_norm": 0.04097298758383249, "learning_rate": 0.00016210776367670253, "loss": 0.7503, "step": 2800 }, { "epoch": 0.5278509597290177, "grad_norm": 0.044803423222512996, "learning_rate": 0.00016161658545865473, "loss": 0.806, "step": 2805 }, { "epoch": 0.5287918705306737, "grad_norm": 0.04270347978524801, "learning_rate": 0.00016112528189800334, "loss": 0.8056, "step": 2810 }, { "epoch": 0.5297327813323297, "grad_norm": 0.04047987808565142, "learning_rate": 0.00016063385829589874, "loss": 0.754, "step": 2815 }, { "epoch": 0.5306736921339857, "grad_norm": 0.037879635517597286, "learning_rate": 0.0001601423199547867, "loss": 0.7573, "step": 2820 }, { "epoch": 0.5316146029356417, "grad_norm": 0.03784641405503706, "learning_rate": 0.00015965067217835093, "loss": 0.7672, "step": 2825 }, { "epoch": 0.5325555137372977, "grad_norm": 0.04190542796765804, "learning_rate": 0.00015915892027145603, "loss": 0.7646, "step": 2830 }, { "epoch": 0.5334964245389537, "grad_norm": 0.0410546092280587, "learning_rate": 0.00015866706954009005, "loss": 0.7823, "step": 2835 }, { "epoch": 0.5344373353406097, "grad_norm": 0.040014735565657274, "learning_rate": 0.00015817512529130748, "loss": 0.7619, "step": 2840 }, { "epoch": 0.5353782461422657, "grad_norm": 0.04109448908128682, "learning_rate": 0.00015768309283317175, "loss": 0.7749, "step": 2845 }, { "epoch": 0.5363191569439217, "grad_norm": 0.042386121106642534, "learning_rate": 0.0001571909774746981, "loss": 0.759, "step": 2850 }, { "epoch": 0.5372600677455778, "grad_norm": 0.03948724800684583, "learning_rate": 0.00015669878452579633, "loss": 0.8001, "step": 2855 }, { "epoch": 0.5382009785472337, "grad_norm": 0.039488312405531356, "learning_rate": 0.00015620651929721335, "loss": 0.7394, "step": 2860 }, { "epoch": 0.5391418893488897, "grad_norm": 0.04561518902409399, "learning_rate": 0.00015571418710047597, "loss": 0.8038, "step": 2865 }, { "epoch": 0.5400828001505458, "grad_norm": 0.042232271266558645, "learning_rate": 0.00015522179324783364, "loss": 0.7848, "step": 2870 }, { "epoch": 0.5410237109522017, "grad_norm": 0.042459217752880335, "learning_rate": 0.0001547293430522011, "loss": 0.787, "step": 2875 }, { "epoch": 0.5419646217538577, "grad_norm": 0.04563613956360711, "learning_rate": 0.0001542368418271009, "loss": 0.7922, "step": 2880 }, { "epoch": 0.5429055325555138, "grad_norm": 0.04471760423803294, "learning_rate": 0.0001537442948866063, "loss": 0.7768, "step": 2885 }, { "epoch": 0.5438464433571697, "grad_norm": 0.04367786244918379, "learning_rate": 0.00015325170754528376, "loss": 0.7551, "step": 2890 }, { "epoch": 0.5447873541588257, "grad_norm": 0.04050477501250885, "learning_rate": 0.00015275908511813583, "loss": 0.7591, "step": 2895 }, { "epoch": 0.5457282649604818, "grad_norm": 0.03457399407540036, "learning_rate": 0.00015226643292054335, "loss": 0.7867, "step": 2900 }, { "epoch": 0.5466691757621378, "grad_norm": 0.03851340729404049, "learning_rate": 0.00015177375626820866, "loss": 0.7514, "step": 2905 }, { "epoch": 0.5476100865637937, "grad_norm": 0.04344603262075698, "learning_rate": 0.00015128106047709782, "loss": 0.7818, "step": 2910 }, { "epoch": 0.5485509973654498, "grad_norm": 0.0385972605358632, "learning_rate": 0.00015078835086338333, "loss": 0.776, "step": 2915 }, { "epoch": 0.5494919081671058, "grad_norm": 0.04105948951389995, "learning_rate": 0.00015029563274338711, "loss": 0.7799, "step": 2920 }, { "epoch": 0.5504328189687617, "grad_norm": 0.0373428818729792, "learning_rate": 0.00014980291143352253, "loss": 0.787, "step": 2925 }, { "epoch": 0.5513737297704178, "grad_norm": 0.04145957845710554, "learning_rate": 0.00014931019225023764, "loss": 0.7903, "step": 2930 }, { "epoch": 0.5523146405720738, "grad_norm": 0.03832821967374709, "learning_rate": 0.0001488174805099573, "loss": 0.7743, "step": 2935 }, { "epoch": 0.5532555513737297, "grad_norm": 0.04371721164045727, "learning_rate": 0.00014832478152902633, "loss": 0.7917, "step": 2940 }, { "epoch": 0.5541964621753858, "grad_norm": 0.045192576709012396, "learning_rate": 0.0001478321006236517, "loss": 0.819, "step": 2945 }, { "epoch": 0.5551373729770418, "grad_norm": 0.041868786627357824, "learning_rate": 0.00014733944310984533, "loss": 0.7836, "step": 2950 }, { "epoch": 0.5560782837786977, "grad_norm": 0.040054306103734574, "learning_rate": 0.00014684681430336688, "loss": 0.7581, "step": 2955 }, { "epoch": 0.5570191945803538, "grad_norm": 0.04498485518590034, "learning_rate": 0.00014635421951966613, "loss": 0.7804, "step": 2960 }, { "epoch": 0.5579601053820098, "grad_norm": 0.03795169554357177, "learning_rate": 0.00014586166407382585, "loss": 0.76, "step": 2965 }, { "epoch": 0.5589010161836658, "grad_norm": 0.039089524876466074, "learning_rate": 0.0001453691532805043, "loss": 0.7692, "step": 2970 }, { "epoch": 0.5598419269853218, "grad_norm": 0.03791366653189066, "learning_rate": 0.00014487669245387793, "loss": 0.7979, "step": 2975 }, { "epoch": 0.5607828377869778, "grad_norm": 0.048004979557690994, "learning_rate": 0.00014438428690758415, "loss": 0.746, "step": 2980 }, { "epoch": 0.5617237485886338, "grad_norm": 0.040058739496838455, "learning_rate": 0.00014389194195466373, "loss": 0.7711, "step": 2985 }, { "epoch": 0.5626646593902898, "grad_norm": 0.03652076270611612, "learning_rate": 0.00014339966290750374, "loss": 0.7344, "step": 2990 }, { "epoch": 0.5636055701919458, "grad_norm": 0.042636228941696684, "learning_rate": 0.00014290745507778018, "loss": 0.7819, "step": 2995 }, { "epoch": 0.5645464809936018, "grad_norm": 0.03580210308957461, "learning_rate": 0.00014241532377640056, "loss": 0.7699, "step": 3000 }, { "epoch": 0.5654873917952578, "grad_norm": 0.038702251641132904, "learning_rate": 0.00014192327431344654, "loss": 0.7587, "step": 3005 }, { "epoch": 0.5664283025969138, "grad_norm": 0.043869218575217146, "learning_rate": 0.00014143131199811695, "loss": 0.7598, "step": 3010 }, { "epoch": 0.5673692133985698, "grad_norm": 0.03942521728979009, "learning_rate": 0.00014093944213867027, "loss": 0.7779, "step": 3015 }, { "epoch": 0.5683101242002259, "grad_norm": 0.04379986996024926, "learning_rate": 0.00014044767004236708, "loss": 0.7721, "step": 3020 }, { "epoch": 0.5692510350018818, "grad_norm": 0.041493273088253166, "learning_rate": 0.00013995600101541358, "loss": 0.7864, "step": 3025 }, { "epoch": 0.5701919458035378, "grad_norm": 0.04809266982465616, "learning_rate": 0.0001394644403629035, "loss": 0.7826, "step": 3030 }, { "epoch": 0.5711328566051939, "grad_norm": 0.03877586910463833, "learning_rate": 0.0001389729933887613, "loss": 0.7555, "step": 3035 }, { "epoch": 0.5720737674068498, "grad_norm": 0.0424860755202435, "learning_rate": 0.00013848166539568495, "loss": 0.749, "step": 3040 }, { "epoch": 0.5730146782085058, "grad_norm": 0.04037436043604233, "learning_rate": 0.00013799046168508851, "loss": 0.7779, "step": 3045 }, { "epoch": 0.5739555890101619, "grad_norm": 0.04095161246609761, "learning_rate": 0.00013749938755704504, "loss": 0.7682, "step": 3050 }, { "epoch": 0.5748964998118178, "grad_norm": 0.03996299861070224, "learning_rate": 0.00013700844831022948, "loss": 0.7495, "step": 3055 }, { "epoch": 0.5758374106134738, "grad_norm": 0.04127396019541893, "learning_rate": 0.00013651764924186142, "loss": 0.8048, "step": 3060 }, { "epoch": 0.5767783214151299, "grad_norm": 0.03925183560100803, "learning_rate": 0.0001360269956476477, "loss": 0.7614, "step": 3065 }, { "epoch": 0.5777192322167859, "grad_norm": 0.04104317482073666, "learning_rate": 0.00013553649282172588, "loss": 0.7875, "step": 3070 }, { "epoch": 0.5786601430184418, "grad_norm": 0.04047442818024267, "learning_rate": 0.00013504614605660642, "loss": 0.8087, "step": 3075 }, { "epoch": 0.5796010538200979, "grad_norm": 0.03790235486672248, "learning_rate": 0.00013455596064311593, "loss": 0.7414, "step": 3080 }, { "epoch": 0.5805419646217539, "grad_norm": 0.039394611832599055, "learning_rate": 0.00013406594187034026, "loss": 0.7511, "step": 3085 }, { "epoch": 0.5814828754234098, "grad_norm": 0.04013793427774893, "learning_rate": 0.00013357609502556697, "loss": 0.7666, "step": 3090 }, { "epoch": 0.5824237862250659, "grad_norm": 0.04085601620139591, "learning_rate": 0.00013308642539422858, "loss": 0.7471, "step": 3095 }, { "epoch": 0.5833646970267219, "grad_norm": 0.04129620694156995, "learning_rate": 0.00013259693825984562, "loss": 0.8013, "step": 3100 }, { "epoch": 0.5843056078283778, "grad_norm": 0.03631629884354763, "learning_rate": 0.0001321076389039693, "loss": 0.7534, "step": 3105 }, { "epoch": 0.5852465186300339, "grad_norm": 0.08206893663995983, "learning_rate": 0.00013161853260612474, "loss": 0.7885, "step": 3110 }, { "epoch": 0.5861874294316899, "grad_norm": 0.046338835239627396, "learning_rate": 0.0001311296246437541, "loss": 0.7386, "step": 3115 }, { "epoch": 0.5871283402333459, "grad_norm": 0.04213777780082758, "learning_rate": 0.0001306409202921594, "loss": 0.7798, "step": 3120 }, { "epoch": 0.5880692510350018, "grad_norm": 0.04362316942259357, "learning_rate": 0.00013015242482444564, "loss": 0.8252, "step": 3125 }, { "epoch": 0.5890101618366579, "grad_norm": 0.041766857211591846, "learning_rate": 0.0001296641435114642, "loss": 0.7943, "step": 3130 }, { "epoch": 0.5899510726383139, "grad_norm": 0.04441519901114515, "learning_rate": 0.0001291760816217555, "loss": 0.7652, "step": 3135 }, { "epoch": 0.5908919834399698, "grad_norm": 0.038449411700647515, "learning_rate": 0.00012868824442149242, "loss": 0.7802, "step": 3140 }, { "epoch": 0.5918328942416259, "grad_norm": 0.03939930370264386, "learning_rate": 0.00012820063717442366, "loss": 0.7715, "step": 3145 }, { "epoch": 0.5927738050432819, "grad_norm": 0.040400026135115875, "learning_rate": 0.00012771326514181646, "loss": 0.7749, "step": 3150 }, { "epoch": 0.5937147158449378, "grad_norm": 0.04110702660939482, "learning_rate": 0.00012722613358240022, "loss": 0.7564, "step": 3155 }, { "epoch": 0.5946556266465939, "grad_norm": 0.03784848807322662, "learning_rate": 0.00012673924775230972, "loss": 0.7424, "step": 3160 }, { "epoch": 0.5955965374482499, "grad_norm": 0.04018189304197721, "learning_rate": 0.00012625261290502823, "loss": 0.7974, "step": 3165 }, { "epoch": 0.596537448249906, "grad_norm": 0.03834107117175049, "learning_rate": 0.00012576623429133089, "loss": 0.7441, "step": 3170 }, { "epoch": 0.5974783590515619, "grad_norm": 0.04188273882175529, "learning_rate": 0.00012528011715922822, "loss": 0.7666, "step": 3175 }, { "epoch": 0.5984192698532179, "grad_norm": 0.04048234489441125, "learning_rate": 0.0001247942667539092, "loss": 0.7686, "step": 3180 }, { "epoch": 0.599360180654874, "grad_norm": 0.042753366871858815, "learning_rate": 0.00012430868831768505, "loss": 0.7692, "step": 3185 }, { "epoch": 0.6003010914565299, "grad_norm": 0.03965154904470419, "learning_rate": 0.0001238233870899322, "loss": 0.7621, "step": 3190 }, { "epoch": 0.6012420022581859, "grad_norm": 0.040895100280079935, "learning_rate": 0.00012333836830703615, "loss": 0.7711, "step": 3195 }, { "epoch": 0.602182913059842, "grad_norm": 0.039036996904750175, "learning_rate": 0.00012285363720233484, "loss": 0.7403, "step": 3200 }, { "epoch": 0.6031238238614979, "grad_norm": 0.03984475123838604, "learning_rate": 0.00012236919900606214, "loss": 0.8089, "step": 3205 }, { "epoch": 0.6040647346631539, "grad_norm": 0.03602768052095753, "learning_rate": 0.0001218850589452914, "loss": 0.772, "step": 3210 }, { "epoch": 0.60500564546481, "grad_norm": 0.03775213464617447, "learning_rate": 0.00012140122224387924, "loss": 0.7517, "step": 3215 }, { "epoch": 0.605946556266466, "grad_norm": 0.03568230380993887, "learning_rate": 0.00012091769412240889, "loss": 0.7606, "step": 3220 }, { "epoch": 0.6068874670681219, "grad_norm": 0.03614446877998065, "learning_rate": 0.000120434479798134, "loss": 0.7758, "step": 3225 }, { "epoch": 0.607828377869778, "grad_norm": 0.04101207650613775, "learning_rate": 0.00011995158448492257, "loss": 0.7326, "step": 3230 }, { "epoch": 0.608769288671434, "grad_norm": 0.03872714799207714, "learning_rate": 0.00011946901339320025, "loss": 0.7583, "step": 3235 }, { "epoch": 0.6097101994730899, "grad_norm": 0.04499673060260799, "learning_rate": 0.0001189867717298944, "loss": 0.7808, "step": 3240 }, { "epoch": 0.610651110274746, "grad_norm": 0.037363074205504686, "learning_rate": 0.000118504864698378, "loss": 0.7854, "step": 3245 }, { "epoch": 0.611592021076402, "grad_norm": 0.037838990173753896, "learning_rate": 0.00011802329749841316, "loss": 0.7946, "step": 3250 }, { "epoch": 0.6125329318780579, "grad_norm": 0.04050006759000257, "learning_rate": 0.00011754207532609534, "loss": 0.7541, "step": 3255 }, { "epoch": 0.6134738426797139, "grad_norm": 0.038242266145084615, "learning_rate": 0.00011706120337379718, "loss": 0.7483, "step": 3260 }, { "epoch": 0.61441475348137, "grad_norm": 0.0392400144645598, "learning_rate": 0.00011658068683011241, "loss": 0.7575, "step": 3265 }, { "epoch": 0.615355664283026, "grad_norm": 0.045110458493318586, "learning_rate": 0.0001161005308797998, "loss": 0.7429, "step": 3270 }, { "epoch": 0.6162965750846819, "grad_norm": 0.038540030132832, "learning_rate": 0.00011562074070372764, "loss": 0.7321, "step": 3275 }, { "epoch": 0.617237485886338, "grad_norm": 0.037506266917269986, "learning_rate": 0.00011514132147881717, "loss": 0.7405, "step": 3280 }, { "epoch": 0.618178396687994, "grad_norm": 0.03913471723109424, "learning_rate": 0.0001146622783779873, "loss": 0.7495, "step": 3285 }, { "epoch": 0.6191193074896499, "grad_norm": 0.04109286833361118, "learning_rate": 0.0001141836165700985, "loss": 0.7286, "step": 3290 }, { "epoch": 0.620060218291306, "grad_norm": 0.03854391729454944, "learning_rate": 0.00011370534121989713, "loss": 0.772, "step": 3295 }, { "epoch": 0.621001129092962, "grad_norm": 0.0393768039777761, "learning_rate": 0.00011322745748795964, "loss": 0.7605, "step": 3300 }, { "epoch": 0.6219420398946179, "grad_norm": 0.04333935406275173, "learning_rate": 0.00011274997053063702, "loss": 0.7883, "step": 3305 }, { "epoch": 0.622882950696274, "grad_norm": 0.0390615079213823, "learning_rate": 0.00011227288549999894, "loss": 0.7809, "step": 3310 }, { "epoch": 0.62382386149793, "grad_norm": 0.045178121155615474, "learning_rate": 0.00011179620754377833, "loss": 0.7541, "step": 3315 }, { "epoch": 0.624764772299586, "grad_norm": 0.038276191870259636, "learning_rate": 0.00011131994180531597, "loss": 0.7668, "step": 3320 }, { "epoch": 0.625705683101242, "grad_norm": 0.04593621357066484, "learning_rate": 0.00011084409342350458, "loss": 0.7778, "step": 3325 }, { "epoch": 0.626646593902898, "grad_norm": 0.042697008069698866, "learning_rate": 0.00011036866753273372, "loss": 0.7812, "step": 3330 }, { "epoch": 0.627587504704554, "grad_norm": 0.038210651549027225, "learning_rate": 0.00010989366926283435, "loss": 0.7645, "step": 3335 }, { "epoch": 0.62852841550621, "grad_norm": 0.04016328258057881, "learning_rate": 0.00010941910373902334, "loss": 0.7447, "step": 3340 }, { "epoch": 0.629469326307866, "grad_norm": 0.03979873617240781, "learning_rate": 0.00010894497608184814, "loss": 0.7739, "step": 3345 }, { "epoch": 0.630410237109522, "grad_norm": 0.03825888432646367, "learning_rate": 0.00010847129140713192, "loss": 0.7654, "step": 3350 }, { "epoch": 0.631351147911178, "grad_norm": 0.04160212266379006, "learning_rate": 0.00010799805482591778, "loss": 0.8068, "step": 3355 }, { "epoch": 0.632292058712834, "grad_norm": 0.040102831786925454, "learning_rate": 0.00010752527144441405, "loss": 0.7676, "step": 3360 }, { "epoch": 0.63323296951449, "grad_norm": 0.04073264579453533, "learning_rate": 0.00010705294636393908, "loss": 0.7605, "step": 3365 }, { "epoch": 0.6341738803161461, "grad_norm": 0.03930481140797053, "learning_rate": 0.00010658108468086611, "loss": 0.7421, "step": 3370 }, { "epoch": 0.635114791117802, "grad_norm": 0.04129180654967124, "learning_rate": 0.00010610969148656824, "loss": 0.7459, "step": 3375 }, { "epoch": 0.636055701919458, "grad_norm": 0.04025751491566716, "learning_rate": 0.00010563877186736384, "loss": 0.7532, "step": 3380 }, { "epoch": 0.6369966127211141, "grad_norm": 0.04130711339590857, "learning_rate": 0.00010516833090446123, "loss": 0.7692, "step": 3385 }, { "epoch": 0.63793752352277, "grad_norm": 0.04249719108175661, "learning_rate": 0.000104698373673904, "loss": 0.7721, "step": 3390 }, { "epoch": 0.638878434324426, "grad_norm": 0.03793225541937712, "learning_rate": 0.00010422890524651647, "loss": 0.7605, "step": 3395 }, { "epoch": 0.6398193451260821, "grad_norm": 0.037043396185056116, "learning_rate": 0.0001037599306878486, "loss": 0.7335, "step": 3400 }, { "epoch": 0.640760255927738, "grad_norm": 0.03812826423492339, "learning_rate": 0.0001032914550581217, "loss": 0.7342, "step": 3405 }, { "epoch": 0.641701166729394, "grad_norm": 0.03967669170673079, "learning_rate": 0.00010282348341217352, "loss": 0.7574, "step": 3410 }, { "epoch": 0.6426420775310501, "grad_norm": 0.03737016200455732, "learning_rate": 0.00010235602079940385, "loss": 0.7766, "step": 3415 }, { "epoch": 0.6435829883327061, "grad_norm": 0.03898453606920356, "learning_rate": 0.0001018890722637201, "loss": 0.7677, "step": 3420 }, { "epoch": 0.644523899134362, "grad_norm": 0.042695495342005334, "learning_rate": 0.00010142264284348278, "loss": 0.8001, "step": 3425 }, { "epoch": 0.6454648099360181, "grad_norm": 0.0396946259433925, "learning_rate": 0.00010095673757145103, "loss": 0.7716, "step": 3430 }, { "epoch": 0.6464057207376741, "grad_norm": 0.04406355047386435, "learning_rate": 0.00010049136147472874, "loss": 0.7759, "step": 3435 }, { "epoch": 0.64734663153933, "grad_norm": 0.043764052802193115, "learning_rate": 0.00010002651957470968, "loss": 0.8051, "step": 3440 }, { "epoch": 0.6482875423409861, "grad_norm": 0.041926281617999266, "learning_rate": 9.956221688702384e-05, "loss": 0.7647, "step": 3445 }, { "epoch": 0.6492284531426421, "grad_norm": 0.03895521821927712, "learning_rate": 9.909845842148313e-05, "loss": 0.7602, "step": 3450 }, { "epoch": 0.650169363944298, "grad_norm": 0.03914173750008508, "learning_rate": 9.863524918202729e-05, "loss": 0.7627, "step": 3455 }, { "epoch": 0.6511102747459541, "grad_norm": 0.03993016779305327, "learning_rate": 9.817259416666985e-05, "loss": 0.7646, "step": 3460 }, { "epoch": 0.6520511855476101, "grad_norm": 0.03980886857877244, "learning_rate": 9.77104983674444e-05, "loss": 0.7425, "step": 3465 }, { "epoch": 0.6529920963492661, "grad_norm": 0.0401924210782212, "learning_rate": 9.724896677035061e-05, "loss": 0.6987, "step": 3470 }, { "epoch": 0.6539330071509221, "grad_norm": 0.03664837733116737, "learning_rate": 9.67880043553002e-05, "loss": 0.7429, "step": 3475 }, { "epoch": 0.6548739179525781, "grad_norm": 0.03777329875630416, "learning_rate": 9.632761609606382e-05, "loss": 0.7661, "step": 3480 }, { "epoch": 0.6558148287542341, "grad_norm": 0.03939981829447013, "learning_rate": 9.586780696021662e-05, "loss": 0.7475, "step": 3485 }, { "epoch": 0.6567557395558901, "grad_norm": 0.04466284968667148, "learning_rate": 9.540858190908521e-05, "loss": 0.7922, "step": 3490 }, { "epoch": 0.6576966503575461, "grad_norm": 0.038955559682471524, "learning_rate": 9.494994589769395e-05, "loss": 0.7738, "step": 3495 }, { "epoch": 0.6586375611592021, "grad_norm": 0.0411424120473035, "learning_rate": 9.449190387471146e-05, "loss": 0.7914, "step": 3500 }, { "epoch": 0.6595784719608581, "grad_norm": 0.03649617721160762, "learning_rate": 9.40344607823972e-05, "loss": 0.7332, "step": 3505 }, { "epoch": 0.6605193827625141, "grad_norm": 0.045697152332815814, "learning_rate": 9.357762155654826e-05, "loss": 0.736, "step": 3510 }, { "epoch": 0.6614602935641701, "grad_norm": 0.04346134712991159, "learning_rate": 9.312139112644593e-05, "loss": 0.7753, "step": 3515 }, { "epoch": 0.6624012043658262, "grad_norm": 0.03920510438440831, "learning_rate": 9.266577441480266e-05, "loss": 0.778, "step": 3520 }, { "epoch": 0.6633421151674821, "grad_norm": 0.040739403735231945, "learning_rate": 9.221077633770898e-05, "loss": 0.7421, "step": 3525 }, { "epoch": 0.6642830259691381, "grad_norm": 0.03703007602048622, "learning_rate": 9.175640180458026e-05, "loss": 0.7625, "step": 3530 }, { "epoch": 0.6652239367707942, "grad_norm": 0.04195682496632696, "learning_rate": 9.130265571810383e-05, "loss": 0.7675, "step": 3535 }, { "epoch": 0.6661648475724501, "grad_norm": 0.038248688786009566, "learning_rate": 9.084954297418625e-05, "loss": 0.7286, "step": 3540 }, { "epoch": 0.6671057583741061, "grad_norm": 0.04263400445476122, "learning_rate": 9.039706846190026e-05, "loss": 0.7723, "step": 3545 }, { "epoch": 0.6680466691757622, "grad_norm": 0.04018110083160048, "learning_rate": 8.99452370634319e-05, "loss": 0.7595, "step": 3550 }, { "epoch": 0.6689875799774181, "grad_norm": 0.04129649016774401, "learning_rate": 8.949405365402843e-05, "loss": 0.7386, "step": 3555 }, { "epoch": 0.6699284907790741, "grad_norm": 0.03730989500271481, "learning_rate": 8.904352310194497e-05, "loss": 0.7762, "step": 3560 }, { "epoch": 0.6708694015807302, "grad_norm": 0.04233708639216568, "learning_rate": 8.85936502683925e-05, "loss": 0.7552, "step": 3565 }, { "epoch": 0.6718103123823862, "grad_norm": 0.03856143493937795, "learning_rate": 8.814444000748523e-05, "loss": 0.7539, "step": 3570 }, { "epoch": 0.6727512231840421, "grad_norm": 0.041220356746205916, "learning_rate": 8.76958971661882e-05, "loss": 0.7583, "step": 3575 }, { "epoch": 0.6736921339856982, "grad_norm": 0.03885156037305439, "learning_rate": 8.724802658426502e-05, "loss": 0.7376, "step": 3580 }, { "epoch": 0.6746330447873542, "grad_norm": 0.03874853523714315, "learning_rate": 8.68008330942256e-05, "loss": 0.7656, "step": 3585 }, { "epoch": 0.6755739555890101, "grad_norm": 0.04028531730729849, "learning_rate": 8.635432152127418e-05, "loss": 0.7825, "step": 3590 }, { "epoch": 0.6765148663906662, "grad_norm": 0.04280053960563239, "learning_rate": 8.590849668325693e-05, "loss": 0.754, "step": 3595 }, { "epoch": 0.6774557771923222, "grad_norm": 0.03794786579840221, "learning_rate": 8.546336339061036e-05, "loss": 0.734, "step": 3600 }, { "epoch": 0.6783966879939781, "grad_norm": 0.0402090640543888, "learning_rate": 8.501892644630921e-05, "loss": 0.7534, "step": 3605 }, { "epoch": 0.6793375987956342, "grad_norm": 0.043552980145466276, "learning_rate": 8.457519064581444e-05, "loss": 0.7616, "step": 3610 }, { "epoch": 0.6802785095972902, "grad_norm": 0.04080500922486465, "learning_rate": 8.413216077702196e-05, "loss": 0.7479, "step": 3615 }, { "epoch": 0.6812194203989462, "grad_norm": 0.041554874325141766, "learning_rate": 8.368984162021043e-05, "loss": 0.7544, "step": 3620 }, { "epoch": 0.6821603312006022, "grad_norm": 0.042784250483365954, "learning_rate": 8.324823794799032e-05, "loss": 0.7553, "step": 3625 }, { "epoch": 0.6831012420022582, "grad_norm": 0.038641859923428605, "learning_rate": 8.280735452525167e-05, "loss": 0.7607, "step": 3630 }, { "epoch": 0.6840421528039142, "grad_norm": 0.043016436368934555, "learning_rate": 8.236719610911314e-05, "loss": 0.7391, "step": 3635 }, { "epoch": 0.6849830636055702, "grad_norm": 0.03879736605977712, "learning_rate": 8.192776744887076e-05, "loss": 0.7313, "step": 3640 }, { "epoch": 0.6859239744072262, "grad_norm": 0.04080074250182663, "learning_rate": 8.14890732859464e-05, "loss": 0.7391, "step": 3645 }, { "epoch": 0.6868648852088822, "grad_norm": 0.03990822597090894, "learning_rate": 8.105111835383663e-05, "loss": 0.7714, "step": 3650 }, { "epoch": 0.6878057960105382, "grad_norm": 0.039537316806122595, "learning_rate": 8.061390737806198e-05, "loss": 0.7504, "step": 3655 }, { "epoch": 0.6887467068121942, "grad_norm": 0.03939074291969639, "learning_rate": 8.017744507611544e-05, "loss": 0.715, "step": 3660 }, { "epoch": 0.6896876176138502, "grad_norm": 0.0366601266910447, "learning_rate": 7.974173615741204e-05, "loss": 0.7386, "step": 3665 }, { "epoch": 0.6906285284155063, "grad_norm": 0.038454118392867886, "learning_rate": 7.930678532323778e-05, "loss": 0.7742, "step": 3670 }, { "epoch": 0.6915694392171622, "grad_norm": 0.03849044618209459, "learning_rate": 7.887259726669884e-05, "loss": 0.7567, "step": 3675 }, { "epoch": 0.6925103500188182, "grad_norm": 0.03823520672805604, "learning_rate": 7.84391766726712e-05, "loss": 0.76, "step": 3680 }, { "epoch": 0.6934512608204743, "grad_norm": 0.041103142624627866, "learning_rate": 7.800652821774995e-05, "loss": 0.737, "step": 3685 }, { "epoch": 0.6943921716221302, "grad_norm": 0.04060986038183799, "learning_rate": 7.757465657019864e-05, "loss": 0.7447, "step": 3690 }, { "epoch": 0.6953330824237862, "grad_norm": 0.04010014279749571, "learning_rate": 7.714356638989914e-05, "loss": 0.7119, "step": 3695 }, { "epoch": 0.6962739932254423, "grad_norm": 0.03873233135487847, "learning_rate": 7.67132623283016e-05, "loss": 0.7429, "step": 3700 }, { "epoch": 0.6972149040270982, "grad_norm": 0.04007347045017312, "learning_rate": 7.628374902837363e-05, "loss": 0.7385, "step": 3705 }, { "epoch": 0.6981558148287542, "grad_norm": 0.040519533492342004, "learning_rate": 7.585503112455062e-05, "loss": 0.7534, "step": 3710 }, { "epoch": 0.6990967256304103, "grad_norm": 0.04109428606589565, "learning_rate": 7.542711324268576e-05, "loss": 0.7538, "step": 3715 }, { "epoch": 0.7000376364320663, "grad_norm": 0.03658743414096176, "learning_rate": 7.500000000000002e-05, "loss": 0.7326, "step": 3720 }, { "epoch": 0.7009785472337222, "grad_norm": 0.03980166365958357, "learning_rate": 7.45736960050322e-05, "loss": 0.7409, "step": 3725 }, { "epoch": 0.7019194580353783, "grad_norm": 0.03656884698195228, "learning_rate": 7.414820585758949e-05, "loss": 0.7699, "step": 3730 }, { "epoch": 0.7028603688370343, "grad_norm": 0.03618243018140937, "learning_rate": 7.372353414869766e-05, "loss": 0.7493, "step": 3735 }, { "epoch": 0.7038012796386902, "grad_norm": 0.0400626300749975, "learning_rate": 7.329968546055144e-05, "loss": 0.7505, "step": 3740 }, { "epoch": 0.7047421904403463, "grad_norm": 0.03697501490735565, "learning_rate": 7.287666436646539e-05, "loss": 0.7508, "step": 3745 }, { "epoch": 0.7056831012420023, "grad_norm": 0.03865403994396552, "learning_rate": 7.245447543082414e-05, "loss": 0.7348, "step": 3750 }, { "epoch": 0.7066240120436582, "grad_norm": 0.039335397836340956, "learning_rate": 7.20331232090335e-05, "loss": 0.7493, "step": 3755 }, { "epoch": 0.7075649228453142, "grad_norm": 0.03793808471305577, "learning_rate": 7.161261224747119e-05, "loss": 0.7339, "step": 3760 }, { "epoch": 0.7085058336469703, "grad_norm": 0.037488809656223004, "learning_rate": 7.119294708343755e-05, "loss": 0.7477, "step": 3765 }, { "epoch": 0.7094467444486263, "grad_norm": 0.038730154299705596, "learning_rate": 7.077413224510702e-05, "loss": 0.7421, "step": 3770 }, { "epoch": 0.7103876552502822, "grad_norm": 0.04047318420968464, "learning_rate": 7.0356172251479e-05, "loss": 0.786, "step": 3775 }, { "epoch": 0.7113285660519383, "grad_norm": 0.04012985610521384, "learning_rate": 6.993907161232907e-05, "loss": 0.7564, "step": 3780 }, { "epoch": 0.7122694768535943, "grad_norm": 0.040195166286305296, "learning_rate": 6.952283482816037e-05, "loss": 0.7452, "step": 3785 }, { "epoch": 0.7132103876552502, "grad_norm": 0.036013934570515216, "learning_rate": 6.910746639015518e-05, "loss": 0.7416, "step": 3790 }, { "epoch": 0.7141512984569063, "grad_norm": 0.039404706768370185, "learning_rate": 6.869297078012636e-05, "loss": 0.7566, "step": 3795 }, { "epoch": 0.7150922092585623, "grad_norm": 0.03794240394451752, "learning_rate": 6.827935247046883e-05, "loss": 0.7405, "step": 3800 }, { "epoch": 0.7160331200602182, "grad_norm": 0.03817108747506884, "learning_rate": 6.786661592411162e-05, "loss": 0.7466, "step": 3805 }, { "epoch": 0.7169740308618743, "grad_norm": 0.03872769520991303, "learning_rate": 6.745476559446956e-05, "loss": 0.7595, "step": 3810 }, { "epoch": 0.7179149416635303, "grad_norm": 0.03884711036444254, "learning_rate": 6.704380592539508e-05, "loss": 0.7578, "step": 3815 }, { "epoch": 0.7188558524651864, "grad_norm": 0.03780978653067677, "learning_rate": 6.663374135113059e-05, "loss": 0.7311, "step": 3820 }, { "epoch": 0.7197967632668423, "grad_norm": 0.040555453463663634, "learning_rate": 6.622457629626027e-05, "loss": 0.7664, "step": 3825 }, { "epoch": 0.7207376740684983, "grad_norm": 0.03892829872566159, "learning_rate": 6.581631517566268e-05, "loss": 0.7515, "step": 3830 }, { "epoch": 0.7216785848701543, "grad_norm": 0.03899925932302288, "learning_rate": 6.540896239446293e-05, "loss": 0.7431, "step": 3835 }, { "epoch": 0.7226194956718103, "grad_norm": 0.03724943309525145, "learning_rate": 6.500252234798503e-05, "loss": 0.7305, "step": 3840 }, { "epoch": 0.7235604064734663, "grad_norm": 0.03821337427785788, "learning_rate": 6.459699942170475e-05, "loss": 0.745, "step": 3845 }, { "epoch": 0.7245013172751223, "grad_norm": 0.04022918959169577, "learning_rate": 6.419239799120222e-05, "loss": 0.7662, "step": 3850 }, { "epoch": 0.7254422280767783, "grad_norm": 0.03852879372490541, "learning_rate": 6.378872242211443e-05, "loss": 0.7675, "step": 3855 }, { "epoch": 0.7263831388784343, "grad_norm": 0.03781259503267875, "learning_rate": 6.338597707008859e-05, "loss": 0.7308, "step": 3860 }, { "epoch": 0.7273240496800903, "grad_norm": 0.03951092581019206, "learning_rate": 6.29841662807347e-05, "loss": 0.7387, "step": 3865 }, { "epoch": 0.7282649604817464, "grad_norm": 0.040290629672598395, "learning_rate": 6.258329438957899e-05, "loss": 0.7406, "step": 3870 }, { "epoch": 0.7292058712834023, "grad_norm": 0.03694512622132562, "learning_rate": 6.218336572201705e-05, "loss": 0.7193, "step": 3875 }, { "epoch": 0.7301467820850583, "grad_norm": 0.03646511173753941, "learning_rate": 6.178438459326689e-05, "loss": 0.7402, "step": 3880 }, { "epoch": 0.7310876928867144, "grad_norm": 0.0406395023903416, "learning_rate": 6.138635530832283e-05, "loss": 0.7414, "step": 3885 }, { "epoch": 0.7320286036883703, "grad_norm": 0.036014218987116424, "learning_rate": 6.09892821619088e-05, "loss": 0.725, "step": 3890 }, { "epoch": 0.7329695144900263, "grad_norm": 0.040755514238358985, "learning_rate": 6.059316943843189e-05, "loss": 0.7587, "step": 3895 }, { "epoch": 0.7339104252916824, "grad_norm": 0.040411548204649615, "learning_rate": 6.019802141193625e-05, "loss": 0.7646, "step": 3900 }, { "epoch": 0.7348513360933383, "grad_norm": 0.038968355232819656, "learning_rate": 5.980384234605726e-05, "loss": 0.776, "step": 3905 }, { "epoch": 0.7357922468949943, "grad_norm": 0.042090530618439974, "learning_rate": 5.941063649397495e-05, "loss": 0.7758, "step": 3910 }, { "epoch": 0.7367331576966504, "grad_norm": 0.037713848383341686, "learning_rate": 5.901840809836844e-05, "loss": 0.7538, "step": 3915 }, { "epoch": 0.7376740684983064, "grad_norm": 0.03686538623291845, "learning_rate": 5.8627161391370245e-05, "loss": 0.7419, "step": 3920 }, { "epoch": 0.7386149792999623, "grad_norm": 0.0412323805290299, "learning_rate": 5.823690059452049e-05, "loss": 0.7286, "step": 3925 }, { "epoch": 0.7395558901016184, "grad_norm": 0.04681091382814438, "learning_rate": 5.7847629918721165e-05, "loss": 0.7392, "step": 3930 }, { "epoch": 0.7404968009032744, "grad_norm": 0.038102594611252705, "learning_rate": 5.7459353564191095e-05, "loss": 0.7581, "step": 3935 }, { "epoch": 0.7414377117049303, "grad_norm": 0.03890825582383635, "learning_rate": 5.707207572042037e-05, "loss": 0.7455, "step": 3940 }, { "epoch": 0.7423786225065864, "grad_norm": 0.039387553938316465, "learning_rate": 5.668580056612504e-05, "loss": 0.7285, "step": 3945 }, { "epoch": 0.7433195333082424, "grad_norm": 0.03946818985958446, "learning_rate": 5.630053226920239e-05, "loss": 0.7248, "step": 3950 }, { "epoch": 0.7442604441098983, "grad_norm": 0.03936387385845415, "learning_rate": 5.591627498668548e-05, "loss": 0.766, "step": 3955 }, { "epoch": 0.7452013549115544, "grad_norm": 0.03472976147552407, "learning_rate": 5.5533032864698754e-05, "loss": 0.7205, "step": 3960 }, { "epoch": 0.7461422657132104, "grad_norm": 0.03912387994170675, "learning_rate": 5.515081003841315e-05, "loss": 0.7593, "step": 3965 }, { "epoch": 0.7470831765148664, "grad_norm": 0.041833487223668316, "learning_rate": 5.4769610632001164e-05, "loss": 0.7618, "step": 3970 }, { "epoch": 0.7480240873165224, "grad_norm": 0.03700071263990363, "learning_rate": 5.4389438758592884e-05, "loss": 0.7354, "step": 3975 }, { "epoch": 0.7489649981181784, "grad_norm": 0.039575292853940104, "learning_rate": 5.401029852023129e-05, "loss": 0.7493, "step": 3980 }, { "epoch": 0.7499059089198344, "grad_norm": 0.034510383462628254, "learning_rate": 5.363219400782798e-05, "loss": 0.7379, "step": 3985 }, { "epoch": 0.7508468197214904, "grad_norm": 0.04074525021374615, "learning_rate": 5.325512930111907e-05, "loss": 0.7327, "step": 3990 }, { "epoch": 0.7517877305231464, "grad_norm": 0.03972661994649692, "learning_rate": 5.2879108468621346e-05, "loss": 0.7632, "step": 3995 }, { "epoch": 0.7527286413248024, "grad_norm": 0.040751426623560565, "learning_rate": 5.250413556758819e-05, "loss": 0.7573, "step": 4000 }, { "epoch": 0.7536695521264584, "grad_norm": 0.037165299818227866, "learning_rate": 5.2130214643965685e-05, "loss": 0.7703, "step": 4005 }, { "epoch": 0.7546104629281144, "grad_norm": 0.039192953238953, "learning_rate": 5.175734973234927e-05, "loss": 0.7383, "step": 4010 }, { "epoch": 0.7555513737297704, "grad_norm": 0.03861148502718981, "learning_rate": 5.1385544855940066e-05, "loss": 0.7483, "step": 4015 }, { "epoch": 0.7564922845314265, "grad_norm": 0.03866162098831916, "learning_rate": 5.1014804026501244e-05, "loss": 0.7212, "step": 4020 }, { "epoch": 0.7574331953330824, "grad_norm": 0.0403684213936758, "learning_rate": 5.0645131244315214e-05, "loss": 0.7466, "step": 4025 }, { "epoch": 0.7583741061347384, "grad_norm": 0.036399521831994304, "learning_rate": 5.027653049813991e-05, "loss": 0.7327, "step": 4030 }, { "epoch": 0.7593150169363945, "grad_norm": 0.03824476127393509, "learning_rate": 4.990900576516625e-05, "loss": 0.7304, "step": 4035 }, { "epoch": 0.7602559277380504, "grad_norm": 0.04226316377265603, "learning_rate": 4.954256101097494e-05, "loss": 0.7592, "step": 4040 }, { "epoch": 0.7611968385397064, "grad_norm": 0.0353431684108597, "learning_rate": 4.917720018949364e-05, "loss": 0.7101, "step": 4045 }, { "epoch": 0.7621377493413625, "grad_norm": 0.03822443909312598, "learning_rate": 4.8812927242954564e-05, "loss": 0.7392, "step": 4050 }, { "epoch": 0.7630786601430184, "grad_norm": 0.03835749701797319, "learning_rate": 4.844974610185173e-05, "loss": 0.7323, "step": 4055 }, { "epoch": 0.7640195709446744, "grad_norm": 0.037684888204025845, "learning_rate": 4.808766068489855e-05, "loss": 0.7392, "step": 4060 }, { "epoch": 0.7649604817463305, "grad_norm": 0.03795012722182954, "learning_rate": 4.772667489898572e-05, "loss": 0.7261, "step": 4065 }, { "epoch": 0.7659013925479865, "grad_norm": 0.03846412848813938, "learning_rate": 4.736679263913881e-05, "loss": 0.7456, "step": 4070 }, { "epoch": 0.7668423033496424, "grad_norm": 0.04277670399034575, "learning_rate": 4.7008017788476476e-05, "loss": 0.7676, "step": 4075 }, { "epoch": 0.7677832141512985, "grad_norm": 0.03733090710113088, "learning_rate": 4.665035421816852e-05, "loss": 0.7586, "step": 4080 }, { "epoch": 0.7687241249529545, "grad_norm": 0.036470579624849335, "learning_rate": 4.629380578739385e-05, "loss": 0.7254, "step": 4085 }, { "epoch": 0.7696650357546104, "grad_norm": 0.03975957425764092, "learning_rate": 4.593837634329928e-05, "loss": 0.7569, "step": 4090 }, { "epoch": 0.7706059465562665, "grad_norm": 0.040549018273597276, "learning_rate": 4.558406972095771e-05, "loss": 0.7307, "step": 4095 }, { "epoch": 0.7715468573579225, "grad_norm": 0.03801305901578898, "learning_rate": 4.523088974332676e-05, "loss": 0.7249, "step": 4100 }, { "epoch": 0.7724877681595784, "grad_norm": 0.04200138565229912, "learning_rate": 4.487884022120758e-05, "loss": 0.7567, "step": 4105 }, { "epoch": 0.7734286789612345, "grad_norm": 0.03950895201351385, "learning_rate": 4.452792495320396e-05, "loss": 0.7046, "step": 4110 }, { "epoch": 0.7743695897628905, "grad_norm": 0.036898557922398705, "learning_rate": 4.41781477256809e-05, "loss": 0.7263, "step": 4115 }, { "epoch": 0.7753105005645465, "grad_norm": 0.03802666125569989, "learning_rate": 4.382951231272397e-05, "loss": 0.7588, "step": 4120 }, { "epoch": 0.7762514113662025, "grad_norm": 0.037620561058654495, "learning_rate": 4.3482022476098736e-05, "loss": 0.7518, "step": 4125 }, { "epoch": 0.7771923221678585, "grad_norm": 0.042625621645339215, "learning_rate": 4.313568196520998e-05, "loss": 0.7467, "step": 4130 }, { "epoch": 0.7781332329695145, "grad_norm": 0.03933008556723823, "learning_rate": 4.27904945170612e-05, "loss": 0.74, "step": 4135 }, { "epoch": 0.7790741437711705, "grad_norm": 0.03976842759714889, "learning_rate": 4.244646385621451e-05, "loss": 0.7475, "step": 4140 }, { "epoch": 0.7800150545728265, "grad_norm": 0.03768428813268079, "learning_rate": 4.2103593694750324e-05, "loss": 0.7295, "step": 4145 }, { "epoch": 0.7809559653744825, "grad_norm": 0.03876815133956591, "learning_rate": 4.176188773222715e-05, "loss": 0.7232, "step": 4150 }, { "epoch": 0.7818968761761385, "grad_norm": 0.03668277473358429, "learning_rate": 4.1421349655641994e-05, "loss": 0.7589, "step": 4155 }, { "epoch": 0.7828377869777945, "grad_norm": 0.04030783885886016, "learning_rate": 4.108198313939029e-05, "loss": 0.7473, "step": 4160 }, { "epoch": 0.7837786977794505, "grad_norm": 0.03763435879728677, "learning_rate": 4.0743791845226446e-05, "loss": 0.7461, "step": 4165 }, { "epoch": 0.7847196085811066, "grad_norm": 0.03854446021480888, "learning_rate": 4.04067794222243e-05, "loss": 0.7435, "step": 4170 }, { "epoch": 0.7856605193827625, "grad_norm": 0.037540640945074796, "learning_rate": 4.007094950673753e-05, "loss": 0.7447, "step": 4175 }, { "epoch": 0.7866014301844185, "grad_norm": 0.04173529963643009, "learning_rate": 3.973630572236075e-05, "loss": 0.7272, "step": 4180 }, { "epoch": 0.7875423409860746, "grad_norm": 0.03950203020150416, "learning_rate": 3.940285167989028e-05, "loss": 0.7385, "step": 4185 }, { "epoch": 0.7884832517877305, "grad_norm": 0.03744463784853737, "learning_rate": 3.9070590977285016e-05, "loss": 0.7408, "step": 4190 }, { "epoch": 0.7894241625893865, "grad_norm": 0.0415749729661611, "learning_rate": 3.873952719962781e-05, "loss": 0.7553, "step": 4195 }, { "epoch": 0.7903650733910426, "grad_norm": 0.04023668652299723, "learning_rate": 3.840966391908678e-05, "loss": 0.7393, "step": 4200 }, { "epoch": 0.7913059841926985, "grad_norm": 0.03985163930669438, "learning_rate": 3.808100469487674e-05, "loss": 0.7433, "step": 4205 }, { "epoch": 0.7922468949943545, "grad_norm": 0.042655408743132714, "learning_rate": 3.775355307322063e-05, "loss": 0.7815, "step": 4210 }, { "epoch": 0.7931878057960106, "grad_norm": 0.037507990065317645, "learning_rate": 3.742731258731152e-05, "loss": 0.763, "step": 4215 }, { "epoch": 0.7941287165976666, "grad_norm": 0.04151808695349708, "learning_rate": 3.7102286757274364e-05, "loss": 0.7298, "step": 4220 }, { "epoch": 0.7950696273993225, "grad_norm": 0.03871917007808031, "learning_rate": 3.6778479090127913e-05, "loss": 0.7179, "step": 4225 }, { "epoch": 0.7960105382009786, "grad_norm": 0.0395273004111852, "learning_rate": 3.6455893079747114e-05, "loss": 0.7215, "step": 4230 }, { "epoch": 0.7969514490026346, "grad_norm": 0.037341829211501365, "learning_rate": 3.6134532206825136e-05, "loss": 0.7518, "step": 4235 }, { "epoch": 0.7978923598042905, "grad_norm": 0.039938364583883966, "learning_rate": 3.581439993883604e-05, "loss": 0.7498, "step": 4240 }, { "epoch": 0.7988332706059466, "grad_norm": 0.040057519903909024, "learning_rate": 3.5495499729997304e-05, "loss": 0.7071, "step": 4245 }, { "epoch": 0.7997741814076026, "grad_norm": 0.037110178091548936, "learning_rate": 3.5177835021232395e-05, "loss": 0.7041, "step": 4250 }, { "epoch": 0.8007150922092585, "grad_norm": 0.03888319458444373, "learning_rate": 3.486140924013391e-05, "loss": 0.7312, "step": 4255 }, { "epoch": 0.8016560030109146, "grad_norm": 0.03743060676550586, "learning_rate": 3.4546225800926416e-05, "loss": 0.7128, "step": 4260 }, { "epoch": 0.8025969138125706, "grad_norm": 0.038536328098299384, "learning_rate": 3.4232288104429636e-05, "loss": 0.7449, "step": 4265 }, { "epoch": 0.8035378246142266, "grad_norm": 0.03878605489785658, "learning_rate": 3.3919599538021664e-05, "loss": 0.7497, "step": 4270 }, { "epoch": 0.8044787354158826, "grad_norm": 0.03793631840075618, "learning_rate": 3.3608163475602684e-05, "loss": 0.7377, "step": 4275 }, { "epoch": 0.8054196462175386, "grad_norm": 0.03968152466748523, "learning_rate": 3.329798327755835e-05, "loss": 0.7307, "step": 4280 }, { "epoch": 0.8063605570191946, "grad_norm": 0.03749528471492435, "learning_rate": 3.298906229072357e-05, "loss": 0.7644, "step": 4285 }, { "epoch": 0.8073014678208505, "grad_norm": 0.03784045958987088, "learning_rate": 3.268140384834633e-05, "loss": 0.7269, "step": 4290 }, { "epoch": 0.8082423786225066, "grad_norm": 0.037631543918115885, "learning_rate": 3.237501127005192e-05, "loss": 0.7341, "step": 4295 }, { "epoch": 0.8091832894241626, "grad_norm": 0.0390771932444786, "learning_rate": 3.206988786180693e-05, "loss": 0.7328, "step": 4300 }, { "epoch": 0.8101242002258185, "grad_norm": 0.03647087521210094, "learning_rate": 3.176603691588365e-05, "loss": 0.7395, "step": 4305 }, { "epoch": 0.8110651110274746, "grad_norm": 0.03694803246663371, "learning_rate": 3.146346171082445e-05, "loss": 0.7527, "step": 4310 }, { "epoch": 0.8120060218291306, "grad_norm": 0.040278247753399284, "learning_rate": 3.1162165511406756e-05, "loss": 0.7317, "step": 4315 }, { "epoch": 0.8129469326307867, "grad_norm": 0.03898944326441212, "learning_rate": 3.086215156860729e-05, "loss": 0.7222, "step": 4320 }, { "epoch": 0.8138878434324426, "grad_norm": 0.04051272560722607, "learning_rate": 3.056342311956735e-05, "loss": 0.7484, "step": 4325 }, { "epoch": 0.8148287542340986, "grad_norm": 0.03796686896436505, "learning_rate": 3.026598338755783e-05, "loss": 0.737, "step": 4330 }, { "epoch": 0.8157696650357547, "grad_norm": 0.03901772952940146, "learning_rate": 2.9969835581944423e-05, "loss": 0.7275, "step": 4335 }, { "epoch": 0.8167105758374106, "grad_norm": 0.03789607099708822, "learning_rate": 2.9674982898152904e-05, "loss": 0.7522, "step": 4340 }, { "epoch": 0.8176514866390666, "grad_norm": 0.04437902229228163, "learning_rate": 2.938142851763476e-05, "loss": 0.748, "step": 4345 }, { "epoch": 0.8185923974407227, "grad_norm": 0.040534612356349066, "learning_rate": 2.908917560783286e-05, "loss": 0.745, "step": 4350 }, { "epoch": 0.8195333082423786, "grad_norm": 0.04165229631859438, "learning_rate": 2.8798227322147167e-05, "loss": 0.754, "step": 4355 }, { "epoch": 0.8204742190440346, "grad_norm": 0.04285176631929181, "learning_rate": 2.8508586799900878e-05, "loss": 0.7279, "step": 4360 }, { "epoch": 0.8214151298456907, "grad_norm": 0.04179797544417384, "learning_rate": 2.8220257166306338e-05, "loss": 0.7461, "step": 4365 }, { "epoch": 0.8223560406473467, "grad_norm": 0.04019820707217107, "learning_rate": 2.7933241532431576e-05, "loss": 0.7481, "step": 4370 }, { "epoch": 0.8232969514490026, "grad_norm": 0.03603648651130401, "learning_rate": 2.7647542995166576e-05, "loss": 0.7335, "step": 4375 }, { "epoch": 0.8242378622506586, "grad_norm": 0.03979622492169847, "learning_rate": 2.736316463718978e-05, "loss": 0.7483, "step": 4380 }, { "epoch": 0.8251787730523147, "grad_norm": 0.03939493805725078, "learning_rate": 2.7080109526935083e-05, "loss": 0.7187, "step": 4385 }, { "epoch": 0.8261196838539706, "grad_norm": 0.036865190585736146, "learning_rate": 2.6798380718558526e-05, "loss": 0.759, "step": 4390 }, { "epoch": 0.8270605946556266, "grad_norm": 0.03612023659473949, "learning_rate": 2.6517981251905336e-05, "loss": 0.7388, "step": 4395 }, { "epoch": 0.8280015054572827, "grad_norm": 0.03764904252746977, "learning_rate": 2.623891415247721e-05, "loss": 0.7146, "step": 4400 }, { "epoch": 0.8289424162589386, "grad_norm": 0.03996839095480978, "learning_rate": 2.596118243139968e-05, "loss": 0.7382, "step": 4405 }, { "epoch": 0.8298833270605946, "grad_norm": 0.041170050061391135, "learning_rate": 2.5684789085389607e-05, "loss": 0.7324, "step": 4410 }, { "epoch": 0.8308242378622507, "grad_norm": 0.03765734472228694, "learning_rate": 2.5409737096722716e-05, "loss": 0.7205, "step": 4415 }, { "epoch": 0.8317651486639067, "grad_norm": 0.03910602217381, "learning_rate": 2.5136029433201625e-05, "loss": 0.712, "step": 4420 }, { "epoch": 0.8327060594655626, "grad_norm": 0.043314895360783305, "learning_rate": 2.4863669048123746e-05, "loss": 0.7252, "step": 4425 }, { "epoch": 0.8336469702672187, "grad_norm": 0.038416515119674845, "learning_rate": 2.4592658880249244e-05, "loss": 0.7342, "step": 4430 }, { "epoch": 0.8345878810688747, "grad_norm": 0.03966310990660385, "learning_rate": 2.4323001853769692e-05, "loss": 0.7278, "step": 4435 }, { "epoch": 0.8355287918705306, "grad_norm": 0.039679489923308116, "learning_rate": 2.4054700878276122e-05, "loss": 0.7268, "step": 4440 }, { "epoch": 0.8364697026721867, "grad_norm": 0.03897089451361406, "learning_rate": 2.3787758848727912e-05, "loss": 0.7233, "step": 4445 }, { "epoch": 0.8374106134738427, "grad_norm": 0.03825105631460866, "learning_rate": 2.352217864542149e-05, "loss": 0.7186, "step": 4450 }, { "epoch": 0.8383515242754986, "grad_norm": 0.037516278264590745, "learning_rate": 2.3257963133959086e-05, "loss": 0.7359, "step": 4455 }, { "epoch": 0.8392924350771547, "grad_norm": 0.03803680908475784, "learning_rate": 2.2995115165218076e-05, "loss": 0.7324, "step": 4460 }, { "epoch": 0.8402333458788107, "grad_norm": 0.041405405174316776, "learning_rate": 2.2733637575320085e-05, "loss": 0.7328, "step": 4465 }, { "epoch": 0.8411742566804666, "grad_norm": 0.03654086979510833, "learning_rate": 2.2473533185600295e-05, "loss": 0.7187, "step": 4470 }, { "epoch": 0.8421151674821227, "grad_norm": 0.039987056977315666, "learning_rate": 2.2214804802577108e-05, "loss": 0.741, "step": 4475 }, { "epoch": 0.8430560782837787, "grad_norm": 0.04099954115508159, "learning_rate": 2.1957455217922033e-05, "loss": 0.7247, "step": 4480 }, { "epoch": 0.8439969890854347, "grad_norm": 0.038398613227579474, "learning_rate": 2.1701487208429197e-05, "loss": 0.7382, "step": 4485 }, { "epoch": 0.8449378998870907, "grad_norm": 0.03836989471820563, "learning_rate": 2.1446903535985587e-05, "loss": 0.7134, "step": 4490 }, { "epoch": 0.8458788106887467, "grad_norm": 0.036465783338229904, "learning_rate": 2.119370694754132e-05, "loss": 0.7, "step": 4495 }, { "epoch": 0.8468197214904027, "grad_norm": 0.0385801606753652, "learning_rate": 2.094190017507989e-05, "loss": 0.7475, "step": 4500 }, { "epoch": 0.8477606322920587, "grad_norm": 0.035990420555936965, "learning_rate": 2.0691485935588743e-05, "loss": 0.7334, "step": 4505 }, { "epoch": 0.8487015430937147, "grad_norm": 0.04380741523278768, "learning_rate": 2.0442466931029867e-05, "loss": 0.7237, "step": 4510 }, { "epoch": 0.8496424538953707, "grad_norm": 0.037151691736236815, "learning_rate": 2.0194845848310674e-05, "loss": 0.7276, "step": 4515 }, { "epoch": 0.8505833646970267, "grad_norm": 0.036431746414973216, "learning_rate": 1.9948625359255248e-05, "loss": 0.7031, "step": 4520 }, { "epoch": 0.8515242754986827, "grad_norm": 0.03727873723258803, "learning_rate": 1.970380812057512e-05, "loss": 0.7229, "step": 4525 }, { "epoch": 0.8524651863003387, "grad_norm": 0.03479466063222127, "learning_rate": 1.9460396773840786e-05, "loss": 0.7544, "step": 4530 }, { "epoch": 0.8534060971019948, "grad_norm": 0.04104017949784562, "learning_rate": 1.9218393945453327e-05, "loss": 0.7197, "step": 4535 }, { "epoch": 0.8543470079036507, "grad_norm": 0.03790070545246109, "learning_rate": 1.8977802246615908e-05, "loss": 0.7431, "step": 4540 }, { "epoch": 0.8552879187053067, "grad_norm": 0.03729217061921706, "learning_rate": 1.8738624273305602e-05, "loss": 0.7533, "step": 4545 }, { "epoch": 0.8562288295069628, "grad_norm": 0.03931349035555177, "learning_rate": 1.8500862606245476e-05, "loss": 0.7394, "step": 4550 }, { "epoch": 0.8571697403086187, "grad_norm": 0.0394433321374659, "learning_rate": 1.8264519810876722e-05, "loss": 0.7326, "step": 4555 }, { "epoch": 0.8581106511102747, "grad_norm": 0.035368518092189605, "learning_rate": 1.802959843733086e-05, "loss": 0.7472, "step": 4560 }, { "epoch": 0.8590515619119308, "grad_norm": 0.037582766906246075, "learning_rate": 1.7796101020402405e-05, "loss": 0.7061, "step": 4565 }, { "epoch": 0.8599924727135867, "grad_norm": 0.038386275083281136, "learning_rate": 1.7564030079521312e-05, "loss": 0.7281, "step": 4570 }, { "epoch": 0.8609333835152427, "grad_norm": 0.03798225357409595, "learning_rate": 1.7333388118726033e-05, "loss": 0.7417, "step": 4575 }, { "epoch": 0.8618742943168988, "grad_norm": 0.03952864589268583, "learning_rate": 1.7104177626636308e-05, "loss": 0.7386, "step": 4580 }, { "epoch": 0.8628152051185548, "grad_norm": 0.039175178129552325, "learning_rate": 1.6876401076426332e-05, "loss": 0.7348, "step": 4585 }, { "epoch": 0.8637561159202107, "grad_norm": 0.04213404267588905, "learning_rate": 1.665006092579817e-05, "loss": 0.7451, "step": 4590 }, { "epoch": 0.8646970267218668, "grad_norm": 0.036904029064067076, "learning_rate": 1.6425159616955208e-05, "loss": 0.732, "step": 4595 }, { "epoch": 0.8656379375235228, "grad_norm": 0.04055058687139153, "learning_rate": 1.620169957657567e-05, "loss": 0.7336, "step": 4600 }, { "epoch": 0.8665788483251787, "grad_norm": 0.03633816516730953, "learning_rate": 1.5979683215786575e-05, "loss": 0.7253, "step": 4605 }, { "epoch": 0.8675197591268348, "grad_norm": 0.03780409397880549, "learning_rate": 1.575911293013773e-05, "loss": 0.7493, "step": 4610 }, { "epoch": 0.8684606699284908, "grad_norm": 0.036721595492597056, "learning_rate": 1.5539991099575854e-05, "loss": 0.7202, "step": 4615 }, { "epoch": 0.8694015807301467, "grad_norm": 0.037482839459661806, "learning_rate": 1.5322320088418725e-05, "loss": 0.7112, "step": 4620 }, { "epoch": 0.8703424915318028, "grad_norm": 0.04032680309979362, "learning_rate": 1.510610224533001e-05, "loss": 0.7459, "step": 4625 }, { "epoch": 0.8712834023334588, "grad_norm": 0.039375359663647816, "learning_rate": 1.489133990329366e-05, "loss": 0.7459, "step": 4630 }, { "epoch": 0.8722243131351148, "grad_norm": 0.03878387865856684, "learning_rate": 1.467803537958876e-05, "loss": 0.7314, "step": 4635 }, { "epoch": 0.8731652239367708, "grad_norm": 0.03820312498729265, "learning_rate": 1.446619097576468e-05, "loss": 0.7081, "step": 4640 }, { "epoch": 0.8741061347384268, "grad_norm": 0.04141438212898032, "learning_rate": 1.425580897761604e-05, "loss": 0.7524, "step": 4645 }, { "epoch": 0.8750470455400828, "grad_norm": 0.03963965690705673, "learning_rate": 1.4046891655158233e-05, "loss": 0.7289, "step": 4650 }, { "epoch": 0.8759879563417388, "grad_norm": 0.04111262551595695, "learning_rate": 1.383944126260284e-05, "loss": 0.7293, "step": 4655 }, { "epoch": 0.8769288671433948, "grad_norm": 0.04061469675303553, "learning_rate": 1.3633460038333211e-05, "loss": 0.7525, "step": 4660 }, { "epoch": 0.8778697779450508, "grad_norm": 0.0376892938395166, "learning_rate": 1.3428950204880534e-05, "loss": 0.7311, "step": 4665 }, { "epoch": 0.8788106887467068, "grad_norm": 0.03891564478991497, "learning_rate": 1.3225913968899705e-05, "loss": 0.7366, "step": 4670 }, { "epoch": 0.8797515995483628, "grad_norm": 0.03795076957307728, "learning_rate": 1.3024353521145515e-05, "loss": 0.74, "step": 4675 }, { "epoch": 0.8806925103500188, "grad_norm": 0.03879491364323159, "learning_rate": 1.2824271036449013e-05, "loss": 0.7569, "step": 4680 }, { "epoch": 0.8816334211516749, "grad_norm": 0.03648855134440867, "learning_rate": 1.2625668673694206e-05, "loss": 0.7261, "step": 4685 }, { "epoch": 0.8825743319533308, "grad_norm": 0.039114744946843395, "learning_rate": 1.2428548575794506e-05, "loss": 0.7276, "step": 4690 }, { "epoch": 0.8835152427549868, "grad_norm": 0.03922278721510972, "learning_rate": 1.2232912869669753e-05, "loss": 0.7412, "step": 4695 }, { "epoch": 0.8844561535566429, "grad_norm": 0.0413220284359729, "learning_rate": 1.2038763666223283e-05, "loss": 0.6961, "step": 4700 }, { "epoch": 0.8853970643582988, "grad_norm": 0.03774504327378547, "learning_rate": 1.1846103060319112e-05, "loss": 0.7616, "step": 4705 }, { "epoch": 0.8863379751599548, "grad_norm": 0.03719254603141807, "learning_rate": 1.1654933130759269e-05, "loss": 0.7295, "step": 4710 }, { "epoch": 0.8872788859616109, "grad_norm": 0.04039683465612545, "learning_rate": 1.1465255940261536e-05, "loss": 0.7315, "step": 4715 }, { "epoch": 0.8882197967632668, "grad_norm": 0.038907342623597366, "learning_rate": 1.1277073535436943e-05, "loss": 0.7424, "step": 4720 }, { "epoch": 0.8891607075649228, "grad_norm": 0.03788244107623249, "learning_rate": 1.1090387946768003e-05, "loss": 0.7515, "step": 4725 }, { "epoch": 0.8901016183665789, "grad_norm": 0.038838422890715006, "learning_rate": 1.090520118858652e-05, "loss": 0.7368, "step": 4730 }, { "epoch": 0.8910425291682349, "grad_norm": 0.03725542123628333, "learning_rate": 1.0721515259051916e-05, "loss": 0.7201, "step": 4735 }, { "epoch": 0.8919834399698908, "grad_norm": 0.03589707805750827, "learning_rate": 1.053933214012983e-05, "loss": 0.7129, "step": 4740 }, { "epoch": 0.8929243507715469, "grad_norm": 0.038642857374594816, "learning_rate": 1.0358653797570593e-05, "loss": 0.7541, "step": 4745 }, { "epoch": 0.8938652615732029, "grad_norm": 0.039982874321707856, "learning_rate": 1.017948218088797e-05, "loss": 0.7061, "step": 4750 }, { "epoch": 0.8948061723748588, "grad_norm": 0.0370503487079274, "learning_rate": 1.0001819223338287e-05, "loss": 0.7363, "step": 4755 }, { "epoch": 0.8957470831765149, "grad_norm": 0.03536457438950792, "learning_rate": 9.825666841899465e-06, "loss": 0.7415, "step": 4760 }, { "epoch": 0.8966879939781709, "grad_norm": 0.03707956825731724, "learning_rate": 9.651026937250288e-06, "loss": 0.7211, "step": 4765 }, { "epoch": 0.8976289047798268, "grad_norm": 0.03963652644770756, "learning_rate": 9.477901393750076e-06, "loss": 0.7391, "step": 4770 }, { "epoch": 0.8985698155814829, "grad_norm": 0.038563286201813984, "learning_rate": 9.306292079418115e-06, "loss": 0.6965, "step": 4775 }, { "epoch": 0.8995107263831389, "grad_norm": 0.037576451289906165, "learning_rate": 9.136200845913716e-06, "loss": 0.7404, "step": 4780 }, { "epoch": 0.9004516371847949, "grad_norm": 0.03920865759024056, "learning_rate": 8.967629528516141e-06, "loss": 0.7214, "step": 4785 }, { "epoch": 0.9013925479864509, "grad_norm": 0.03869578048793669, "learning_rate": 8.800579946104702e-06, "loss": 0.7287, "step": 4790 }, { "epoch": 0.9023334587881069, "grad_norm": 0.041785756799252886, "learning_rate": 8.635053901139367e-06, "loss": 0.7647, "step": 4795 }, { "epoch": 0.9032743695897629, "grad_norm": 0.03868915487772746, "learning_rate": 8.471053179641147e-06, "loss": 0.7223, "step": 4800 }, { "epoch": 0.9042152803914189, "grad_norm": 0.03662076107336638, "learning_rate": 8.30857955117279e-06, "loss": 0.7399, "step": 4805 }, { "epoch": 0.9051561911930749, "grad_norm": 0.03359384020617572, "learning_rate": 8.147634768819788e-06, "loss": 0.7046, "step": 4810 }, { "epoch": 0.9060971019947309, "grad_norm": 0.03666121162864638, "learning_rate": 7.988220569171467e-06, "loss": 0.7387, "step": 4815 }, { "epoch": 0.9070380127963868, "grad_norm": 0.036559492972736925, "learning_rate": 7.830338672302223e-06, "loss": 0.7398, "step": 4820 }, { "epoch": 0.9079789235980429, "grad_norm": 0.03695997811728329, "learning_rate": 7.673990781752881e-06, "loss": 0.7133, "step": 4825 }, { "epoch": 0.9089198343996989, "grad_norm": 0.038686133865653574, "learning_rate": 7.5191785845124255e-06, "loss": 0.7275, "step": 4830 }, { "epoch": 0.909860745201355, "grad_norm": 0.034693594406954034, "learning_rate": 7.365903750999791e-06, "loss": 0.7157, "step": 4835 }, { "epoch": 0.9108016560030109, "grad_norm": 0.0388397518590927, "learning_rate": 7.2141679350457175e-06, "loss": 0.7216, "step": 4840 }, { "epoch": 0.9117425668046669, "grad_norm": 0.03748596193447337, "learning_rate": 7.063972773875076e-06, "loss": 0.7359, "step": 4845 }, { "epoch": 0.912683477606323, "grad_norm": 0.03888928393947248, "learning_rate": 6.915319888089055e-06, "loss": 0.7296, "step": 4850 }, { "epoch": 0.9136243884079789, "grad_norm": 0.03873257173734812, "learning_rate": 6.768210881647784e-06, "loss": 0.7514, "step": 4855 }, { "epoch": 0.9145652992096349, "grad_norm": 0.03709779612823535, "learning_rate": 6.622647341853005e-06, "loss": 0.7421, "step": 4860 }, { "epoch": 0.915506210011291, "grad_norm": 0.03587682055067527, "learning_rate": 6.478630839330828e-06, "loss": 0.7205, "step": 4865 }, { "epoch": 0.9164471208129469, "grad_norm": 0.04029055825522218, "learning_rate": 6.336162928014937e-06, "loss": 0.737, "step": 4870 }, { "epoch": 0.9173880316146029, "grad_norm": 0.037073343055570404, "learning_rate": 6.195245145129812e-06, "loss": 0.7347, "step": 4875 }, { "epoch": 0.918328942416259, "grad_norm": 0.036440160856527605, "learning_rate": 6.055879011173998e-06, "loss": 0.7387, "step": 4880 }, { "epoch": 0.919269853217915, "grad_norm": 0.0377819594467133, "learning_rate": 5.918066029903812e-06, "loss": 0.7215, "step": 4885 }, { "epoch": 0.9202107640195709, "grad_norm": 0.0384362688453867, "learning_rate": 5.781807688317214e-06, "loss": 0.7101, "step": 4890 }, { "epoch": 0.921151674821227, "grad_norm": 0.04042614908872056, "learning_rate": 5.6471054566374965e-06, "loss": 0.726, "step": 4895 }, { "epoch": 0.922092585622883, "grad_norm": 0.037566662938396556, "learning_rate": 5.5139607882976666e-06, "loss": 0.7312, "step": 4900 }, { "epoch": 0.9230334964245389, "grad_norm": 0.03800770208223098, "learning_rate": 5.382375119924626e-06, "loss": 0.7431, "step": 4905 }, { "epoch": 0.923974407226195, "grad_norm": 0.03619717959439674, "learning_rate": 5.252349871323747e-06, "loss": 0.7284, "step": 4910 }, { "epoch": 0.924915318027851, "grad_norm": 0.03753904354868467, "learning_rate": 5.123886445463504e-06, "loss": 0.7567, "step": 4915 }, { "epoch": 0.9258562288295069, "grad_norm": 0.038858725721177816, "learning_rate": 4.99698622846037e-06, "loss": 0.7093, "step": 4920 }, { "epoch": 0.926797139631163, "grad_norm": 0.03824992710079084, "learning_rate": 4.871650589563775e-06, "loss": 0.7451, "step": 4925 }, { "epoch": 0.927738050432819, "grad_norm": 0.04100741519395592, "learning_rate": 4.747880881141502e-06, "loss": 0.7291, "step": 4930 }, { "epoch": 0.928678961234475, "grad_norm": 0.03950366551892632, "learning_rate": 4.62567843866492e-06, "loss": 0.7362, "step": 4935 }, { "epoch": 0.929619872036131, "grad_norm": 0.03655294044802147, "learning_rate": 4.5050445806946555e-06, "loss": 0.7318, "step": 4940 }, { "epoch": 0.930560782837787, "grad_norm": 0.03893513792284634, "learning_rate": 4.385980608866374e-06, "loss": 0.751, "step": 4945 }, { "epoch": 0.931501693639443, "grad_norm": 0.03535458990197148, "learning_rate": 4.268487807876725e-06, "loss": 0.7318, "step": 4950 }, { "epoch": 0.9324426044410989, "grad_norm": 0.041789838223581545, "learning_rate": 4.152567445469418e-06, "loss": 0.7276, "step": 4955 }, { "epoch": 0.933383515242755, "grad_norm": 0.03906894325898849, "learning_rate": 4.038220772421668e-06, "loss": 0.7163, "step": 4960 }, { "epoch": 0.934324426044411, "grad_norm": 0.03989311989542537, "learning_rate": 3.9254490225305915e-06, "loss": 0.7326, "step": 4965 }, { "epoch": 0.9352653368460669, "grad_norm": 0.03578498133734959, "learning_rate": 3.814253412599927e-06, "loss": 0.6939, "step": 4970 }, { "epoch": 0.936206247647723, "grad_norm": 0.03670755137788207, "learning_rate": 3.704635142426937e-06, "loss": 0.7555, "step": 4975 }, { "epoch": 0.937147158449379, "grad_norm": 0.03938777341705205, "learning_rate": 3.5965953947894144e-06, "loss": 0.7263, "step": 4980 }, { "epoch": 0.938088069251035, "grad_norm": 0.03628876749133803, "learning_rate": 3.490135335432942e-06, "loss": 0.7009, "step": 4985 }, { "epoch": 0.939028980052691, "grad_norm": 0.03826742122227512, "learning_rate": 3.3852561130583376e-06, "loss": 0.7432, "step": 4990 }, { "epoch": 0.939969890854347, "grad_norm": 0.03883574703669586, "learning_rate": 3.281958859309197e-06, "loss": 0.7406, "step": 4995 }, { "epoch": 0.940910801656003, "grad_norm": 0.038660784217541634, "learning_rate": 3.18024468875977e-06, "loss": 0.7288, "step": 5000 }, { "epoch": 0.941851712457659, "grad_norm": 0.03986660657950191, "learning_rate": 3.0801146989028525e-06, "loss": 0.7233, "step": 5005 }, { "epoch": 0.942792623259315, "grad_norm": 0.03698570319860795, "learning_rate": 2.9815699701379813e-06, "loss": 0.7249, "step": 5010 }, { "epoch": 0.943733534060971, "grad_norm": 0.035194532189499236, "learning_rate": 2.884611565759792e-06, "loss": 0.724, "step": 5015 }, { "epoch": 0.944674444862627, "grad_norm": 0.03724239049887656, "learning_rate": 2.7892405319464963e-06, "loss": 0.7434, "step": 5020 }, { "epoch": 0.945615355664283, "grad_norm": 0.041330518568329144, "learning_rate": 2.6954578977486707e-06, "loss": 0.7232, "step": 5025 }, { "epoch": 0.946556266465939, "grad_norm": 0.037368253372142564, "learning_rate": 2.6032646750780706e-06, "loss": 0.7389, "step": 5030 }, { "epoch": 0.9474971772675951, "grad_norm": 0.03493182990339746, "learning_rate": 2.5126618586967685e-06, "loss": 0.7497, "step": 5035 }, { "epoch": 0.948438088069251, "grad_norm": 0.03889842265077455, "learning_rate": 2.4236504262064136e-06, "loss": 0.7089, "step": 5040 }, { "epoch": 0.949378998870907, "grad_norm": 0.0359201656454254, "learning_rate": 2.3362313380376253e-06, "loss": 0.7094, "step": 5045 }, { "epoch": 0.9503199096725631, "grad_norm": 0.0353292313883741, "learning_rate": 2.2504055374397144e-06, "loss": 0.7348, "step": 5050 }, { "epoch": 0.951260820474219, "grad_norm": 0.03966754193538584, "learning_rate": 2.1661739504704623e-06, "loss": 0.7268, "step": 5055 }, { "epoch": 0.952201731275875, "grad_norm": 0.03702888456410481, "learning_rate": 2.0835374859861255e-06, "loss": 0.7224, "step": 5060 }, { "epoch": 0.9531426420775311, "grad_norm": 0.037585415673704874, "learning_rate": 2.0024970356316615e-06, "loss": 0.7313, "step": 5065 }, { "epoch": 0.954083552879187, "grad_norm": 0.04180203300864954, "learning_rate": 1.9230534738310375e-06, "loss": 0.7107, "step": 5070 }, { "epoch": 0.955024463680843, "grad_norm": 0.037393099939864984, "learning_rate": 1.8452076577778696e-06, "loss": 0.7181, "step": 5075 }, { "epoch": 0.9559653744824991, "grad_norm": 0.03553043089641243, "learning_rate": 1.7689604274261637e-06, "loss": 0.7303, "step": 5080 }, { "epoch": 0.9569062852841551, "grad_norm": 0.037063309388190024, "learning_rate": 1.6943126054811906e-06, "loss": 0.7582, "step": 5085 }, { "epoch": 0.957847196085811, "grad_norm": 0.03908640307451864, "learning_rate": 1.621264997390692e-06, "loss": 0.7409, "step": 5090 }, { "epoch": 0.9587881068874671, "grad_norm": 0.03940471571019541, "learning_rate": 1.5498183913361383e-06, "loss": 0.7608, "step": 5095 }, { "epoch": 0.9597290176891231, "grad_norm": 0.03659422309678058, "learning_rate": 1.4799735582242344e-06, "loss": 0.7184, "step": 5100 }, { "epoch": 0.960669928490779, "grad_norm": 0.03897811850405322, "learning_rate": 1.4117312516785938e-06, "loss": 0.7231, "step": 5105 }, { "epoch": 0.9616108392924351, "grad_norm": 0.03672154149071319, "learning_rate": 1.345092208031645e-06, "loss": 0.7052, "step": 5110 }, { "epoch": 0.9625517500940911, "grad_norm": 0.03491112077981314, "learning_rate": 1.280057146316621e-06, "loss": 0.7194, "step": 5115 }, { "epoch": 0.963492660895747, "grad_norm": 0.03847928744181442, "learning_rate": 1.2166267682598818e-06, "loss": 0.7348, "step": 5120 }, { "epoch": 0.9644335716974031, "grad_norm": 0.03929152091074595, "learning_rate": 1.154801758273255e-06, "loss": 0.7096, "step": 5125 }, { "epoch": 0.9653744824990591, "grad_norm": 0.037421477773637336, "learning_rate": 1.0945827834467402e-06, "loss": 0.7433, "step": 5130 }, { "epoch": 0.9663153933007151, "grad_norm": 0.03988682711853078, "learning_rate": 1.035970493541216e-06, "loss": 0.744, "step": 5135 }, { "epoch": 0.9672563041023711, "grad_norm": 0.03687421910742082, "learning_rate": 9.789655209815284e-07, "loss": 0.7039, "step": 5140 }, { "epoch": 0.9681972149040271, "grad_norm": 0.0401166521279677, "learning_rate": 9.235684808495792e-07, "loss": 0.7078, "step": 5145 }, { "epoch": 0.9691381257056831, "grad_norm": 0.036933913690302404, "learning_rate": 8.697799708777653e-07, "loss": 0.7205, "step": 5150 }, { "epoch": 0.9700790365073391, "grad_norm": 0.03862870548258374, "learning_rate": 8.176005714424671e-07, "loss": 0.7427, "step": 5155 }, { "epoch": 0.9710199473089951, "grad_norm": 0.037195912476703265, "learning_rate": 7.670308455578034e-07, "loss": 0.7095, "step": 5160 }, { "epoch": 0.9719608581106511, "grad_norm": 0.0387790773069963, "learning_rate": 7.180713388695858e-07, "loss": 0.7335, "step": 5165 }, { "epoch": 0.9729017689123071, "grad_norm": 0.038473815659207836, "learning_rate": 6.707225796494076e-07, "loss": 0.7406, "step": 5170 }, { "epoch": 0.9738426797139631, "grad_norm": 0.038220935568139616, "learning_rate": 6.249850787889477e-07, "loss": 0.7328, "step": 5175 }, { "epoch": 0.9747835905156191, "grad_norm": 0.03577405006427383, "learning_rate": 5.808593297944253e-07, "loss": 0.6966, "step": 5180 }, { "epoch": 0.9757245013172752, "grad_norm": 0.03926752579673784, "learning_rate": 5.383458087813375e-07, "loss": 0.7144, "step": 5185 }, { "epoch": 0.9766654121189311, "grad_norm": 0.03907926405272728, "learning_rate": 4.974449744692966e-07, "loss": 0.7429, "step": 5190 }, { "epoch": 0.9776063229205871, "grad_norm": 0.038899452851094404, "learning_rate": 4.5815726817705065e-07, "loss": 0.7677, "step": 5195 }, { "epoch": 0.9785472337222432, "grad_norm": 0.0423749591733099, "learning_rate": 4.204831138177378e-07, "loss": 0.7252, "step": 5200 }, { "epoch": 0.9794881445238991, "grad_norm": 0.03698062230023023, "learning_rate": 3.844229178943725e-07, "loss": 0.7439, "step": 5205 }, { "epoch": 0.9804290553255551, "grad_norm": 0.03715944964729997, "learning_rate": 3.4997706949534966e-07, "loss": 0.7431, "step": 5210 }, { "epoch": 0.9813699661272112, "grad_norm": 0.03833525661199143, "learning_rate": 3.171459402903309e-07, "loss": 0.7506, "step": 5215 }, { "epoch": 0.9823108769288671, "grad_norm": 0.04029829091509341, "learning_rate": 2.859298845261815e-07, "loss": 0.7206, "step": 5220 }, { "epoch": 0.9832517877305231, "grad_norm": 0.04317581049694436, "learning_rate": 2.56329239023223e-07, "loss": 0.7442, "step": 5225 }, { "epoch": 0.9841926985321792, "grad_norm": 0.03947040991708499, "learning_rate": 2.2834432317151986e-07, "loss": 0.7467, "step": 5230 }, { "epoch": 0.9851336093338352, "grad_norm": 0.038225453966465185, "learning_rate": 2.0197543892743195e-07, "loss": 0.71, "step": 5235 }, { "epoch": 0.9860745201354911, "grad_norm": 0.03981803197968165, "learning_rate": 1.772228708104506e-07, "loss": 0.7083, "step": 5240 }, { "epoch": 0.9870154309371472, "grad_norm": 0.03706590784392966, "learning_rate": 1.5408688590000107e-07, "loss": 0.7193, "step": 5245 }, { "epoch": 0.9879563417388032, "grad_norm": 0.03776800734619944, "learning_rate": 1.325677338326947e-07, "loss": 0.7093, "step": 5250 }, { "epoch": 0.9888972525404591, "grad_norm": 0.03617798634178418, "learning_rate": 1.1266564679949797e-07, "loss": 0.7014, "step": 5255 }, { "epoch": 0.9898381633421152, "grad_norm": 0.04052107735340787, "learning_rate": 9.43808395433343e-08, "loss": 0.7344, "step": 5260 }, { "epoch": 0.9907790741437712, "grad_norm": 0.03811509221337354, "learning_rate": 7.771350935670274e-08, "loss": 0.7027, "step": 5265 }, { "epoch": 0.9917199849454271, "grad_norm": 0.03795993990734393, "learning_rate": 6.266383607961278e-08, "loss": 0.7107, "step": 5270 }, { "epoch": 0.9926608957470832, "grad_norm": 0.03844604892644717, "learning_rate": 4.9231982097586164e-08, "loss": 0.7499, "step": 5275 }, { "epoch": 0.9936018065487392, "grad_norm": 0.03756964997959224, "learning_rate": 3.741809233989146e-08, "loss": 0.7194, "step": 5280 }, { "epoch": 0.9945427173503952, "grad_norm": 0.03744567511221886, "learning_rate": 2.7222294278045343e-08, "loss": 0.7258, "step": 5285 }, { "epoch": 0.9954836281520512, "grad_norm": 0.03534047578763023, "learning_rate": 1.8644697924413697e-08, "loss": 0.7428, "step": 5290 }, { "epoch": 0.9964245389537072, "grad_norm": 0.03693996603556449, "learning_rate": 1.1685395830979271e-08, "loss": 0.7315, "step": 5295 }, { "epoch": 0.9973654497553632, "grad_norm": 0.03684649235727902, "learning_rate": 6.344463088425733e-09, "loss": 0.7008, "step": 5300 }, { "epoch": 0.9983063605570192, "grad_norm": 0.03615359821424916, "learning_rate": 2.6219573252383995e-09, "loss": 0.7311, "step": 5305 }, { "epoch": 0.9992472713586752, "grad_norm": 0.03782149225904557, "learning_rate": 5.179187071546742e-10, "loss": 0.7642, "step": 5310 }, { "epoch": 1.0, "eval_loss": 1.0969747304916382, "eval_runtime": 1105.2258, "eval_samples_per_second": 191.917, "eval_steps_per_second": 5.998, "step": 5314 }, { "epoch": 1.0, "step": 5314, "total_flos": 773266676056064.0, "train_loss": 0.8050324304417017, "train_runtime": 21832.2284, "train_samples_per_second": 31.155, "train_steps_per_second": 0.243 } ], "logging_steps": 5, "max_steps": 5314, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 773266676056064.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }