{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5916, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016904205977749838, "grad_norm": 1.5393127157503101, "learning_rate": 1.5202702702702706e-07, "loss": 0.7280044555664062, "step": 10 }, { "epoch": 0.0033808411955499676, "grad_norm": 1.4909846560751012, "learning_rate": 3.2094594594594594e-07, "loss": 0.7421677589416504, "step": 20 }, { "epoch": 0.005071261793324951, "grad_norm": 1.1386498100093034, "learning_rate": 4.898648648648649e-07, "loss": 0.7222579956054688, "step": 30 }, { "epoch": 0.006761682391099935, "grad_norm": 0.8683143176223492, "learning_rate": 6.587837837837838e-07, "loss": 0.6997623920440674, "step": 40 }, { "epoch": 0.008452102988874919, "grad_norm": 0.6915935865750568, "learning_rate": 8.277027027027028e-07, "loss": 0.6706014156341553, "step": 50 }, { "epoch": 0.010142523586649903, "grad_norm": 0.5119352644321808, "learning_rate": 9.966216216216217e-07, "loss": 0.6358686923980713, "step": 60 }, { "epoch": 0.011832944184424887, "grad_norm": 0.4337471829765602, "learning_rate": 1.1655405405405406e-06, "loss": 0.618624210357666, "step": 70 }, { "epoch": 0.01352336478219987, "grad_norm": 0.4205476110134685, "learning_rate": 1.3344594594594596e-06, "loss": 0.6060873508453369, "step": 80 }, { "epoch": 0.015213785379974854, "grad_norm": 0.39627091778455564, "learning_rate": 1.5033783783783785e-06, "loss": 0.5913969993591308, "step": 90 }, { "epoch": 0.016904205977749838, "grad_norm": 0.40212736895916457, "learning_rate": 1.6722972972972977e-06, "loss": 0.5814601421356201, "step": 100 }, { "epoch": 0.018594626575524822, "grad_norm": 0.4773638249628802, "learning_rate": 1.8412162162162164e-06, "loss": 0.5719439029693604, "step": 110 }, { "epoch": 0.020285047173299806, "grad_norm": 0.38472922772015794, "learning_rate": 2.0101351351351353e-06, "loss": 0.5604746818542481, "step": 120 }, { "epoch": 0.02197546777107479, "grad_norm": 0.40656017027176017, "learning_rate": 2.1790540540540543e-06, "loss": 0.5584488391876221, "step": 130 }, { "epoch": 0.023665888368849773, "grad_norm": 0.371308302071795, "learning_rate": 2.347972972972973e-06, "loss": 0.5508419036865234, "step": 140 }, { "epoch": 0.025356308966624757, "grad_norm": 0.6240307244052877, "learning_rate": 2.516891891891892e-06, "loss": 0.545355224609375, "step": 150 }, { "epoch": 0.02704672956439974, "grad_norm": 0.4324101126507423, "learning_rate": 2.685810810810811e-06, "loss": 0.5469128608703613, "step": 160 }, { "epoch": 0.028737150162174725, "grad_norm": 0.4050600506019846, "learning_rate": 2.85472972972973e-06, "loss": 0.5449463367462158, "step": 170 }, { "epoch": 0.03042757075994971, "grad_norm": 0.40162846746542147, "learning_rate": 3.023648648648649e-06, "loss": 0.5376415252685547, "step": 180 }, { "epoch": 0.032117991357724696, "grad_norm": 0.38755458720694097, "learning_rate": 3.192567567567568e-06, "loss": 0.5347342491149902, "step": 190 }, { "epoch": 0.033808411955499676, "grad_norm": 0.4920608811887175, "learning_rate": 3.3614864864864864e-06, "loss": 0.537785816192627, "step": 200 }, { "epoch": 0.03549883255327466, "grad_norm": 0.4350756261944741, "learning_rate": 3.5304054054054053e-06, "loss": 0.5192846775054931, "step": 210 }, { "epoch": 0.037189253151049644, "grad_norm": 0.4693428284277957, "learning_rate": 3.6993243243243247e-06, "loss": 0.5229539394378662, "step": 220 }, { "epoch": 0.03887967374882463, "grad_norm": 0.47341352669542003, "learning_rate": 3.868243243243244e-06, "loss": 0.5329394340515137, "step": 230 }, { "epoch": 0.04057009434659961, "grad_norm": 0.42129758409095647, "learning_rate": 4.037162162162163e-06, "loss": 0.5307604312896729, "step": 240 }, { "epoch": 0.0422605149443746, "grad_norm": 0.40815950715332777, "learning_rate": 4.206081081081081e-06, "loss": 0.5192375183105469, "step": 250 }, { "epoch": 0.04395093554214958, "grad_norm": 0.4200400303493133, "learning_rate": 4.3750000000000005e-06, "loss": 0.5104325294494629, "step": 260 }, { "epoch": 0.045641356139924566, "grad_norm": 0.4759394203212884, "learning_rate": 4.543918918918919e-06, "loss": 0.5136491775512695, "step": 270 }, { "epoch": 0.047331776737699546, "grad_norm": 0.4354678343040916, "learning_rate": 4.712837837837838e-06, "loss": 0.5056795597076416, "step": 280 }, { "epoch": 0.049022197335474534, "grad_norm": 0.4796606080973553, "learning_rate": 4.881756756756757e-06, "loss": 0.5168024063110351, "step": 290 }, { "epoch": 0.050712617933249514, "grad_norm": 0.5075769744330321, "learning_rate": 5.050675675675676e-06, "loss": 0.5202849388122559, "step": 300 }, { "epoch": 0.0524030385310245, "grad_norm": 0.4153292083599306, "learning_rate": 5.219594594594595e-06, "loss": 0.5088020324707031, "step": 310 }, { "epoch": 0.05409345912879948, "grad_norm": 0.6184601220323779, "learning_rate": 5.388513513513513e-06, "loss": 0.5058822154998779, "step": 320 }, { "epoch": 0.05578387972657447, "grad_norm": 0.5404243375856757, "learning_rate": 5.557432432432433e-06, "loss": 0.5120021820068359, "step": 330 }, { "epoch": 0.05747430032434945, "grad_norm": 0.4787472095986187, "learning_rate": 5.726351351351351e-06, "loss": 0.5063863754272461, "step": 340 }, { "epoch": 0.059164720922124436, "grad_norm": 0.4911006620908002, "learning_rate": 5.8952702702702705e-06, "loss": 0.502506160736084, "step": 350 }, { "epoch": 0.06085514151989942, "grad_norm": 0.7405518769222905, "learning_rate": 6.06418918918919e-06, "loss": 0.4920937538146973, "step": 360 }, { "epoch": 0.0625455621176744, "grad_norm": 0.47943676154065007, "learning_rate": 6.233108108108109e-06, "loss": 0.5055969715118408, "step": 370 }, { "epoch": 0.06423598271544939, "grad_norm": 0.5295416406304393, "learning_rate": 6.402027027027028e-06, "loss": 0.49520511627197267, "step": 380 }, { "epoch": 0.06592640331322437, "grad_norm": 0.44250966671562564, "learning_rate": 6.570945945945947e-06, "loss": 0.5030886650085449, "step": 390 }, { "epoch": 0.06761682391099935, "grad_norm": 0.5383736396097062, "learning_rate": 6.739864864864866e-06, "loss": 0.4966264247894287, "step": 400 }, { "epoch": 0.06930724450877435, "grad_norm": 0.4378624064122213, "learning_rate": 6.908783783783785e-06, "loss": 0.5109675407409668, "step": 410 }, { "epoch": 0.07099766510654933, "grad_norm": 0.7886546643219341, "learning_rate": 7.0777027027027035e-06, "loss": 0.4995694637298584, "step": 420 }, { "epoch": 0.0726880857043243, "grad_norm": 0.44892133998631173, "learning_rate": 7.246621621621622e-06, "loss": 0.4928102970123291, "step": 430 }, { "epoch": 0.07437850630209929, "grad_norm": 0.4522504419948549, "learning_rate": 7.415540540540541e-06, "loss": 0.4935169219970703, "step": 440 }, { "epoch": 0.07606892689987428, "grad_norm": 0.47388387012658867, "learning_rate": 7.58445945945946e-06, "loss": 0.4919950008392334, "step": 450 }, { "epoch": 0.07775934749764926, "grad_norm": 0.4751247826361683, "learning_rate": 7.753378378378378e-06, "loss": 0.4960982322692871, "step": 460 }, { "epoch": 0.07944976809542424, "grad_norm": 0.48392017417638244, "learning_rate": 7.922297297297298e-06, "loss": 0.4992071151733398, "step": 470 }, { "epoch": 0.08114018869319922, "grad_norm": 0.4974127003472514, "learning_rate": 8.091216216216217e-06, "loss": 0.4960615634918213, "step": 480 }, { "epoch": 0.08283060929097422, "grad_norm": 0.45126823416690404, "learning_rate": 8.260135135135135e-06, "loss": 0.4928754806518555, "step": 490 }, { "epoch": 0.0845210298887492, "grad_norm": 0.5896794617176134, "learning_rate": 8.429054054054054e-06, "loss": 0.49259119033813475, "step": 500 }, { "epoch": 0.08621145048652418, "grad_norm": 0.46198142303389655, "learning_rate": 8.597972972972974e-06, "loss": 0.4931736946105957, "step": 510 }, { "epoch": 0.08790187108429916, "grad_norm": 0.5089850771284543, "learning_rate": 8.766891891891893e-06, "loss": 0.48741836547851564, "step": 520 }, { "epoch": 0.08959229168207415, "grad_norm": 0.498017321228989, "learning_rate": 8.93581081081081e-06, "loss": 0.4871868133544922, "step": 530 }, { "epoch": 0.09128271227984913, "grad_norm": 0.4364519713463667, "learning_rate": 9.104729729729732e-06, "loss": 0.4887679100036621, "step": 540 }, { "epoch": 0.09297313287762411, "grad_norm": 0.4482779936423717, "learning_rate": 9.27364864864865e-06, "loss": 0.48195371627807615, "step": 550 }, { "epoch": 0.09466355347539909, "grad_norm": 0.4562904990399742, "learning_rate": 9.442567567567569e-06, "loss": 0.48915772438049315, "step": 560 }, { "epoch": 0.09635397407317409, "grad_norm": 0.49452467224625607, "learning_rate": 9.611486486486488e-06, "loss": 0.4924494743347168, "step": 570 }, { "epoch": 0.09804439467094907, "grad_norm": 0.4652453454681303, "learning_rate": 9.780405405405407e-06, "loss": 0.4876460075378418, "step": 580 }, { "epoch": 0.09973481526872405, "grad_norm": 0.46684881016372576, "learning_rate": 9.949324324324325e-06, "loss": 0.4890481948852539, "step": 590 }, { "epoch": 0.10142523586649903, "grad_norm": 0.5490321111052099, "learning_rate": 9.999957346063017e-06, "loss": 0.484678840637207, "step": 600 }, { "epoch": 0.10311565646427402, "grad_norm": 0.7722942859795399, "learning_rate": 9.999748430572558e-06, "loss": 0.48883733749389646, "step": 610 }, { "epoch": 0.104806077062049, "grad_norm": 0.5530949398407944, "learning_rate": 9.999365426397275e-06, "loss": 0.48575553894042967, "step": 620 }, { "epoch": 0.10649649765982398, "grad_norm": 0.47335899243694096, "learning_rate": 9.998808346873179e-06, "loss": 0.4800987720489502, "step": 630 }, { "epoch": 0.10818691825759896, "grad_norm": 0.47768175202180385, "learning_rate": 9.99807721139749e-06, "loss": 0.47647809982299805, "step": 640 }, { "epoch": 0.10987733885537396, "grad_norm": 0.46600234628629655, "learning_rate": 9.997172045427974e-06, "loss": 0.47447705268859863, "step": 650 }, { "epoch": 0.11156775945314894, "grad_norm": 0.5180380166358451, "learning_rate": 9.996092880482047e-06, "loss": 0.4927847385406494, "step": 660 }, { "epoch": 0.11325818005092392, "grad_norm": 0.7812305828383127, "learning_rate": 9.99483975413568e-06, "loss": 0.48745927810668943, "step": 670 }, { "epoch": 0.1149486006486989, "grad_norm": 0.43644647448782514, "learning_rate": 9.993412710022096e-06, "loss": 0.48123817443847655, "step": 680 }, { "epoch": 0.11663902124647389, "grad_norm": 0.49127236612828806, "learning_rate": 9.991811797830238e-06, "loss": 0.4744853973388672, "step": 690 }, { "epoch": 0.11832944184424887, "grad_norm": 0.5833255708338916, "learning_rate": 9.99003707330305e-06, "loss": 0.48299551010131836, "step": 700 }, { "epoch": 0.12001986244202385, "grad_norm": 0.5184397410093639, "learning_rate": 9.988088598235532e-06, "loss": 0.48394503593444826, "step": 710 }, { "epoch": 0.12171028303979883, "grad_norm": 0.5258890543376203, "learning_rate": 9.985966440472594e-06, "loss": 0.4847887992858887, "step": 720 }, { "epoch": 0.12340070363757383, "grad_norm": 0.4640524383999511, "learning_rate": 9.98367067390668e-06, "loss": 0.48206357955932616, "step": 730 }, { "epoch": 0.1250911242353488, "grad_norm": 0.4273195720362554, "learning_rate": 9.981201378475213e-06, "loss": 0.48335995674133303, "step": 740 }, { "epoch": 0.1267815448331238, "grad_norm": 0.4888196321723977, "learning_rate": 9.978558640157794e-06, "loss": 0.4779070854187012, "step": 750 }, { "epoch": 0.12847196543089878, "grad_norm": 0.4492781187011461, "learning_rate": 9.975742550973223e-06, "loss": 0.47565546035766604, "step": 760 }, { "epoch": 0.13016238602867375, "grad_norm": 0.6183045865145201, "learning_rate": 9.972753208976283e-06, "loss": 0.4773505687713623, "step": 770 }, { "epoch": 0.13185280662644874, "grad_norm": 0.4825877361384316, "learning_rate": 9.969590718254337e-06, "loss": 0.47838778495788575, "step": 780 }, { "epoch": 0.13354322722422374, "grad_norm": 0.4228373437680581, "learning_rate": 9.966255188923694e-06, "loss": 0.47666234970092775, "step": 790 }, { "epoch": 0.1352336478219987, "grad_norm": 0.49439709272758775, "learning_rate": 9.962746737125783e-06, "loss": 0.47107582092285155, "step": 800 }, { "epoch": 0.1369240684197737, "grad_norm": 0.4644619381643305, "learning_rate": 9.959065485023099e-06, "loss": 0.47295198440551756, "step": 810 }, { "epoch": 0.1386144890175487, "grad_norm": 0.40092407936813573, "learning_rate": 9.95521156079496e-06, "loss": 0.46324882507324217, "step": 820 }, { "epoch": 0.14030490961532366, "grad_norm": 0.40293191886864566, "learning_rate": 9.951185098633039e-06, "loss": 0.4737586975097656, "step": 830 }, { "epoch": 0.14199533021309865, "grad_norm": 0.4315377378558263, "learning_rate": 9.946986238736688e-06, "loss": 0.47527265548706055, "step": 840 }, { "epoch": 0.14368575081087362, "grad_norm": 0.43129828851312546, "learning_rate": 9.942615127308064e-06, "loss": 0.4667802810668945, "step": 850 }, { "epoch": 0.1453761714086486, "grad_norm": 0.7253025541777565, "learning_rate": 9.938071916547033e-06, "loss": 0.4730061531066895, "step": 860 }, { "epoch": 0.1470665920064236, "grad_norm": 0.4358429443768519, "learning_rate": 9.933356764645871e-06, "loss": 0.4679386138916016, "step": 870 }, { "epoch": 0.14875701260419857, "grad_norm": 0.5060899359980359, "learning_rate": 9.928469835783757e-06, "loss": 0.47495126724243164, "step": 880 }, { "epoch": 0.15044743320197357, "grad_norm": 0.3895332485316844, "learning_rate": 9.923411300121055e-06, "loss": 0.4657557010650635, "step": 890 }, { "epoch": 0.15213785379974856, "grad_norm": 0.43200962105069246, "learning_rate": 9.918181333793393e-06, "loss": 0.4725798606872559, "step": 900 }, { "epoch": 0.15382827439752353, "grad_norm": 0.5031797328287678, "learning_rate": 9.912780118905524e-06, "loss": 0.4736931800842285, "step": 910 }, { "epoch": 0.15551869499529852, "grad_norm": 0.45701564911823583, "learning_rate": 9.90720784352499e-06, "loss": 0.47264862060546875, "step": 920 }, { "epoch": 0.1572091155930735, "grad_norm": 0.4450162875737329, "learning_rate": 9.901464701675575e-06, "loss": 0.47407169342041017, "step": 930 }, { "epoch": 0.15889953619084848, "grad_norm": 0.45122564139548105, "learning_rate": 9.895550893330537e-06, "loss": 0.4679454803466797, "step": 940 }, { "epoch": 0.16058995678862348, "grad_norm": 0.4573555163342898, "learning_rate": 9.889466624405664e-06, "loss": 0.4702300071716309, "step": 950 }, { "epoch": 0.16228037738639844, "grad_norm": 0.4499082495323313, "learning_rate": 9.883212106752088e-06, "loss": 0.4699045181274414, "step": 960 }, { "epoch": 0.16397079798417344, "grad_norm": 0.4931931137952571, "learning_rate": 9.876787558148918e-06, "loss": 0.4716080665588379, "step": 970 }, { "epoch": 0.16566121858194843, "grad_norm": 0.42457299398663156, "learning_rate": 9.87019320229565e-06, "loss": 0.4631767272949219, "step": 980 }, { "epoch": 0.1673516391797234, "grad_norm": 0.4329347769304198, "learning_rate": 9.863429268804388e-06, "loss": 0.46808829307556155, "step": 990 }, { "epoch": 0.1690420597774984, "grad_norm": 0.48564688866162065, "learning_rate": 9.856495993191836e-06, "loss": 0.4762742042541504, "step": 1000 }, { "epoch": 0.17073248037527336, "grad_norm": 0.4322957425108914, "learning_rate": 9.849393616871107e-06, "loss": 0.46973333358764646, "step": 1010 }, { "epoch": 0.17242290097304835, "grad_norm": 0.44726230667077793, "learning_rate": 9.842122387143317e-06, "loss": 0.46886191368103025, "step": 1020 }, { "epoch": 0.17411332157082335, "grad_norm": 0.4877471481922099, "learning_rate": 9.834682557188967e-06, "loss": 0.46724977493286135, "step": 1030 }, { "epoch": 0.17580374216859831, "grad_norm": 0.45139860892547107, "learning_rate": 9.827074386059135e-06, "loss": 0.468550968170166, "step": 1040 }, { "epoch": 0.1774941627663733, "grad_norm": 0.4083605201702828, "learning_rate": 9.819298138666446e-06, "loss": 0.4680886745452881, "step": 1050 }, { "epoch": 0.1791845833641483, "grad_norm": 0.5192065551375825, "learning_rate": 9.811354085775865e-06, "loss": 0.4614398002624512, "step": 1060 }, { "epoch": 0.18087500396192327, "grad_norm": 0.41211296295044714, "learning_rate": 9.80324250399525e-06, "loss": 0.46942787170410155, "step": 1070 }, { "epoch": 0.18256542455969826, "grad_norm": 0.5801659241580247, "learning_rate": 9.794963675765734e-06, "loss": 0.4630770206451416, "step": 1080 }, { "epoch": 0.18425584515747323, "grad_norm": 0.5983535960495054, "learning_rate": 9.786517889351882e-06, "loss": 0.46579856872558595, "step": 1090 }, { "epoch": 0.18594626575524822, "grad_norm": 0.43798357338036104, "learning_rate": 9.777905438831663e-06, "loss": 0.46267199516296387, "step": 1100 }, { "epoch": 0.18763668635302322, "grad_norm": 0.4325962700066863, "learning_rate": 9.769126624086202e-06, "loss": 0.4716958999633789, "step": 1110 }, { "epoch": 0.18932710695079819, "grad_norm": 0.41620408070601617, "learning_rate": 9.76018175078934e-06, "loss": 0.46593875885009767, "step": 1120 }, { "epoch": 0.19101752754857318, "grad_norm": 0.6965781109451706, "learning_rate": 9.751071130396991e-06, "loss": 0.4633523464202881, "step": 1130 }, { "epoch": 0.19270794814634817, "grad_norm": 0.4459465191100755, "learning_rate": 9.741795080136305e-06, "loss": 0.47896766662597656, "step": 1140 }, { "epoch": 0.19439836874412314, "grad_norm": 0.6319696655865172, "learning_rate": 9.732353922994608e-06, "loss": 0.46693859100341795, "step": 1150 }, { "epoch": 0.19608878934189813, "grad_norm": 0.47012774591561624, "learning_rate": 9.722747987708165e-06, "loss": 0.4546792984008789, "step": 1160 }, { "epoch": 0.19777920993967313, "grad_norm": 0.832723029381089, "learning_rate": 9.712977608750735e-06, "loss": 0.47440481185913086, "step": 1170 }, { "epoch": 0.1994696305374481, "grad_norm": 0.4829770005211469, "learning_rate": 9.703043126321921e-06, "loss": 0.46451354026794434, "step": 1180 }, { "epoch": 0.2011600511352231, "grad_norm": 0.47584739817374816, "learning_rate": 9.692944886335319e-06, "loss": 0.46211748123168944, "step": 1190 }, { "epoch": 0.20285047173299806, "grad_norm": 0.39648704616543984, "learning_rate": 9.682683240406485e-06, "loss": 0.47053070068359376, "step": 1200 }, { "epoch": 0.20454089233077305, "grad_norm": 0.4534813956704384, "learning_rate": 9.672258545840687e-06, "loss": 0.462983226776123, "step": 1210 }, { "epoch": 0.20623131292854804, "grad_norm": 0.4391686833686041, "learning_rate": 9.66167116562046e-06, "loss": 0.4643733024597168, "step": 1220 }, { "epoch": 0.207921733526323, "grad_norm": 0.3992535539144578, "learning_rate": 9.650921468392974e-06, "loss": 0.45770740509033203, "step": 1230 }, { "epoch": 0.209612154124098, "grad_norm": 0.6549411613955338, "learning_rate": 9.640009828457187e-06, "loss": 0.46063737869262694, "step": 1240 }, { "epoch": 0.211302574721873, "grad_norm": 0.4204768929959213, "learning_rate": 9.628936625750828e-06, "loss": 0.46723523139953616, "step": 1250 }, { "epoch": 0.21299299531964797, "grad_norm": 0.38551469436375513, "learning_rate": 9.617702245837157e-06, "loss": 0.45136494636535646, "step": 1260 }, { "epoch": 0.21468341591742296, "grad_norm": 0.4206545742529992, "learning_rate": 9.606307079891537e-06, "loss": 0.4603860855102539, "step": 1270 }, { "epoch": 0.21637383651519793, "grad_norm": 0.5019242021118502, "learning_rate": 9.594751524687821e-06, "loss": 0.46168107986450196, "step": 1280 }, { "epoch": 0.21806425711297292, "grad_norm": 0.4065610280835682, "learning_rate": 9.583035982584538e-06, "loss": 0.46237959861755373, "step": 1290 }, { "epoch": 0.21975467771074791, "grad_norm": 0.4201022803564497, "learning_rate": 9.571160861510875e-06, "loss": 0.45450439453125, "step": 1300 }, { "epoch": 0.22144509830852288, "grad_norm": 0.48129418825307496, "learning_rate": 9.559126574952477e-06, "loss": 0.45772609710693357, "step": 1310 }, { "epoch": 0.22313551890629787, "grad_norm": 0.4380826104631551, "learning_rate": 9.546933541937052e-06, "loss": 0.4615782737731934, "step": 1320 }, { "epoch": 0.22482593950407287, "grad_norm": 0.5041722965228266, "learning_rate": 9.534582187019777e-06, "loss": 0.45874805450439454, "step": 1330 }, { "epoch": 0.22651636010184784, "grad_norm": 0.5332437794142343, "learning_rate": 9.522072940268515e-06, "loss": 0.4533642292022705, "step": 1340 }, { "epoch": 0.22820678069962283, "grad_norm": 0.49071947040909053, "learning_rate": 9.509406237248847e-06, "loss": 0.4662328720092773, "step": 1350 }, { "epoch": 0.2298972012973978, "grad_norm": 0.40385017293891995, "learning_rate": 9.496582519008897e-06, "loss": 0.4652996063232422, "step": 1360 }, { "epoch": 0.2315876218951728, "grad_norm": 0.4146237415189512, "learning_rate": 9.483602232063979e-06, "loss": 0.4565859794616699, "step": 1370 }, { "epoch": 0.23327804249294778, "grad_norm": 0.46073779616681215, "learning_rate": 9.47046582838105e-06, "loss": 0.46174845695495603, "step": 1380 }, { "epoch": 0.23496846309072275, "grad_norm": 0.38101750619091945, "learning_rate": 9.45717376536297e-06, "loss": 0.4629813194274902, "step": 1390 }, { "epoch": 0.23665888368849775, "grad_norm": 0.4379618065240927, "learning_rate": 9.443726505832584e-06, "loss": 0.46460161209106443, "step": 1400 }, { "epoch": 0.23834930428627274, "grad_norm": 0.4676173735785799, "learning_rate": 9.43012451801659e-06, "loss": 0.4546971321105957, "step": 1410 }, { "epoch": 0.2400397248840477, "grad_norm": 0.4702750217508668, "learning_rate": 9.416368275529255e-06, "loss": 0.45699052810668944, "step": 1420 }, { "epoch": 0.2417301454818227, "grad_norm": 0.570311072883872, "learning_rate": 9.402458257355911e-06, "loss": 0.4610409736633301, "step": 1430 }, { "epoch": 0.24342056607959767, "grad_norm": 0.4087099940494567, "learning_rate": 9.388394947836278e-06, "loss": 0.46268315315246583, "step": 1440 }, { "epoch": 0.24511098667737266, "grad_norm": 0.596327200385323, "learning_rate": 9.374178836647609e-06, "loss": 0.4597465515136719, "step": 1450 }, { "epoch": 0.24680140727514766, "grad_norm": 0.4268602658887389, "learning_rate": 9.359810418787626e-06, "loss": 0.4541053295135498, "step": 1460 }, { "epoch": 0.24849182787292262, "grad_norm": 0.5653285477886548, "learning_rate": 9.3452901945573e-06, "loss": 0.45857672691345214, "step": 1470 }, { "epoch": 0.2501822484706976, "grad_norm": 0.7824638784414897, "learning_rate": 9.33061866954341e-06, "loss": 0.45687179565429686, "step": 1480 }, { "epoch": 0.2518726690684726, "grad_norm": 0.4685023923882284, "learning_rate": 9.31579635460096e-06, "loss": 0.45577211380004884, "step": 1490 }, { "epoch": 0.2535630896662476, "grad_norm": 0.42596174239181667, "learning_rate": 9.300823765835385e-06, "loss": 0.4546334266662598, "step": 1500 }, { "epoch": 0.25525351026402254, "grad_norm": 0.459993088579298, "learning_rate": 9.285701424584568e-06, "loss": 0.4472480773925781, "step": 1510 }, { "epoch": 0.25694393086179756, "grad_norm": 0.596362594134855, "learning_rate": 9.270429857400703e-06, "loss": 0.453325891494751, "step": 1520 }, { "epoch": 0.25863435145957253, "grad_norm": 0.4178802706307328, "learning_rate": 9.255009596031952e-06, "loss": 0.4538599967956543, "step": 1530 }, { "epoch": 0.2603247720573475, "grad_norm": 0.41372980820233307, "learning_rate": 9.239441177403938e-06, "loss": 0.45840139389038087, "step": 1540 }, { "epoch": 0.2620151926551225, "grad_norm": 0.44229655857380346, "learning_rate": 9.223725143601037e-06, "loss": 0.44556608200073244, "step": 1550 }, { "epoch": 0.2637056132528975, "grad_norm": 0.4568368063923109, "learning_rate": 9.207862041847513e-06, "loss": 0.4543326377868652, "step": 1560 }, { "epoch": 0.26539603385067245, "grad_norm": 0.3992030096330564, "learning_rate": 9.191852424488464e-06, "loss": 0.45322580337524415, "step": 1570 }, { "epoch": 0.2670864544484475, "grad_norm": 0.4800872482091329, "learning_rate": 9.175696848970579e-06, "loss": 0.448167610168457, "step": 1580 }, { "epoch": 0.26877687504622244, "grad_norm": 1.5893046541101095, "learning_rate": 9.159395877822743e-06, "loss": 0.4591785430908203, "step": 1590 }, { "epoch": 0.2704672956439974, "grad_norm": 0.7772627717928515, "learning_rate": 9.142950078636438e-06, "loss": 0.44793338775634767, "step": 1600 }, { "epoch": 0.27215771624177243, "grad_norm": 0.45054785945064246, "learning_rate": 9.126360024045987e-06, "loss": 0.45564422607421873, "step": 1610 }, { "epoch": 0.2738481368395474, "grad_norm": 0.4062144277621208, "learning_rate": 9.10962629170861e-06, "loss": 0.4568020820617676, "step": 1620 }, { "epoch": 0.27553855743732236, "grad_norm": 0.4034609731421723, "learning_rate": 9.092749464284316e-06, "loss": 0.45940208435058594, "step": 1630 }, { "epoch": 0.2772289780350974, "grad_norm": 0.4018136021331866, "learning_rate": 9.075730129415605e-06, "loss": 0.45337772369384766, "step": 1640 }, { "epoch": 0.27891939863287235, "grad_norm": 0.43962151340240485, "learning_rate": 9.058568879707024e-06, "loss": 0.45505146980285643, "step": 1650 }, { "epoch": 0.2806098192306473, "grad_norm": 0.4199289492540323, "learning_rate": 9.041266312704511e-06, "loss": 0.45896520614624026, "step": 1660 }, { "epoch": 0.2823002398284223, "grad_norm": 0.4181263154601458, "learning_rate": 9.023823030874608e-06, "loss": 0.460459041595459, "step": 1670 }, { "epoch": 0.2839906604261973, "grad_norm": 0.40795828627859493, "learning_rate": 9.006239641583471e-06, "loss": 0.45454959869384765, "step": 1680 }, { "epoch": 0.28568108102397227, "grad_norm": 0.4046378255672331, "learning_rate": 8.98851675707573e-06, "loss": 0.45649113655090334, "step": 1690 }, { "epoch": 0.28737150162174724, "grad_norm": 0.40847471276070824, "learning_rate": 8.970654994453163e-06, "loss": 0.4552486419677734, "step": 1700 }, { "epoch": 0.28906192221952226, "grad_norm": 0.38887379249470955, "learning_rate": 8.952654975653215e-06, "loss": 0.446870231628418, "step": 1710 }, { "epoch": 0.2907523428172972, "grad_norm": 0.44918248701550884, "learning_rate": 8.93451732742734e-06, "loss": 0.4545734405517578, "step": 1720 }, { "epoch": 0.2924427634150722, "grad_norm": 0.42111108783651335, "learning_rate": 8.91624268131918e-06, "loss": 0.4571378231048584, "step": 1730 }, { "epoch": 0.2941331840128472, "grad_norm": 0.4376361106159771, "learning_rate": 8.89783167364257e-06, "loss": 0.4523021697998047, "step": 1740 }, { "epoch": 0.2958236046106222, "grad_norm": 0.4125543712485479, "learning_rate": 8.879284945459388e-06, "loss": 0.45134286880493163, "step": 1750 }, { "epoch": 0.29751402520839715, "grad_norm": 0.39223691908106756, "learning_rate": 8.860603142557227e-06, "loss": 0.4513846397399902, "step": 1760 }, { "epoch": 0.29920444580617217, "grad_norm": 0.37509510978050736, "learning_rate": 8.841786915426918e-06, "loss": 0.4400909900665283, "step": 1770 }, { "epoch": 0.30089486640394714, "grad_norm": 0.44139926255235873, "learning_rate": 8.822836919239873e-06, "loss": 0.45766630172729494, "step": 1780 }, { "epoch": 0.3025852870017221, "grad_norm": 0.48830742143223577, "learning_rate": 8.803753813825271e-06, "loss": 0.45400180816650393, "step": 1790 }, { "epoch": 0.3042757075994971, "grad_norm": 0.5574507035046785, "learning_rate": 8.784538263647088e-06, "loss": 0.4435715675354004, "step": 1800 }, { "epoch": 0.3059661281972721, "grad_norm": 0.5225167208182738, "learning_rate": 8.765190937780964e-06, "loss": 0.4518153190612793, "step": 1810 }, { "epoch": 0.30765654879504706, "grad_norm": 0.456709142206432, "learning_rate": 8.74571250989089e-06, "loss": 0.4490679740905762, "step": 1820 }, { "epoch": 0.309346969392822, "grad_norm": 0.3950564064324204, "learning_rate": 8.726103658205772e-06, "loss": 0.4453883171081543, "step": 1830 }, { "epoch": 0.31103738999059705, "grad_norm": 0.43108949686604414, "learning_rate": 8.706365065495806e-06, "loss": 0.4514158248901367, "step": 1840 }, { "epoch": 0.312727810588372, "grad_norm": 0.4029582903079163, "learning_rate": 8.686497419048696e-06, "loss": 0.4506711006164551, "step": 1850 }, { "epoch": 0.314418231186147, "grad_norm": 0.47097814167708196, "learning_rate": 8.66650141064574e-06, "loss": 0.4476929664611816, "step": 1860 }, { "epoch": 0.316108651783922, "grad_norm": 0.48440126583440685, "learning_rate": 8.64637773653773e-06, "loss": 0.44446544647216796, "step": 1870 }, { "epoch": 0.31779907238169697, "grad_norm": 0.39849105565330367, "learning_rate": 8.626127097420711e-06, "loss": 0.44705805778503416, "step": 1880 }, { "epoch": 0.31948949297947193, "grad_norm": 0.43218756408209813, "learning_rate": 8.605750198411586e-06, "loss": 0.4613940238952637, "step": 1890 }, { "epoch": 0.32117991357724696, "grad_norm": 0.38906721440922765, "learning_rate": 8.585247749023567e-06, "loss": 0.4503718376159668, "step": 1900 }, { "epoch": 0.3228703341750219, "grad_norm": 0.4502618710490373, "learning_rate": 8.564620463141455e-06, "loss": 0.4474172592163086, "step": 1910 }, { "epoch": 0.3245607547727969, "grad_norm": 0.48392261273487897, "learning_rate": 8.543869058996807e-06, "loss": 0.45044708251953125, "step": 1920 }, { "epoch": 0.3262511753705719, "grad_norm": 0.46357543002429463, "learning_rate": 8.52299425914291e-06, "loss": 0.45404787063598634, "step": 1930 }, { "epoch": 0.3279415959683469, "grad_norm": 0.7728990225682476, "learning_rate": 8.501996790429618e-06, "loss": 0.45133085250854493, "step": 1940 }, { "epoch": 0.32963201656612184, "grad_norm": 0.46229314858380605, "learning_rate": 8.480877383978066e-06, "loss": 0.45330057144165037, "step": 1950 }, { "epoch": 0.33132243716389687, "grad_norm": 0.45494105017751585, "learning_rate": 8.45963677515519e-06, "loss": 0.44877138137817385, "step": 1960 }, { "epoch": 0.33301285776167183, "grad_norm": 0.4317089109386061, "learning_rate": 8.43827570354813e-06, "loss": 0.4513542175292969, "step": 1970 }, { "epoch": 0.3347032783594468, "grad_norm": 0.6488590237946437, "learning_rate": 8.416794912938483e-06, "loss": 0.4554163932800293, "step": 1980 }, { "epoch": 0.3363936989572218, "grad_norm": 0.528391137748562, "learning_rate": 8.395195151276397e-06, "loss": 0.44977054595947263, "step": 1990 }, { "epoch": 0.3380841195549968, "grad_norm": 0.5024989817984604, "learning_rate": 8.373477170654536e-06, "loss": 0.4485034942626953, "step": 2000 }, { "epoch": 0.33977454015277175, "grad_norm": 0.41678908010057425, "learning_rate": 8.351641727281882e-06, "loss": 0.45275564193725587, "step": 2010 }, { "epoch": 0.3414649607505467, "grad_norm": 0.4165453002530441, "learning_rate": 8.329689581457412e-06, "loss": 0.4469959259033203, "step": 2020 }, { "epoch": 0.34315538134832174, "grad_norm": 0.4395909846620004, "learning_rate": 8.307621497543625e-06, "loss": 0.4442157745361328, "step": 2030 }, { "epoch": 0.3448458019460967, "grad_norm": 0.5120332702550214, "learning_rate": 8.285438243939923e-06, "loss": 0.44135217666625975, "step": 2040 }, { "epoch": 0.3465362225438717, "grad_norm": 0.40470346443657984, "learning_rate": 8.263140593055856e-06, "loss": 0.4434605598449707, "step": 2050 }, { "epoch": 0.3482266431416467, "grad_norm": 0.5574860966144031, "learning_rate": 8.240729321284233e-06, "loss": 0.4367219924926758, "step": 2060 }, { "epoch": 0.34991706373942166, "grad_norm": 0.6284853204866423, "learning_rate": 8.218205208974081e-06, "loss": 0.4486133575439453, "step": 2070 }, { "epoch": 0.35160748433719663, "grad_norm": 0.5523042621914968, "learning_rate": 8.195569040403478e-06, "loss": 0.44528541564941404, "step": 2080 }, { "epoch": 0.35329790493497165, "grad_norm": 0.4861292804265429, "learning_rate": 8.172821603752244e-06, "loss": 0.4419032096862793, "step": 2090 }, { "epoch": 0.3549883255327466, "grad_norm": 0.5151777753558611, "learning_rate": 8.149963691074494e-06, "loss": 0.45229430198669435, "step": 2100 }, { "epoch": 0.3566787461305216, "grad_norm": 0.5350570897896791, "learning_rate": 8.126996098271068e-06, "loss": 0.44216156005859375, "step": 2110 }, { "epoch": 0.3583691667282966, "grad_norm": 0.4480111640687634, "learning_rate": 8.103919625061803e-06, "loss": 0.4482156753540039, "step": 2120 }, { "epoch": 0.3600595873260716, "grad_norm": 0.41191616176536405, "learning_rate": 8.080735074957706e-06, "loss": 0.450608491897583, "step": 2130 }, { "epoch": 0.36175000792384654, "grad_norm": 0.40297834092541657, "learning_rate": 8.05744325523296e-06, "loss": 0.44606647491455076, "step": 2140 }, { "epoch": 0.36344042852162156, "grad_norm": 0.4085693775301131, "learning_rate": 8.034044976896818e-06, "loss": 0.44092235565185545, "step": 2150 }, { "epoch": 0.36513084911939653, "grad_norm": 0.4579341322125244, "learning_rate": 8.01054105466538e-06, "loss": 0.4550692081451416, "step": 2160 }, { "epoch": 0.3668212697171715, "grad_norm": 0.41154951045968907, "learning_rate": 7.986932306933197e-06, "loss": 0.4456637382507324, "step": 2170 }, { "epoch": 0.36851169031494646, "grad_norm": 0.45743856816562845, "learning_rate": 7.963219555744802e-06, "loss": 0.44521183967590333, "step": 2180 }, { "epoch": 0.3702021109127215, "grad_norm": 0.45144429372810385, "learning_rate": 7.939403626766072e-06, "loss": 0.4406290531158447, "step": 2190 }, { "epoch": 0.37189253151049645, "grad_norm": 0.44161291067919384, "learning_rate": 7.915485349255477e-06, "loss": 0.4410409927368164, "step": 2200 }, { "epoch": 0.3735829521082714, "grad_norm": 0.5482442710608512, "learning_rate": 7.891465556035219e-06, "loss": 0.4488658905029297, "step": 2210 }, { "epoch": 0.37527337270604644, "grad_norm": 0.4458309105591097, "learning_rate": 7.867345083462215e-06, "loss": 0.44423704147338866, "step": 2220 }, { "epoch": 0.3769637933038214, "grad_norm": 0.41067167379424, "learning_rate": 7.843124771398997e-06, "loss": 0.4448094844818115, "step": 2230 }, { "epoch": 0.37865421390159637, "grad_norm": 0.39552535741630207, "learning_rate": 7.818805463184449e-06, "loss": 0.44344267845153806, "step": 2240 }, { "epoch": 0.3803446344993714, "grad_norm": 0.41067808788053767, "learning_rate": 7.794388005604451e-06, "loss": 0.4351670265197754, "step": 2250 }, { "epoch": 0.38203505509714636, "grad_norm": 0.47593510766741803, "learning_rate": 7.7698732488624e-06, "loss": 0.44367637634277346, "step": 2260 }, { "epoch": 0.3837254756949213, "grad_norm": 0.4086182566076896, "learning_rate": 7.745262046549588e-06, "loss": 0.44177837371826173, "step": 2270 }, { "epoch": 0.38541589629269635, "grad_norm": 0.43047766375857177, "learning_rate": 7.720555255615508e-06, "loss": 0.4466276168823242, "step": 2280 }, { "epoch": 0.3871063168904713, "grad_norm": 0.3965968852531915, "learning_rate": 7.695753736337987e-06, "loss": 0.44551968574523926, "step": 2290 }, { "epoch": 0.3887967374882463, "grad_norm": 0.4251738713536779, "learning_rate": 7.67085835229325e-06, "loss": 0.44549951553344724, "step": 2300 }, { "epoch": 0.3904871580860213, "grad_norm": 0.46815316080000746, "learning_rate": 7.645869970325848e-06, "loss": 0.43446955680847166, "step": 2310 }, { "epoch": 0.39217757868379627, "grad_norm": 0.4184962573632885, "learning_rate": 7.620789460518465e-06, "loss": 0.44701266288757324, "step": 2320 }, { "epoch": 0.39386799928157123, "grad_norm": 0.38087617164724735, "learning_rate": 7.595617696161635e-06, "loss": 0.445133113861084, "step": 2330 }, { "epoch": 0.39555841987934626, "grad_norm": 0.5453845457808255, "learning_rate": 7.570355553723325e-06, "loss": 0.44228591918945315, "step": 2340 }, { "epoch": 0.3972488404771212, "grad_norm": 0.5390485070913686, "learning_rate": 7.545003912818424e-06, "loss": 0.4409176826477051, "step": 2350 }, { "epoch": 0.3989392610748962, "grad_norm": 0.47828293578657854, "learning_rate": 7.5195636561781084e-06, "loss": 0.4372897148132324, "step": 2360 }, { "epoch": 0.40062968167267116, "grad_norm": 0.4156886189220343, "learning_rate": 7.4940356696191144e-06, "loss": 0.447946834564209, "step": 2370 }, { "epoch": 0.4023201022704462, "grad_norm": 0.4259105437856762, "learning_rate": 7.468420842012882e-06, "loss": 0.443576717376709, "step": 2380 }, { "epoch": 0.40401052286822114, "grad_norm": 0.42755879405461517, "learning_rate": 7.442720065254621e-06, "loss": 0.45192480087280273, "step": 2390 }, { "epoch": 0.4057009434659961, "grad_norm": 0.4446235368031096, "learning_rate": 7.416934234232236e-06, "loss": 0.444570255279541, "step": 2400 }, { "epoch": 0.40739136406377113, "grad_norm": 0.40743320730035465, "learning_rate": 7.3910642467951864e-06, "loss": 0.4399536609649658, "step": 2410 }, { "epoch": 0.4090817846615461, "grad_norm": 0.5746706375826369, "learning_rate": 7.36511100372321e-06, "loss": 0.43631649017333984, "step": 2420 }, { "epoch": 0.41077220525932107, "grad_norm": 0.4340142399235041, "learning_rate": 7.339075408694968e-06, "loss": 0.45034146308898926, "step": 2430 }, { "epoch": 0.4124626258570961, "grad_norm": 0.44523108344512796, "learning_rate": 7.312958368256569e-06, "loss": 0.43903651237487795, "step": 2440 }, { "epoch": 0.41415304645487105, "grad_norm": 0.5091716719016176, "learning_rate": 7.286760791790013e-06, "loss": 0.4393869400024414, "step": 2450 }, { "epoch": 0.415843467052646, "grad_norm": 0.43382179526470277, "learning_rate": 7.260483591481522e-06, "loss": 0.4424809455871582, "step": 2460 }, { "epoch": 0.41753388765042104, "grad_norm": 0.407012689135247, "learning_rate": 7.234127682289778e-06, "loss": 0.4412867546081543, "step": 2470 }, { "epoch": 0.419224308248196, "grad_norm": 0.41215616692154944, "learning_rate": 7.207693981914071e-06, "loss": 0.44841842651367186, "step": 2480 }, { "epoch": 0.420914728845971, "grad_norm": 0.4518038177652311, "learning_rate": 7.1811834107623344e-06, "loss": 0.44159650802612305, "step": 2490 }, { "epoch": 0.422605149443746, "grad_norm": 0.5163947809066302, "learning_rate": 7.154596891919105e-06, "loss": 0.4374223709106445, "step": 2500 }, { "epoch": 0.42429557004152096, "grad_norm": 0.39820310977742257, "learning_rate": 7.127935351113384e-06, "loss": 0.43600940704345703, "step": 2510 }, { "epoch": 0.42598599063929593, "grad_norm": 0.41241743553445703, "learning_rate": 7.10119971668639e-06, "loss": 0.44450817108154295, "step": 2520 }, { "epoch": 0.4276764112370709, "grad_norm": 0.425125793245871, "learning_rate": 7.074390919559249e-06, "loss": 0.4380540370941162, "step": 2530 }, { "epoch": 0.4293668318348459, "grad_norm": 0.6461727062118474, "learning_rate": 7.047509893200577e-06, "loss": 0.435422420501709, "step": 2540 }, { "epoch": 0.4310572524326209, "grad_norm": 0.5524697209445791, "learning_rate": 7.020557573593968e-06, "loss": 0.4344505310058594, "step": 2550 }, { "epoch": 0.43274767303039585, "grad_norm": 0.4106799649591827, "learning_rate": 6.993534899205418e-06, "loss": 0.43467392921447756, "step": 2560 }, { "epoch": 0.4344380936281709, "grad_norm": 0.47419462628081854, "learning_rate": 6.966442810950635e-06, "loss": 0.4437819480895996, "step": 2570 }, { "epoch": 0.43612851422594584, "grad_norm": 0.5091288622298781, "learning_rate": 6.939282252162286e-06, "loss": 0.4341723918914795, "step": 2580 }, { "epoch": 0.4378189348237208, "grad_norm": 0.40678856254540896, "learning_rate": 6.9120541685571444e-06, "loss": 0.4385373592376709, "step": 2590 }, { "epoch": 0.43950935542149583, "grad_norm": 0.40466633467374624, "learning_rate": 6.884759508203164e-06, "loss": 0.44423751831054686, "step": 2600 }, { "epoch": 0.4411997760192708, "grad_norm": 0.4150721188072644, "learning_rate": 6.857399221486467e-06, "loss": 0.44628586769104006, "step": 2610 }, { "epoch": 0.44289019661704576, "grad_norm": 0.4409008284827818, "learning_rate": 6.8299742610782535e-06, "loss": 0.44063844680786135, "step": 2620 }, { "epoch": 0.4445806172148208, "grad_norm": 0.4430457862634213, "learning_rate": 6.802485581901626e-06, "loss": 0.4308623313903809, "step": 2630 }, { "epoch": 0.44627103781259575, "grad_norm": 0.4592560146145528, "learning_rate": 6.774934141098344e-06, "loss": 0.4483074188232422, "step": 2640 }, { "epoch": 0.4479614584103707, "grad_norm": 0.4182419384464618, "learning_rate": 6.747320897995493e-06, "loss": 0.4336414813995361, "step": 2650 }, { "epoch": 0.44965187900814574, "grad_norm": 0.438063822803867, "learning_rate": 6.719646814072084e-06, "loss": 0.43684959411621094, "step": 2660 }, { "epoch": 0.4513422996059207, "grad_norm": 0.42289652128353217, "learning_rate": 6.691912852925574e-06, "loss": 0.44373302459716796, "step": 2670 }, { "epoch": 0.45303272020369567, "grad_norm": 0.4036390318140732, "learning_rate": 6.664119980238315e-06, "loss": 0.44059133529663086, "step": 2680 }, { "epoch": 0.4547231408014707, "grad_norm": 0.43302204039469716, "learning_rate": 6.636269163743928e-06, "loss": 0.44221057891845705, "step": 2690 }, { "epoch": 0.45641356139924566, "grad_norm": 0.40358739632993024, "learning_rate": 6.608361373193608e-06, "loss": 0.44517173767089846, "step": 2700 }, { "epoch": 0.4581039819970206, "grad_norm": 0.4358355709244199, "learning_rate": 6.580397580322358e-06, "loss": 0.42831597328186033, "step": 2710 }, { "epoch": 0.4597944025947956, "grad_norm": 0.43280631627772653, "learning_rate": 6.55237875881515e-06, "loss": 0.4354698181152344, "step": 2720 }, { "epoch": 0.4614848231925706, "grad_norm": 0.4245846691082156, "learning_rate": 6.52430588427303e-06, "loss": 0.4449951171875, "step": 2730 }, { "epoch": 0.4631752437903456, "grad_norm": 0.4314018816383608, "learning_rate": 6.49617993417914e-06, "loss": 0.44388618469238283, "step": 2740 }, { "epoch": 0.46486566438812055, "grad_norm": 0.44826365164777787, "learning_rate": 6.468001887864688e-06, "loss": 0.44153881072998047, "step": 2750 }, { "epoch": 0.46655608498589557, "grad_norm": 0.43908236075177787, "learning_rate": 6.43977272647484e-06, "loss": 0.44250946044921874, "step": 2760 }, { "epoch": 0.46824650558367054, "grad_norm": 0.3850389053146295, "learning_rate": 6.4114934329345715e-06, "loss": 0.4358978271484375, "step": 2770 }, { "epoch": 0.4699369261814455, "grad_norm": 0.6097960037471896, "learning_rate": 6.383164991914424e-06, "loss": 0.43413305282592773, "step": 2780 }, { "epoch": 0.4716273467792205, "grad_norm": 0.4461689157475998, "learning_rate": 6.354788389796238e-06, "loss": 0.4385429859161377, "step": 2790 }, { "epoch": 0.4733177673769955, "grad_norm": 0.5441220360355755, "learning_rate": 6.326364614638794e-06, "loss": 0.43816003799438474, "step": 2800 }, { "epoch": 0.47500818797477046, "grad_norm": 0.41572374444805754, "learning_rate": 6.297894656143415e-06, "loss": 0.438665771484375, "step": 2810 }, { "epoch": 0.4766986085725455, "grad_norm": 0.4571323669570309, "learning_rate": 6.269379505619504e-06, "loss": 0.4369631767272949, "step": 2820 }, { "epoch": 0.47838902917032045, "grad_norm": 0.4264556810440158, "learning_rate": 6.240820155950027e-06, "loss": 0.44220762252807616, "step": 2830 }, { "epoch": 0.4800794497680954, "grad_norm": 0.397942977911585, "learning_rate": 6.2122176015569405e-06, "loss": 0.43413677215576174, "step": 2840 }, { "epoch": 0.48176987036587043, "grad_norm": 0.45124864351563504, "learning_rate": 6.183572838366572e-06, "loss": 0.4400357246398926, "step": 2850 }, { "epoch": 0.4834602909636454, "grad_norm": 0.42428517380688374, "learning_rate": 6.1548868637749306e-06, "loss": 0.4344414234161377, "step": 2860 }, { "epoch": 0.48515071156142037, "grad_norm": 0.41383689495932036, "learning_rate": 6.126160676612992e-06, "loss": 0.4421424388885498, "step": 2870 }, { "epoch": 0.48684113215919533, "grad_norm": 0.4529042500836241, "learning_rate": 6.097395277111909e-06, "loss": 0.43562684059143064, "step": 2880 }, { "epoch": 0.48853155275697036, "grad_norm": 0.436461736013555, "learning_rate": 6.0685916668681925e-06, "loss": 0.4411266326904297, "step": 2890 }, { "epoch": 0.4902219733547453, "grad_norm": 0.4660224260239127, "learning_rate": 6.039750848808826e-06, "loss": 0.4391491889953613, "step": 2900 }, { "epoch": 0.4919123939525203, "grad_norm": 0.4265622760480254, "learning_rate": 6.010873827156352e-06, "loss": 0.43004140853881834, "step": 2910 }, { "epoch": 0.4936028145502953, "grad_norm": 0.5891326466041507, "learning_rate": 5.981961607393905e-06, "loss": 0.43016576766967773, "step": 2920 }, { "epoch": 0.4952932351480703, "grad_norm": 0.42731374205948136, "learning_rate": 5.953015196230201e-06, "loss": 0.437261962890625, "step": 2930 }, { "epoch": 0.49698365574584524, "grad_norm": 0.40601940500233463, "learning_rate": 5.924035601564478e-06, "loss": 0.43418092727661134, "step": 2940 }, { "epoch": 0.49867407634362027, "grad_norm": 0.44316397124344875, "learning_rate": 5.895023832451414e-06, "loss": 0.4333051681518555, "step": 2950 }, { "epoch": 0.5003644969413952, "grad_norm": 0.42103355983145363, "learning_rate": 5.865980899065979e-06, "loss": 0.4366158485412598, "step": 2960 }, { "epoch": 0.5020549175391702, "grad_norm": 0.4189038279835885, "learning_rate": 5.836907812668267e-06, "loss": 0.43808746337890625, "step": 2970 }, { "epoch": 0.5037453381369452, "grad_norm": 0.4045771583980379, "learning_rate": 5.8078055855682904e-06, "loss": 0.4391347885131836, "step": 2980 }, { "epoch": 0.5054357587347201, "grad_norm": 0.6123791815482257, "learning_rate": 5.778675231090715e-06, "loss": 0.4369372844696045, "step": 2990 }, { "epoch": 0.5071261793324952, "grad_norm": 0.4504376197363758, "learning_rate": 5.749517763539601e-06, "loss": 0.42980470657348635, "step": 3000 }, { "epoch": 0.5088165999302702, "grad_norm": 0.43408260936457443, "learning_rate": 5.720334198163063e-06, "loss": 0.4386304378509521, "step": 3010 }, { "epoch": 0.5105070205280451, "grad_norm": 0.4225926173781409, "learning_rate": 5.6911255511179295e-06, "loss": 0.4303256034851074, "step": 3020 }, { "epoch": 0.5121974411258201, "grad_norm": 0.4464137671609211, "learning_rate": 5.661892839434362e-06, "loss": 0.4334456443786621, "step": 3030 }, { "epoch": 0.5138878617235951, "grad_norm": 0.4264014041223435, "learning_rate": 5.63263708098044e-06, "loss": 0.4284989833831787, "step": 3040 }, { "epoch": 0.51557828232137, "grad_norm": 0.4469737217816344, "learning_rate": 5.603359294426717e-06, "loss": 0.4342545986175537, "step": 3050 }, { "epoch": 0.5172687029191451, "grad_norm": 0.42589508183124813, "learning_rate": 5.574060499210759e-06, "loss": 0.4403389930725098, "step": 3060 }, { "epoch": 0.5189591235169201, "grad_norm": 0.5297222464707912, "learning_rate": 5.54474171550164e-06, "loss": 0.435770320892334, "step": 3070 }, { "epoch": 0.520649544114695, "grad_norm": 0.3800451330600584, "learning_rate": 5.515403964164421e-06, "loss": 0.43670501708984377, "step": 3080 }, { "epoch": 0.52233996471247, "grad_norm": 0.5333744177944942, "learning_rate": 5.486048266724609e-06, "loss": 0.43523521423339845, "step": 3090 }, { "epoch": 0.524030385310245, "grad_norm": 0.5434046403400835, "learning_rate": 5.4566756453325835e-06, "loss": 0.4394557952880859, "step": 3100 }, { "epoch": 0.52572080590802, "grad_norm": 0.6610762094695175, "learning_rate": 5.427287122728008e-06, "loss": 0.4303136348724365, "step": 3110 }, { "epoch": 0.527411226505795, "grad_norm": 0.4682270464152013, "learning_rate": 5.39788372220422e-06, "loss": 0.4351104736328125, "step": 3120 }, { "epoch": 0.52910164710357, "grad_norm": 0.5276146962130529, "learning_rate": 5.368466467572595e-06, "loss": 0.42786803245544436, "step": 3130 }, { "epoch": 0.5307920677013449, "grad_norm": 0.3898103195251585, "learning_rate": 5.339036383126905e-06, "loss": 0.4315065383911133, "step": 3140 }, { "epoch": 0.5324824882991199, "grad_norm": 0.49019068218850664, "learning_rate": 5.309594493607646e-06, "loss": 0.4383066177368164, "step": 3150 }, { "epoch": 0.534172908896895, "grad_norm": 0.4028365280209736, "learning_rate": 5.280141824166363e-06, "loss": 0.43825907707214357, "step": 3160 }, { "epoch": 0.5358633294946699, "grad_norm": 0.4094965220781369, "learning_rate": 5.250679400329953e-06, "loss": 0.43253560066223146, "step": 3170 }, { "epoch": 0.5375537500924449, "grad_norm": 0.40846275540289434, "learning_rate": 5.221208247964951e-06, "loss": 0.43136014938354494, "step": 3180 }, { "epoch": 0.5392441706902199, "grad_norm": 0.49954149484857235, "learning_rate": 5.191729393241822e-06, "loss": 0.4329479694366455, "step": 3190 }, { "epoch": 0.5409345912879948, "grad_norm": 0.3931028479875822, "learning_rate": 5.162243862599221e-06, "loss": 0.4355682373046875, "step": 3200 }, { "epoch": 0.5426250118857698, "grad_norm": 0.42120237348360007, "learning_rate": 5.132752682708252e-06, "loss": 0.43157129287719725, "step": 3210 }, { "epoch": 0.5443154324835449, "grad_norm": 0.43276553833311104, "learning_rate": 5.103256880436724e-06, "loss": 0.4270349025726318, "step": 3220 }, { "epoch": 0.5460058530813198, "grad_norm": 0.5211667786145312, "learning_rate": 5.073757482813397e-06, "loss": 0.4324329853057861, "step": 3230 }, { "epoch": 0.5476962736790948, "grad_norm": 0.4755287439381494, "learning_rate": 5.044255516992218e-06, "loss": 0.435423755645752, "step": 3240 }, { "epoch": 0.5493866942768698, "grad_norm": 0.4637604896810374, "learning_rate": 5.014752010216558e-06, "loss": 0.4313016891479492, "step": 3250 }, { "epoch": 0.5510771148746447, "grad_norm": 0.40550687413116615, "learning_rate": 4.9852479897834424e-06, "loss": 0.4341496467590332, "step": 3260 }, { "epoch": 0.5527675354724197, "grad_norm": 0.4657009916810442, "learning_rate": 4.955744483007784e-06, "loss": 0.42819700241088865, "step": 3270 }, { "epoch": 0.5544579560701948, "grad_norm": 0.4956322489349777, "learning_rate": 4.926242517186603e-06, "loss": 0.4234212875366211, "step": 3280 }, { "epoch": 0.5561483766679697, "grad_norm": 0.40681714039435574, "learning_rate": 4.896743119563279e-06, "loss": 0.4332298278808594, "step": 3290 }, { "epoch": 0.5578387972657447, "grad_norm": 0.38587035769576944, "learning_rate": 4.867247317291751e-06, "loss": 0.438218355178833, "step": 3300 }, { "epoch": 0.5595292178635196, "grad_norm": 0.4551274587379102, "learning_rate": 4.8377561374007805e-06, "loss": 0.43701925277709963, "step": 3310 }, { "epoch": 0.5612196384612946, "grad_norm": 0.4040304584035835, "learning_rate": 4.808270606758179e-06, "loss": 0.4318737030029297, "step": 3320 }, { "epoch": 0.5629100590590697, "grad_norm": 0.4512054219017183, "learning_rate": 4.77879175203505e-06, "loss": 0.43484320640563967, "step": 3330 }, { "epoch": 0.5646004796568446, "grad_norm": 0.4082489988390108, "learning_rate": 4.74932059967005e-06, "loss": 0.4284003734588623, "step": 3340 }, { "epoch": 0.5662909002546196, "grad_norm": 0.42276817092848296, "learning_rate": 4.7198581758336396e-06, "loss": 0.43506608009338377, "step": 3350 }, { "epoch": 0.5679813208523946, "grad_norm": 0.42621910508451183, "learning_rate": 4.690405506392355e-06, "loss": 0.43180079460144044, "step": 3360 }, { "epoch": 0.5696717414501695, "grad_norm": 0.58603996179948, "learning_rate": 4.660963616873096e-06, "loss": 0.43529691696166994, "step": 3370 }, { "epoch": 0.5713621620479445, "grad_norm": 0.43475498420426534, "learning_rate": 4.631533532427405e-06, "loss": 0.42524237632751466, "step": 3380 }, { "epoch": 0.5730525826457196, "grad_norm": 0.44296635506788934, "learning_rate": 4.6021162777957815e-06, "loss": 0.42964849472045896, "step": 3390 }, { "epoch": 0.5747430032434945, "grad_norm": 0.4739115865696253, "learning_rate": 4.572712877271993e-06, "loss": 0.4318963050842285, "step": 3400 }, { "epoch": 0.5764334238412695, "grad_norm": 0.4251182604533514, "learning_rate": 4.543324354667418e-06, "loss": 0.429244327545166, "step": 3410 }, { "epoch": 0.5781238444390445, "grad_norm": 0.4422586426616063, "learning_rate": 4.513951733275395e-06, "loss": 0.42932448387145994, "step": 3420 }, { "epoch": 0.5798142650368194, "grad_norm": 0.4691091599178207, "learning_rate": 4.48459603583558e-06, "loss": 0.4259922027587891, "step": 3430 }, { "epoch": 0.5815046856345945, "grad_norm": 0.5464764518023947, "learning_rate": 4.455258284498363e-06, "loss": 0.42319440841674805, "step": 3440 }, { "epoch": 0.5831951062323695, "grad_norm": 0.41304172848458615, "learning_rate": 4.42593950078924e-06, "loss": 0.4340841293334961, "step": 3450 }, { "epoch": 0.5848855268301444, "grad_norm": 0.44352336434522394, "learning_rate": 4.396640705573284e-06, "loss": 0.42516536712646485, "step": 3460 }, { "epoch": 0.5865759474279194, "grad_norm": 0.4396217564214065, "learning_rate": 4.367362919019561e-06, "loss": 0.43398313522338866, "step": 3470 }, { "epoch": 0.5882663680256944, "grad_norm": 0.4639850426028468, "learning_rate": 4.338107160565639e-06, "loss": 0.42090563774108886, "step": 3480 }, { "epoch": 0.5899567886234693, "grad_norm": 0.49730143479919997, "learning_rate": 4.308874448882072e-06, "loss": 0.42863998413085935, "step": 3490 }, { "epoch": 0.5916472092212444, "grad_norm": 0.4320500925423939, "learning_rate": 4.279665801836938e-06, "loss": 0.43216619491577146, "step": 3500 }, { "epoch": 0.5933376298190194, "grad_norm": 0.38405843815880253, "learning_rate": 4.250482236460399e-06, "loss": 0.4306344985961914, "step": 3510 }, { "epoch": 0.5950280504167943, "grad_norm": 0.41562127671231863, "learning_rate": 4.2213247689092846e-06, "loss": 0.4339097499847412, "step": 3520 }, { "epoch": 0.5967184710145693, "grad_norm": 0.5206266110542977, "learning_rate": 4.192194414431712e-06, "loss": 0.42906818389892576, "step": 3530 }, { "epoch": 0.5984088916123443, "grad_norm": 0.44079942953014034, "learning_rate": 4.163092187331733e-06, "loss": 0.42707481384277346, "step": 3540 }, { "epoch": 0.6000993122101193, "grad_norm": 0.5283111002109292, "learning_rate": 4.1340191009340215e-06, "loss": 0.43108377456665037, "step": 3550 }, { "epoch": 0.6017897328078943, "grad_norm": 0.497346167548402, "learning_rate": 4.104976167548587e-06, "loss": 0.42334275245666503, "step": 3560 }, { "epoch": 0.6034801534056693, "grad_norm": 0.41994436685076686, "learning_rate": 4.075964398435522e-06, "loss": 0.4256128787994385, "step": 3570 }, { "epoch": 0.6051705740034442, "grad_norm": 0.44264493338347205, "learning_rate": 4.046984803769801e-06, "loss": 0.42928495407104494, "step": 3580 }, { "epoch": 0.6068609946012192, "grad_norm": 0.45103704784300114, "learning_rate": 4.018038392606096e-06, "loss": 0.4291574478149414, "step": 3590 }, { "epoch": 0.6085514151989942, "grad_norm": 0.4052714162982127, "learning_rate": 3.98912617284365e-06, "loss": 0.42896642684936526, "step": 3600 }, { "epoch": 0.6102418357967692, "grad_norm": 0.40164760162129637, "learning_rate": 3.960249151191178e-06, "loss": 0.4323906898498535, "step": 3610 }, { "epoch": 0.6119322563945442, "grad_norm": 0.4505206662224979, "learning_rate": 3.931408333131809e-06, "loss": 0.42946605682373046, "step": 3620 }, { "epoch": 0.6136226769923192, "grad_norm": 0.38809773635856676, "learning_rate": 3.902604722888092e-06, "loss": 0.4359119892120361, "step": 3630 }, { "epoch": 0.6153130975900941, "grad_norm": 0.4515178914649961, "learning_rate": 3.873839323387009e-06, "loss": 0.42914657592773436, "step": 3640 }, { "epoch": 0.6170035181878691, "grad_norm": 0.48547177051092294, "learning_rate": 3.845113136225072e-06, "loss": 0.43363237380981445, "step": 3650 }, { "epoch": 0.618693938785644, "grad_norm": 0.4431323439189628, "learning_rate": 3.81642716163343e-06, "loss": 0.427580738067627, "step": 3660 }, { "epoch": 0.6203843593834191, "grad_norm": 0.40919513437853744, "learning_rate": 3.7877823984430608e-06, "loss": 0.42395830154418945, "step": 3670 }, { "epoch": 0.6220747799811941, "grad_norm": 0.445422416784889, "learning_rate": 3.7591798440499755e-06, "loss": 0.4330305099487305, "step": 3680 }, { "epoch": 0.623765200578969, "grad_norm": 0.42604546747481925, "learning_rate": 3.7306204943804973e-06, "loss": 0.4383516311645508, "step": 3690 }, { "epoch": 0.625455621176744, "grad_norm": 0.4655383013749577, "learning_rate": 3.7021053438565863e-06, "loss": 0.43866143226623533, "step": 3700 }, { "epoch": 0.627146041774519, "grad_norm": 0.3989287897287261, "learning_rate": 3.673635385361206e-06, "loss": 0.43120598793029785, "step": 3710 }, { "epoch": 0.628836462372294, "grad_norm": 0.43139571316850417, "learning_rate": 3.6452116102037625e-06, "loss": 0.43346233367919923, "step": 3720 }, { "epoch": 0.630526882970069, "grad_norm": 0.43077204056512486, "learning_rate": 3.6168350080855785e-06, "loss": 0.4254899978637695, "step": 3730 }, { "epoch": 0.632217303567844, "grad_norm": 0.4116188525169869, "learning_rate": 3.5885065670654306e-06, "loss": 0.43075990676879883, "step": 3740 }, { "epoch": 0.6339077241656189, "grad_norm": 0.48980622755885445, "learning_rate": 3.560227273525162e-06, "loss": 0.43142094612121584, "step": 3750 }, { "epoch": 0.6355981447633939, "grad_norm": 0.45257900417419006, "learning_rate": 3.5319981121353133e-06, "loss": 0.4283766746520996, "step": 3760 }, { "epoch": 0.637288565361169, "grad_norm": 0.43598385649668175, "learning_rate": 3.503820065820861e-06, "loss": 0.4272482395172119, "step": 3770 }, { "epoch": 0.6389789859589439, "grad_norm": 0.4436945578231624, "learning_rate": 3.47569411572697e-06, "loss": 0.4362170696258545, "step": 3780 }, { "epoch": 0.6406694065567189, "grad_norm": 0.4423636015024226, "learning_rate": 3.447621241184852e-06, "loss": 0.432710075378418, "step": 3790 }, { "epoch": 0.6423598271544939, "grad_norm": 0.5550229086437715, "learning_rate": 3.4196024196776452e-06, "loss": 0.42471537590026853, "step": 3800 }, { "epoch": 0.6440502477522688, "grad_norm": 0.4441992666501525, "learning_rate": 3.391638626806393e-06, "loss": 0.429352855682373, "step": 3810 }, { "epoch": 0.6457406683500438, "grad_norm": 0.3852461787826987, "learning_rate": 3.363730836256074e-06, "loss": 0.42467470169067384, "step": 3820 }, { "epoch": 0.6474310889478189, "grad_norm": 0.47623628052793926, "learning_rate": 3.3358800197616856e-06, "loss": 0.4357749938964844, "step": 3830 }, { "epoch": 0.6491215095455938, "grad_norm": 0.42486869443758596, "learning_rate": 3.3080871470744273e-06, "loss": 0.4325972557067871, "step": 3840 }, { "epoch": 0.6508119301433688, "grad_norm": 0.5004616407501641, "learning_rate": 3.280353185927918e-06, "loss": 0.42637929916381834, "step": 3850 }, { "epoch": 0.6525023507411438, "grad_norm": 0.4596076466050951, "learning_rate": 3.252679102004509e-06, "loss": 0.4192366600036621, "step": 3860 }, { "epoch": 0.6541927713389187, "grad_norm": 0.4598817421019848, "learning_rate": 3.225065858901658e-06, "loss": 0.42298049926757814, "step": 3870 }, { "epoch": 0.6558831919366938, "grad_norm": 0.4869637478788273, "learning_rate": 3.197514418098375e-06, "loss": 0.4236114501953125, "step": 3880 }, { "epoch": 0.6575736125344688, "grad_norm": 0.4191779853522958, "learning_rate": 3.170025738921748e-06, "loss": 0.4369206428527832, "step": 3890 }, { "epoch": 0.6592640331322437, "grad_norm": 0.46354651428620175, "learning_rate": 3.142600778513534e-06, "loss": 0.4308767795562744, "step": 3900 }, { "epoch": 0.6609544537300187, "grad_norm": 0.433362558403198, "learning_rate": 3.1152404917968376e-06, "loss": 0.4263267517089844, "step": 3910 }, { "epoch": 0.6626448743277937, "grad_norm": 0.4183955388560034, "learning_rate": 3.087945831442859e-06, "loss": 0.4287719249725342, "step": 3920 }, { "epoch": 0.6643352949255686, "grad_norm": 0.4569797787084886, "learning_rate": 3.0607177478377146e-06, "loss": 0.42838282585144044, "step": 3930 }, { "epoch": 0.6660257155233437, "grad_norm": 0.405528119203322, "learning_rate": 3.033557189049367e-06, "loss": 0.4259345054626465, "step": 3940 }, { "epoch": 0.6677161361211187, "grad_norm": 0.4445510274119577, "learning_rate": 3.006465100794583e-06, "loss": 0.4231560230255127, "step": 3950 }, { "epoch": 0.6694065567188936, "grad_norm": 0.3956379066918208, "learning_rate": 2.979442426406034e-06, "loss": 0.42560606002807616, "step": 3960 }, { "epoch": 0.6710969773166686, "grad_norm": 0.487413886720085, "learning_rate": 2.9524901067994238e-06, "loss": 0.4272177696228027, "step": 3970 }, { "epoch": 0.6727873979144436, "grad_norm": 0.4557490354463524, "learning_rate": 2.9256090804407522e-06, "loss": 0.4238264083862305, "step": 3980 }, { "epoch": 0.6744778185122186, "grad_norm": 0.46762585507399806, "learning_rate": 2.8988002833136114e-06, "loss": 0.4286961555480957, "step": 3990 }, { "epoch": 0.6761682391099936, "grad_norm": 0.459519945726867, "learning_rate": 2.872064648886618e-06, "loss": 0.4258302688598633, "step": 4000 }, { "epoch": 0.6778586597077685, "grad_norm": 0.4158599550893325, "learning_rate": 2.845403108080895e-06, "loss": 0.42467894554138186, "step": 4010 }, { "epoch": 0.6795490803055435, "grad_norm": 0.764661370612818, "learning_rate": 2.8188165892376655e-06, "loss": 0.42564783096313474, "step": 4020 }, { "epoch": 0.6812395009033185, "grad_norm": 0.46090960281188625, "learning_rate": 2.792306018085932e-06, "loss": 0.4252346992492676, "step": 4030 }, { "epoch": 0.6829299215010934, "grad_norm": 0.5052241362723152, "learning_rate": 2.7658723177102243e-06, "loss": 0.42507052421569824, "step": 4040 }, { "epoch": 0.6846203420988685, "grad_norm": 0.4969762444434996, "learning_rate": 2.73951640851848e-06, "loss": 0.42835030555725095, "step": 4050 }, { "epoch": 0.6863107626966435, "grad_norm": 0.3887380837718711, "learning_rate": 2.713239208209989e-06, "loss": 0.4258549690246582, "step": 4060 }, { "epoch": 0.6880011832944184, "grad_norm": 0.3783428232134585, "learning_rate": 2.6870416317434334e-06, "loss": 0.4262125015258789, "step": 4070 }, { "epoch": 0.6896916038921934, "grad_norm": 0.41965865205854214, "learning_rate": 2.6609245913050345e-06, "loss": 0.42023792266845705, "step": 4080 }, { "epoch": 0.6913820244899684, "grad_norm": 0.40186299701928146, "learning_rate": 2.63488899627679e-06, "loss": 0.42939143180847167, "step": 4090 }, { "epoch": 0.6930724450877433, "grad_norm": 0.5147185623048582, "learning_rate": 2.6089357532048152e-06, "loss": 0.42742137908935546, "step": 4100 }, { "epoch": 0.6947628656855184, "grad_norm": 0.4004984558472973, "learning_rate": 2.583065765767766e-06, "loss": 0.42586841583251955, "step": 4110 }, { "epoch": 0.6964532862832934, "grad_norm": 0.49746517756929526, "learning_rate": 2.5572799347453813e-06, "loss": 0.4294744968414307, "step": 4120 }, { "epoch": 0.6981437068810683, "grad_norm": 0.45191836993834755, "learning_rate": 2.531579157987119e-06, "loss": 0.42096834182739257, "step": 4130 }, { "epoch": 0.6998341274788433, "grad_norm": 0.4388169816996864, "learning_rate": 2.505964330380886e-06, "loss": 0.43329315185546874, "step": 4140 }, { "epoch": 0.7015245480766183, "grad_norm": 0.4646683084770869, "learning_rate": 2.480436343821892e-06, "loss": 0.4317659378051758, "step": 4150 }, { "epoch": 0.7032149686743933, "grad_norm": 0.4144813387224547, "learning_rate": 2.4549960871815777e-06, "loss": 0.4222762107849121, "step": 4160 }, { "epoch": 0.7049053892721683, "grad_norm": 1.149880889208155, "learning_rate": 2.4296444462766766e-06, "loss": 0.426224422454834, "step": 4170 }, { "epoch": 0.7065958098699433, "grad_norm": 0.41212484110906356, "learning_rate": 2.4043823038383675e-06, "loss": 0.43084254264831545, "step": 4180 }, { "epoch": 0.7082862304677182, "grad_norm": 0.40025278610336024, "learning_rate": 2.3792105394815347e-06, "loss": 0.42110452651977537, "step": 4190 }, { "epoch": 0.7099766510654932, "grad_norm": 0.4205397675832194, "learning_rate": 2.3541300296741535e-06, "loss": 0.42673492431640625, "step": 4200 }, { "epoch": 0.7116670716632683, "grad_norm": 0.43501797735562275, "learning_rate": 2.3291416477067493e-06, "loss": 0.4395922660827637, "step": 4210 }, { "epoch": 0.7133574922610432, "grad_norm": 0.4286409355798364, "learning_rate": 2.304246263662014e-06, "loss": 0.42594170570373535, "step": 4220 }, { "epoch": 0.7150479128588182, "grad_norm": 0.39409267212647164, "learning_rate": 2.2794447443844935e-06, "loss": 0.42094078063964846, "step": 4230 }, { "epoch": 0.7167383334565932, "grad_norm": 0.448306587655997, "learning_rate": 2.254737953450413e-06, "loss": 0.4300067901611328, "step": 4240 }, { "epoch": 0.7184287540543681, "grad_norm": 0.41278204876846897, "learning_rate": 2.230126751137604e-06, "loss": 0.42948102951049805, "step": 4250 }, { "epoch": 0.7201191746521431, "grad_norm": 0.8234588798535949, "learning_rate": 2.2056119943955493e-06, "loss": 0.423651123046875, "step": 4260 }, { "epoch": 0.7218095952499182, "grad_norm": 0.4295386395064719, "learning_rate": 2.181194536815553e-06, "loss": 0.4270059585571289, "step": 4270 }, { "epoch": 0.7235000158476931, "grad_norm": 0.4845164711653868, "learning_rate": 2.1568752286010046e-06, "loss": 0.42238831520080566, "step": 4280 }, { "epoch": 0.7251904364454681, "grad_norm": 0.4188111308447989, "learning_rate": 2.132654916537786e-06, "loss": 0.42424306869506834, "step": 4290 }, { "epoch": 0.7268808570432431, "grad_norm": 0.4083567150439788, "learning_rate": 2.108534443964785e-06, "loss": 0.4173469066619873, "step": 4300 }, { "epoch": 0.728571277641018, "grad_norm": 0.49701750342474227, "learning_rate": 2.0845146507445234e-06, "loss": 0.4256436347961426, "step": 4310 }, { "epoch": 0.7302616982387931, "grad_norm": 0.4556476468988333, "learning_rate": 2.0605963732339294e-06, "loss": 0.4249898433685303, "step": 4320 }, { "epoch": 0.7319521188365681, "grad_norm": 0.390956985533463, "learning_rate": 2.0367804442551987e-06, "loss": 0.4315620422363281, "step": 4330 }, { "epoch": 0.733642539434343, "grad_norm": 0.3993855335495715, "learning_rate": 2.013067693066805e-06, "loss": 0.4309099197387695, "step": 4340 }, { "epoch": 0.735332960032118, "grad_norm": 0.4126264964735023, "learning_rate": 1.989458945334623e-06, "loss": 0.42850918769836427, "step": 4350 }, { "epoch": 0.7370233806298929, "grad_norm": 0.44736633124886904, "learning_rate": 1.9659550231031816e-06, "loss": 0.4222278594970703, "step": 4360 }, { "epoch": 0.7387138012276679, "grad_norm": 0.42368325672394164, "learning_rate": 1.942556744767044e-06, "loss": 0.42037811279296877, "step": 4370 }, { "epoch": 0.740404221825443, "grad_norm": 0.5501466174054114, "learning_rate": 1.919264925042295e-06, "loss": 0.41927204132080076, "step": 4380 }, { "epoch": 0.7420946424232179, "grad_norm": 0.39918622365681766, "learning_rate": 1.8960803749381973e-06, "loss": 0.42456836700439454, "step": 4390 }, { "epoch": 0.7437850630209929, "grad_norm": 0.43304426432279913, "learning_rate": 1.8730039017289326e-06, "loss": 0.4330174446105957, "step": 4400 }, { "epoch": 0.7454754836187679, "grad_norm": 0.39829603640659866, "learning_rate": 1.8500363089255074e-06, "loss": 0.4192543029785156, "step": 4410 }, { "epoch": 0.7471659042165428, "grad_norm": 0.41399088851086324, "learning_rate": 1.827178396247759e-06, "loss": 0.42575607299804685, "step": 4420 }, { "epoch": 0.7488563248143179, "grad_norm": 0.7360790909254504, "learning_rate": 1.8044309595965225e-06, "loss": 0.4194206237792969, "step": 4430 }, { "epoch": 0.7505467454120929, "grad_norm": 0.44707313603093624, "learning_rate": 1.7817947910259197e-06, "loss": 0.42949066162109373, "step": 4440 }, { "epoch": 0.7522371660098678, "grad_norm": 0.6041320702862265, "learning_rate": 1.7592706787157682e-06, "loss": 0.4255552291870117, "step": 4450 }, { "epoch": 0.7539275866076428, "grad_norm": 0.45773641763660167, "learning_rate": 1.7368594069441452e-06, "loss": 0.43103628158569335, "step": 4460 }, { "epoch": 0.7556180072054178, "grad_norm": 0.41301944176532956, "learning_rate": 1.7145617560600775e-06, "loss": 0.4204230785369873, "step": 4470 }, { "epoch": 0.7573084278031927, "grad_norm": 0.6409284079564223, "learning_rate": 1.6923785024563755e-06, "loss": 0.43199663162231444, "step": 4480 }, { "epoch": 0.7589988484009678, "grad_norm": 0.42176176532876736, "learning_rate": 1.670310418542589e-06, "loss": 0.4303304672241211, "step": 4490 }, { "epoch": 0.7606892689987428, "grad_norm": 0.41067768414220995, "learning_rate": 1.6483582727181203e-06, "loss": 0.42124075889587403, "step": 4500 }, { "epoch": 0.7623796895965177, "grad_norm": 0.4783665056517865, "learning_rate": 1.626522829345466e-06, "loss": 0.4269865989685059, "step": 4510 }, { "epoch": 0.7640701101942927, "grad_norm": 0.5049031164710026, "learning_rate": 1.604804848723603e-06, "loss": 0.426270866394043, "step": 4520 }, { "epoch": 0.7657605307920677, "grad_norm": 0.4857945955434734, "learning_rate": 1.583205087061519e-06, "loss": 0.4200442314147949, "step": 4530 }, { "epoch": 0.7674509513898427, "grad_norm": 0.6317053106938714, "learning_rate": 1.5617242964518737e-06, "loss": 0.4251349925994873, "step": 4540 }, { "epoch": 0.7691413719876177, "grad_norm": 0.44940467166388814, "learning_rate": 1.5403632248448126e-06, "loss": 0.4180570125579834, "step": 4550 }, { "epoch": 0.7708317925853927, "grad_norm": 0.4779559868678119, "learning_rate": 1.5191226160219353e-06, "loss": 0.42661218643188475, "step": 4560 }, { "epoch": 0.7725222131831676, "grad_norm": 0.48731681458704584, "learning_rate": 1.4980032095703812e-06, "loss": 0.4164596080780029, "step": 4570 }, { "epoch": 0.7742126337809426, "grad_norm": 0.42493021443963513, "learning_rate": 1.4770057408570932e-06, "loss": 0.42499027252197263, "step": 4580 }, { "epoch": 0.7759030543787176, "grad_norm": 0.4461752465394288, "learning_rate": 1.4561309410031927e-06, "loss": 0.4126904964447021, "step": 4590 }, { "epoch": 0.7775934749764926, "grad_norm": 0.45608491001161766, "learning_rate": 1.4353795368585455e-06, "loss": 0.42144598960876467, "step": 4600 }, { "epoch": 0.7792838955742676, "grad_norm": 0.49822249900794946, "learning_rate": 1.4147522509764354e-06, "loss": 0.43109354972839353, "step": 4610 }, { "epoch": 0.7809743161720426, "grad_norm": 0.4695623975406333, "learning_rate": 1.3942498015884148e-06, "loss": 0.42487325668334963, "step": 4620 }, { "epoch": 0.7826647367698175, "grad_norm": 0.6458133705476774, "learning_rate": 1.3738729025792908e-06, "loss": 0.4279775619506836, "step": 4630 }, { "epoch": 0.7843551573675925, "grad_norm": 0.5664119160533859, "learning_rate": 1.3536222634622704e-06, "loss": 0.43117513656616213, "step": 4640 }, { "epoch": 0.7860455779653676, "grad_norm": 0.42786217028357204, "learning_rate": 1.3334985893542596e-06, "loss": 0.42508134841918943, "step": 4650 }, { "epoch": 0.7877359985631425, "grad_norm": 0.43637907827702244, "learning_rate": 1.3135025809513047e-06, "loss": 0.4202974796295166, "step": 4660 }, { "epoch": 0.7894264191609175, "grad_norm": 0.438569542782335, "learning_rate": 1.293634934504196e-06, "loss": 0.4253946304321289, "step": 4670 }, { "epoch": 0.7911168397586925, "grad_norm": 0.587103245107529, "learning_rate": 1.273896341794229e-06, "loss": 0.4194544792175293, "step": 4680 }, { "epoch": 0.7928072603564674, "grad_norm": 0.4010620435372963, "learning_rate": 1.2542874901091111e-06, "loss": 0.4227456092834473, "step": 4690 }, { "epoch": 0.7944976809542424, "grad_norm": 0.4027753025082721, "learning_rate": 1.234809062219039e-06, "loss": 0.4224235534667969, "step": 4700 }, { "epoch": 0.7961881015520174, "grad_norm": 0.4860032717278702, "learning_rate": 1.2154617363529126e-06, "loss": 0.4228099822998047, "step": 4710 }, { "epoch": 0.7978785221497924, "grad_norm": 0.48458357986452577, "learning_rate": 1.1962461861747305e-06, "loss": 0.4361612319946289, "step": 4720 }, { "epoch": 0.7995689427475674, "grad_norm": 0.5128580678938933, "learning_rate": 1.1771630807601287e-06, "loss": 0.4273702144622803, "step": 4730 }, { "epoch": 0.8012593633453423, "grad_norm": 0.4046622055593574, "learning_rate": 1.1582130845730826e-06, "loss": 0.4226740837097168, "step": 4740 }, { "epoch": 0.8029497839431173, "grad_norm": 0.3997957153545625, "learning_rate": 1.1393968574427744e-06, "loss": 0.42818431854248046, "step": 4750 }, { "epoch": 0.8046402045408924, "grad_norm": 0.4025063941387675, "learning_rate": 1.1207150545406136e-06, "loss": 0.425289249420166, "step": 4760 }, { "epoch": 0.8063306251386673, "grad_norm": 0.4183154464205255, "learning_rate": 1.1021683263574313e-06, "loss": 0.4195976734161377, "step": 4770 }, { "epoch": 0.8080210457364423, "grad_norm": 0.4384474275798556, "learning_rate": 1.0837573186808214e-06, "loss": 0.43142261505126955, "step": 4780 }, { "epoch": 0.8097114663342173, "grad_norm": 0.4557749163813245, "learning_rate": 1.0654826725726608e-06, "loss": 0.4209465980529785, "step": 4790 }, { "epoch": 0.8114018869319922, "grad_norm": 0.5432867266168557, "learning_rate": 1.0473450243467865e-06, "loss": 0.4263154029846191, "step": 4800 }, { "epoch": 0.8130923075297672, "grad_norm": 0.5025593460716719, "learning_rate": 1.0293450055468374e-06, "loss": 0.4319735527038574, "step": 4810 }, { "epoch": 0.8147827281275423, "grad_norm": 0.4755935024526056, "learning_rate": 1.0114832429242705e-06, "loss": 0.4187938690185547, "step": 4820 }, { "epoch": 0.8164731487253172, "grad_norm": 0.4010757390976713, "learning_rate": 9.93760358416529e-07, "loss": 0.4261464595794678, "step": 4830 }, { "epoch": 0.8181635693230922, "grad_norm": 0.7131694816309398, "learning_rate": 9.761769691253931e-07, "loss": 0.4193047046661377, "step": 4840 }, { "epoch": 0.8198539899208672, "grad_norm": 0.5034513530726354, "learning_rate": 9.587336872954906e-07, "loss": 0.4203728199005127, "step": 4850 }, { "epoch": 0.8215444105186421, "grad_norm": 0.42178444814812144, "learning_rate": 9.414311202929771e-07, "loss": 0.42111892700195314, "step": 4860 }, { "epoch": 0.8232348311164172, "grad_norm": 0.5122204909729405, "learning_rate": 9.242698705843961e-07, "loss": 0.423065185546875, "step": 4870 }, { "epoch": 0.8249252517141922, "grad_norm": 0.3989956407391952, "learning_rate": 9.072505357156858e-07, "loss": 0.42530975341796873, "step": 4880 }, { "epoch": 0.8266156723119671, "grad_norm": 0.39938195227376233, "learning_rate": 8.903737082913905e-07, "loss": 0.4214590072631836, "step": 4890 }, { "epoch": 0.8283060929097421, "grad_norm": 0.40185499788640217, "learning_rate": 8.736399759540132e-07, "loss": 0.4245802879333496, "step": 4900 }, { "epoch": 0.8299965135075171, "grad_norm": 0.40431837535545917, "learning_rate": 8.570499213635635e-07, "loss": 0.41890692710876465, "step": 4910 }, { "epoch": 0.831686934105292, "grad_norm": 0.4407892358374833, "learning_rate": 8.406041221772593e-07, "loss": 0.4315225124359131, "step": 4920 }, { "epoch": 0.8333773547030671, "grad_norm": 0.587964877158027, "learning_rate": 8.243031510294225e-07, "loss": 0.4260035514831543, "step": 4930 }, { "epoch": 0.8350677753008421, "grad_norm": 0.3867363003903258, "learning_rate": 8.081475755115381e-07, "loss": 0.4230846881866455, "step": 4940 }, { "epoch": 0.836758195898617, "grad_norm": 0.4125267432837227, "learning_rate": 7.921379581524879e-07, "loss": 0.4201014518737793, "step": 4950 }, { "epoch": 0.838448616496392, "grad_norm": 0.4145768688095069, "learning_rate": 7.762748563989653e-07, "loss": 0.42217350006103516, "step": 4960 }, { "epoch": 0.840139037094167, "grad_norm": 0.3827238095008044, "learning_rate": 7.605588225960631e-07, "loss": 0.4181986808776855, "step": 4970 }, { "epoch": 0.841829457691942, "grad_norm": 0.45519859678121805, "learning_rate": 7.449904039680483e-07, "loss": 0.42473936080932617, "step": 4980 }, { "epoch": 0.843519878289717, "grad_norm": 0.38187166057209, "learning_rate": 7.295701425992984e-07, "loss": 0.42186822891235354, "step": 4990 }, { "epoch": 0.845210298887492, "grad_norm": 0.4100653082076505, "learning_rate": 7.142985754154336e-07, "loss": 0.41666412353515625, "step": 5000 }, { "epoch": 0.8469007194852669, "grad_norm": 0.3990265533675085, "learning_rate": 6.991762341646163e-07, "loss": 0.41996259689331056, "step": 5010 }, { "epoch": 0.8485911400830419, "grad_norm": 0.42655171589794805, "learning_rate": 6.842036453990386e-07, "loss": 0.41867885589599607, "step": 5020 }, { "epoch": 0.850281560680817, "grad_norm": 0.40280454432813595, "learning_rate": 6.69381330456591e-07, "loss": 0.42267436981201173, "step": 5030 }, { "epoch": 0.8519719812785919, "grad_norm": 0.7423046244692998, "learning_rate": 6.547098054427031e-07, "loss": 0.42384800910949705, "step": 5040 }, { "epoch": 0.8536624018763669, "grad_norm": 0.4385504136747162, "learning_rate": 6.401895812123737e-07, "loss": 0.42662858963012695, "step": 5050 }, { "epoch": 0.8553528224741418, "grad_norm": 0.40221301509494206, "learning_rate": 6.25821163352392e-07, "loss": 0.42463250160217286, "step": 5060 }, { "epoch": 0.8570432430719168, "grad_norm": 0.4678190928201736, "learning_rate": 6.116050521637218e-07, "loss": 0.4259337425231934, "step": 5070 }, { "epoch": 0.8587336636696918, "grad_norm": 1.5908098668249737, "learning_rate": 5.975417426440911e-07, "loss": 0.42664356231689454, "step": 5080 }, { "epoch": 0.8604240842674667, "grad_norm": 0.40958701025875693, "learning_rate": 5.836317244707451e-07, "loss": 0.42066402435302735, "step": 5090 }, { "epoch": 0.8621145048652418, "grad_norm": 0.38142342331351625, "learning_rate": 5.698754819834107e-07, "loss": 0.42577228546142576, "step": 5100 }, { "epoch": 0.8638049254630168, "grad_norm": 0.5995765733030113, "learning_rate": 5.562734941674175e-07, "loss": 0.42906174659729, "step": 5110 }, { "epoch": 0.8654953460607917, "grad_norm": 0.4124816632193718, "learning_rate": 5.428262346370305e-07, "loss": 0.42333269119262695, "step": 5120 }, { "epoch": 0.8671857666585667, "grad_norm": 0.43153283983780877, "learning_rate": 5.295341716189522e-07, "loss": 0.42881431579589846, "step": 5130 }, { "epoch": 0.8688761872563417, "grad_norm": 0.9191108566472296, "learning_rate": 5.163977679360221e-07, "loss": 0.41905965805053713, "step": 5140 }, { "epoch": 0.8705666078541167, "grad_norm": 0.4198392449021922, "learning_rate": 5.034174809911042e-07, "loss": 0.427170467376709, "step": 5150 }, { "epoch": 0.8722570284518917, "grad_norm": 0.4444926311135468, "learning_rate": 4.905937627511536e-07, "loss": 0.4227573394775391, "step": 5160 }, { "epoch": 0.8739474490496667, "grad_norm": 0.5606934658286145, "learning_rate": 4.779270597314861e-07, "loss": 0.4257050514221191, "step": 5170 }, { "epoch": 0.8756378696474416, "grad_norm": 0.41569630734941043, "learning_rate": 4.65417812980225e-07, "loss": 0.4265610694885254, "step": 5180 }, { "epoch": 0.8773282902452166, "grad_norm": 0.47494065883948217, "learning_rate": 4.5306645806294904e-07, "loss": 0.41692366600036623, "step": 5190 }, { "epoch": 0.8790187108429917, "grad_norm": 0.45085269223305413, "learning_rate": 4.4087342504752383e-07, "loss": 0.4252819061279297, "step": 5200 }, { "epoch": 0.8807091314407666, "grad_norm": 0.3913069944608277, "learning_rate": 4.288391384891261e-07, "loss": 0.42319507598876954, "step": 5210 }, { "epoch": 0.8823995520385416, "grad_norm": 0.48873707062317545, "learning_rate": 4.169640174154627e-07, "loss": 0.4205298900604248, "step": 5220 }, { "epoch": 0.8840899726363166, "grad_norm": 0.3939589445180343, "learning_rate": 4.052484753121799e-07, "loss": 0.4183074951171875, "step": 5230 }, { "epoch": 0.8857803932340915, "grad_norm": 0.9063181567182339, "learning_rate": 3.936929201084644e-07, "loss": 0.42419729232788084, "step": 5240 }, { "epoch": 0.8874708138318665, "grad_norm": 0.4205501036887155, "learning_rate": 3.822977541628453e-07, "loss": 0.424894905090332, "step": 5250 }, { "epoch": 0.8891612344296416, "grad_norm": 0.43108338551767145, "learning_rate": 3.7106337424917205e-07, "loss": 0.42048206329345705, "step": 5260 }, { "epoch": 0.8908516550274165, "grad_norm": 0.5544953283474947, "learning_rate": 3.599901715428139e-07, "loss": 0.4223438262939453, "step": 5270 }, { "epoch": 0.8925420756251915, "grad_norm": 0.4304365854392619, "learning_rate": 3.4907853160702777e-07, "loss": 0.42336835861206057, "step": 5280 }, { "epoch": 0.8942324962229665, "grad_norm": 0.3906588855234446, "learning_rate": 3.38328834379541e-07, "loss": 0.4154191970825195, "step": 5290 }, { "epoch": 0.8959229168207414, "grad_norm": 0.5031541905142518, "learning_rate": 3.277414541593144e-07, "loss": 0.41904850006103517, "step": 5300 }, { "epoch": 0.8976133374185165, "grad_norm": 0.4530444890179973, "learning_rate": 3.173167595935156e-07, "loss": 0.4230810165405273, "step": 5310 }, { "epoch": 0.8993037580162915, "grad_norm": 0.48713249648075824, "learning_rate": 3.0705511366468264e-07, "loss": 0.4264675617218018, "step": 5320 }, { "epoch": 0.9009941786140664, "grad_norm": 0.5208238749067577, "learning_rate": 2.969568736780809e-07, "loss": 0.4253951072692871, "step": 5330 }, { "epoch": 0.9026845992118414, "grad_norm": 0.4822048723169926, "learning_rate": 2.8702239124926536e-07, "loss": 0.42444214820861814, "step": 5340 }, { "epoch": 0.9043750198096164, "grad_norm": 0.4592549523630122, "learning_rate": 2.7725201229183595e-07, "loss": 0.41964020729064944, "step": 5350 }, { "epoch": 0.9060654404073913, "grad_norm": 0.5273092393546448, "learning_rate": 2.676460770053935e-07, "loss": 0.4201678276062012, "step": 5360 }, { "epoch": 0.9077558610051664, "grad_norm": 0.49079242898504394, "learning_rate": 2.5820491986369655e-07, "loss": 0.4261648654937744, "step": 5370 }, { "epoch": 0.9094462816029414, "grad_norm": 0.39447657056029006, "learning_rate": 2.4892886960300955e-07, "loss": 0.42170066833496095, "step": 5380 }, { "epoch": 0.9111367022007163, "grad_norm": 0.39984798676859423, "learning_rate": 2.3981824921066264e-07, "loss": 0.4172752857208252, "step": 5390 }, { "epoch": 0.9128271227984913, "grad_norm": 0.42983400440300035, "learning_rate": 2.3087337591379877e-07, "loss": 0.42790851593017576, "step": 5400 }, { "epoch": 0.9145175433962662, "grad_norm": 0.5322157941863958, "learning_rate": 2.2209456116833726e-07, "loss": 0.42197275161743164, "step": 5410 }, { "epoch": 0.9162079639940413, "grad_norm": 0.7319643214256284, "learning_rate": 2.1348211064811886e-07, "loss": 0.4245607376098633, "step": 5420 }, { "epoch": 0.9178983845918163, "grad_norm": 0.6154493827068899, "learning_rate": 2.050363242342679e-07, "loss": 0.4241465091705322, "step": 5430 }, { "epoch": 0.9195888051895912, "grad_norm": 0.38677628278359527, "learning_rate": 1.9675749600475137e-07, "loss": 0.42714385986328124, "step": 5440 }, { "epoch": 0.9212792257873662, "grad_norm": 0.4023137059518498, "learning_rate": 1.8864591422413647e-07, "loss": 0.4192478179931641, "step": 5450 }, { "epoch": 0.9229696463851412, "grad_norm": 0.5187932907244228, "learning_rate": 1.8070186133355482e-07, "loss": 0.41727705001831056, "step": 5460 }, { "epoch": 0.9246600669829161, "grad_norm": 0.965073641979697, "learning_rate": 1.7292561394086638e-07, "loss": 0.42743620872497556, "step": 5470 }, { "epoch": 0.9263504875806912, "grad_norm": 0.7609466724404457, "learning_rate": 1.6531744281103268e-07, "loss": 0.42295517921447756, "step": 5480 }, { "epoch": 0.9280409081784662, "grad_norm": 0.5111775114201492, "learning_rate": 1.578776128566828e-07, "loss": 0.4170988082885742, "step": 5490 }, { "epoch": 0.9297313287762411, "grad_norm": 0.4545039552455224, "learning_rate": 1.5060638312889288e-07, "loss": 0.4168592929840088, "step": 5500 }, { "epoch": 0.9314217493740161, "grad_norm": 0.3892891280788642, "learning_rate": 1.4350400680816555e-07, "loss": 0.4170875072479248, "step": 5510 }, { "epoch": 0.9331121699717911, "grad_norm": 0.40500981225510563, "learning_rate": 1.365707311956138e-07, "loss": 0.4208053112030029, "step": 5520 }, { "epoch": 0.934802590569566, "grad_norm": 0.41669067149487427, "learning_rate": 1.29806797704351e-07, "loss": 0.4162314414978027, "step": 5530 }, { "epoch": 0.9364930111673411, "grad_norm": 0.45468975448954785, "learning_rate": 1.2321244185108438e-07, "loss": 0.42525811195373536, "step": 5540 }, { "epoch": 0.9381834317651161, "grad_norm": 0.5324807640892435, "learning_rate": 1.1678789324791385e-07, "loss": 0.4258564949035645, "step": 5550 }, { "epoch": 0.939873852362891, "grad_norm": 0.42891448398762166, "learning_rate": 1.1053337559433774e-07, "loss": 0.42623538970947267, "step": 5560 }, { "epoch": 0.941564272960666, "grad_norm": 0.501340218520871, "learning_rate": 1.0444910666946362e-07, "loss": 0.42596940994262694, "step": 5570 }, { "epoch": 0.943254693558441, "grad_norm": 0.4031637084492024, "learning_rate": 9.853529832442643e-08, "loss": 0.4168665885925293, "step": 5580 }, { "epoch": 0.944945114156216, "grad_norm": 0.41610280627051877, "learning_rate": 9.27921564750095e-08, "loss": 0.4280057907104492, "step": 5590 }, { "epoch": 0.946635534753991, "grad_norm": 0.3941913310019063, "learning_rate": 8.721988109447632e-08, "loss": 0.4269193172454834, "step": 5600 }, { "epoch": 0.948325955351766, "grad_norm": 0.4025191194269793, "learning_rate": 8.181866620660839e-08, "loss": 0.42088003158569337, "step": 5610 }, { "epoch": 0.9500163759495409, "grad_norm": 0.4071300779033532, "learning_rate": 7.658869987894612e-08, "loss": 0.41224422454833987, "step": 5620 }, { "epoch": 0.9517067965473159, "grad_norm": 0.4950344979035128, "learning_rate": 7.153016421624525e-08, "loss": 0.4289411544799805, "step": 5630 }, { "epoch": 0.953397217145091, "grad_norm": 0.42576170549234565, "learning_rate": 6.66432353541302e-08, "loss": 0.4195157527923584, "step": 5640 }, { "epoch": 0.9550876377428659, "grad_norm": 0.41466397846386327, "learning_rate": 6.192808345296786e-08, "loss": 0.42493529319763185, "step": 5650 }, { "epoch": 0.9567780583406409, "grad_norm": 0.3861387762178082, "learning_rate": 5.7384872691936264e-08, "loss": 0.42592926025390626, "step": 5660 }, { "epoch": 0.9584684789384159, "grad_norm": 0.5844447829296148, "learning_rate": 5.301376126331248e-08, "loss": 0.42506136894226076, "step": 5670 }, { "epoch": 0.9601588995361908, "grad_norm": 0.4583100629468275, "learning_rate": 4.8814901366961985e-08, "loss": 0.4225175857543945, "step": 5680 }, { "epoch": 0.9618493201339658, "grad_norm": 0.4511824979451254, "learning_rate": 4.478843920504017e-08, "loss": 0.424249267578125, "step": 5690 }, { "epoch": 0.9635397407317409, "grad_norm": 0.36704192082082593, "learning_rate": 4.093451497690193e-08, "loss": 0.42214536666870117, "step": 5700 }, { "epoch": 0.9652301613295158, "grad_norm": 0.46415491364375805, "learning_rate": 3.725326287421838e-08, "loss": 0.42182073593139646, "step": 5710 }, { "epoch": 0.9669205819272908, "grad_norm": 0.43678313014125836, "learning_rate": 3.374481107630612e-08, "loss": 0.4228252410888672, "step": 5720 }, { "epoch": 0.9686110025250658, "grad_norm": 0.44999534894541443, "learning_rate": 3.040928174566415e-08, "loss": 0.4173929214477539, "step": 5730 }, { "epoch": 0.9703014231228407, "grad_norm": 0.5166241536078265, "learning_rate": 2.7246791023717854e-08, "loss": 0.42366867065429686, "step": 5740 }, { "epoch": 0.9719918437206158, "grad_norm": 0.49082550972111405, "learning_rate": 2.42574490267794e-08, "loss": 0.42436580657958983, "step": 5750 }, { "epoch": 0.9736822643183907, "grad_norm": 0.41566861360225094, "learning_rate": 2.1441359842206966e-08, "loss": 0.41978960037231444, "step": 5760 }, { "epoch": 0.9753726849161657, "grad_norm": 0.4467142353931868, "learning_rate": 1.8798621524788173e-08, "loss": 0.4175607681274414, "step": 5770 }, { "epoch": 0.9770631055139407, "grad_norm": 0.41925536664649327, "learning_rate": 1.6329326093320053e-08, "loss": 0.4178473472595215, "step": 5780 }, { "epoch": 0.9787535261117156, "grad_norm": 0.4142968221515882, "learning_rate": 1.4033559527407703e-08, "loss": 0.41806626319885254, "step": 5790 }, { "epoch": 0.9804439467094906, "grad_norm": 0.48270508356543623, "learning_rate": 1.1911401764468922e-08, "loss": 0.41689310073852537, "step": 5800 }, { "epoch": 0.9821343673072657, "grad_norm": 0.4223736800888985, "learning_rate": 9.96292669695198e-09, "loss": 0.418758487701416, "step": 5810 }, { "epoch": 0.9838247879050406, "grad_norm": 0.835433326472655, "learning_rate": 8.188202169763793e-09, "loss": 0.4206347942352295, "step": 5820 }, { "epoch": 0.9855152085028156, "grad_norm": 0.43870192086162735, "learning_rate": 6.5872899779045875e-09, "loss": 0.4244359016418457, "step": 5830 }, { "epoch": 0.9872056291005906, "grad_norm": 0.40318943557648546, "learning_rate": 5.160245864319069e-09, "loss": 0.41902694702148435, "step": 5840 }, { "epoch": 0.9888960496983655, "grad_norm": 0.44780330264589363, "learning_rate": 3.907119517954083e-09, "loss": 0.4193845748901367, "step": 5850 }, { "epoch": 0.9905864702961406, "grad_norm": 0.46299907040473565, "learning_rate": 2.827954572027225e-09, "loss": 0.41649885177612306, "step": 5860 }, { "epoch": 0.9922768908939156, "grad_norm": 0.7022500478758524, "learning_rate": 1.922788602511938e-09, "loss": 0.4247690200805664, "step": 5870 }, { "epoch": 0.9939673114916905, "grad_norm": 0.4194367425429352, "learning_rate": 1.1916531268230114e-09, "loss": 0.4229640007019043, "step": 5880 }, { "epoch": 0.9956577320894655, "grad_norm": 0.40623075402193076, "learning_rate": 6.345736027257853e-10, "loss": 0.42009563446044923, "step": 5890 }, { "epoch": 0.9973481526872405, "grad_norm": 0.47810385384154763, "learning_rate": 2.51569427442977e-10, "loss": 0.4222869396209717, "step": 5900 }, { "epoch": 0.9990385732850154, "grad_norm": 0.40176422604323225, "learning_rate": 4.2653936984660136e-11, "loss": 0.42395753860473634, "step": 5910 }, { "epoch": 1.0, "step": 5916, "total_flos": 1.962434144184251e+19, "train_loss": 0.44837146779828335, "train_runtime": 384303.2502, "train_samples_per_second": 1.97, "train_steps_per_second": 0.015 } ], "logging_steps": 10, "max_steps": 5916, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1479, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.962434144184251e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }