{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4971, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006035913686434284, "grad_norm": 13.465597639885102, "learning_rate": 1.8072289156626505e-07, "loss": 0.8887, "step": 10 }, { "epoch": 0.012071827372868568, "grad_norm": 12.414937705631793, "learning_rate": 3.8152610441767073e-07, "loss": 0.8871, "step": 20 }, { "epoch": 0.01810774105930285, "grad_norm": 5.604563580675332, "learning_rate": 5.823293172690764e-07, "loss": 0.7943, "step": 30 }, { "epoch": 0.024143654745737136, "grad_norm": 3.082127771205323, "learning_rate": 7.83132530120482e-07, "loss": 0.6989, "step": 40 }, { "epoch": 0.03017956843217142, "grad_norm": 1.9253363533227204, "learning_rate": 9.839357429718876e-07, "loss": 0.6283, "step": 50 }, { "epoch": 0.0362154821186057, "grad_norm": 1.2352792533570607, "learning_rate": 1.1847389558232934e-06, "loss": 0.5916, "step": 60 }, { "epoch": 0.04225139580503999, "grad_norm": 0.8094703225757798, "learning_rate": 1.385542168674699e-06, "loss": 0.5623, "step": 70 }, { "epoch": 0.04828730949147427, "grad_norm": 0.7924082712954621, "learning_rate": 1.5863453815261046e-06, "loss": 0.536, "step": 80 }, { "epoch": 0.05432322317790855, "grad_norm": 0.7765422489934142, "learning_rate": 1.7871485943775102e-06, "loss": 0.5246, "step": 90 }, { "epoch": 0.06035913686434284, "grad_norm": 1.2024335532490196, "learning_rate": 1.987951807228916e-06, "loss": 0.5112, "step": 100 }, { "epoch": 0.06639505055077713, "grad_norm": 0.7361271470838762, "learning_rate": 2.1887550200803216e-06, "loss": 0.4973, "step": 110 }, { "epoch": 0.0724309642372114, "grad_norm": 0.9636947285799289, "learning_rate": 2.389558232931727e-06, "loss": 0.4926, "step": 120 }, { "epoch": 0.07846687792364569, "grad_norm": 0.834221678860187, "learning_rate": 2.590361445783133e-06, "loss": 0.4868, "step": 130 }, { "epoch": 0.08450279161007998, "grad_norm": 0.721459089158014, "learning_rate": 2.791164658634538e-06, "loss": 0.4836, "step": 140 }, { "epoch": 0.09053870529651425, "grad_norm": 0.7680662169711512, "learning_rate": 2.991967871485944e-06, "loss": 0.4759, "step": 150 }, { "epoch": 0.09657461898294854, "grad_norm": 0.8074470103289187, "learning_rate": 3.1927710843373494e-06, "loss": 0.4753, "step": 160 }, { "epoch": 0.10261053266938283, "grad_norm": 0.7821569953929599, "learning_rate": 3.393574297188755e-06, "loss": 0.4689, "step": 170 }, { "epoch": 0.1086464463558171, "grad_norm": 0.8046561770221946, "learning_rate": 3.5943775100401606e-06, "loss": 0.4678, "step": 180 }, { "epoch": 0.1146823600422514, "grad_norm": 0.8787311236716008, "learning_rate": 3.7951807228915664e-06, "loss": 0.463, "step": 190 }, { "epoch": 0.12071827372868568, "grad_norm": 0.8810490177348705, "learning_rate": 3.995983935742972e-06, "loss": 0.4601, "step": 200 }, { "epoch": 0.12675418741511996, "grad_norm": 0.8889957032229883, "learning_rate": 4.196787148594378e-06, "loss": 0.4589, "step": 210 }, { "epoch": 0.13279010110155426, "grad_norm": 1.0047774511651133, "learning_rate": 4.397590361445783e-06, "loss": 0.4533, "step": 220 }, { "epoch": 0.13882601478798853, "grad_norm": 0.8382898002966074, "learning_rate": 4.598393574297189e-06, "loss": 0.4541, "step": 230 }, { "epoch": 0.1448619284744228, "grad_norm": 0.8867952107395614, "learning_rate": 4.799196787148594e-06, "loss": 0.4488, "step": 240 }, { "epoch": 0.1508978421608571, "grad_norm": 0.8306941422038123, "learning_rate": 5e-06, "loss": 0.4521, "step": 250 }, { "epoch": 0.15693375584729138, "grad_norm": 0.8874942497893604, "learning_rate": 5.200803212851407e-06, "loss": 0.4505, "step": 260 }, { "epoch": 0.16296966953372566, "grad_norm": 0.8911658824764517, "learning_rate": 5.401606425702812e-06, "loss": 0.446, "step": 270 }, { "epoch": 0.16900558322015996, "grad_norm": 0.8852855388263275, "learning_rate": 5.602409638554217e-06, "loss": 0.4452, "step": 280 }, { "epoch": 0.17504149690659423, "grad_norm": 0.9910318655855725, "learning_rate": 5.803212851405623e-06, "loss": 0.4413, "step": 290 }, { "epoch": 0.1810774105930285, "grad_norm": 1.0110708733608424, "learning_rate": 6.004016064257029e-06, "loss": 0.4397, "step": 300 }, { "epoch": 0.1871133242794628, "grad_norm": 0.9003864963841174, "learning_rate": 6.2048192771084344e-06, "loss": 0.4414, "step": 310 }, { "epoch": 0.19314923796589709, "grad_norm": 0.906399226331659, "learning_rate": 6.40562248995984e-06, "loss": 0.4372, "step": 320 }, { "epoch": 0.19918515165233136, "grad_norm": 1.071695971731785, "learning_rate": 6.606425702811245e-06, "loss": 0.4381, "step": 330 }, { "epoch": 0.20522106533876566, "grad_norm": 0.9761062070856111, "learning_rate": 6.8072289156626514e-06, "loss": 0.4343, "step": 340 }, { "epoch": 0.21125697902519994, "grad_norm": 0.9721838815300707, "learning_rate": 7.008032128514058e-06, "loss": 0.4373, "step": 350 }, { "epoch": 0.2172928927116342, "grad_norm": 0.8909414468062403, "learning_rate": 7.208835341365462e-06, "loss": 0.4336, "step": 360 }, { "epoch": 0.2233288063980685, "grad_norm": 0.964718436271309, "learning_rate": 7.4096385542168684e-06, "loss": 0.4348, "step": 370 }, { "epoch": 0.2293647200845028, "grad_norm": 1.025409695885071, "learning_rate": 7.610441767068274e-06, "loss": 0.4326, "step": 380 }, { "epoch": 0.23540063377093706, "grad_norm": 0.9270275542948012, "learning_rate": 7.81124497991968e-06, "loss": 0.4324, "step": 390 }, { "epoch": 0.24143654745737136, "grad_norm": 0.9390965093376761, "learning_rate": 8.012048192771085e-06, "loss": 0.4302, "step": 400 }, { "epoch": 0.24747246114380564, "grad_norm": 0.7707812298350031, "learning_rate": 8.21285140562249e-06, "loss": 0.4276, "step": 410 }, { "epoch": 0.2535083748302399, "grad_norm": 0.8215921682895242, "learning_rate": 8.413654618473896e-06, "loss": 0.4274, "step": 420 }, { "epoch": 0.2595442885166742, "grad_norm": 1.0290878620245738, "learning_rate": 8.614457831325302e-06, "loss": 0.427, "step": 430 }, { "epoch": 0.2655802022031085, "grad_norm": 0.9009095092288704, "learning_rate": 8.815261044176707e-06, "loss": 0.4232, "step": 440 }, { "epoch": 0.27161611588954276, "grad_norm": 0.9646916353387767, "learning_rate": 9.016064257028112e-06, "loss": 0.4235, "step": 450 }, { "epoch": 0.27765202957597707, "grad_norm": 0.8009669905789347, "learning_rate": 9.21686746987952e-06, "loss": 0.4248, "step": 460 }, { "epoch": 0.28368794326241137, "grad_norm": 2.57971922495045, "learning_rate": 9.417670682730925e-06, "loss": 0.4246, "step": 470 }, { "epoch": 0.2897238569488456, "grad_norm": 0.9225235875464007, "learning_rate": 9.61847389558233e-06, "loss": 0.4256, "step": 480 }, { "epoch": 0.2957597706352799, "grad_norm": 0.8937790567235143, "learning_rate": 9.819277108433736e-06, "loss": 0.4232, "step": 490 }, { "epoch": 0.3017956843217142, "grad_norm": 0.992661961364272, "learning_rate": 9.99999876677608e-06, "loss": 0.4236, "step": 500 }, { "epoch": 0.30783159800814847, "grad_norm": 1.1203639087859305, "learning_rate": 9.999850780641762e-06, "loss": 0.423, "step": 510 }, { "epoch": 0.31386751169458277, "grad_norm": 0.9504675074156581, "learning_rate": 9.999456158087994e-06, "loss": 0.4255, "step": 520 }, { "epoch": 0.31990342538101707, "grad_norm": 1.1526705071263037, "learning_rate": 9.998814918581017e-06, "loss": 0.4236, "step": 530 }, { "epoch": 0.3259393390674513, "grad_norm": 0.9400926329756719, "learning_rate": 9.99792709375238e-06, "loss": 0.4193, "step": 540 }, { "epoch": 0.3319752527538856, "grad_norm": 0.761979605644821, "learning_rate": 9.996792727397374e-06, "loss": 0.4178, "step": 550 }, { "epoch": 0.3380111664403199, "grad_norm": 0.7761858463434534, "learning_rate": 9.995411875472882e-06, "loss": 0.4172, "step": 560 }, { "epoch": 0.34404708012675417, "grad_norm": 0.8353265789234773, "learning_rate": 9.993784606094612e-06, "loss": 0.417, "step": 570 }, { "epoch": 0.35008299381318847, "grad_norm": 0.7921534241896437, "learning_rate": 9.991910999533739e-06, "loss": 0.4164, "step": 580 }, { "epoch": 0.35611890749962277, "grad_norm": 0.8368518529458858, "learning_rate": 9.98979114821294e-06, "loss": 0.4212, "step": 590 }, { "epoch": 0.362154821186057, "grad_norm": 0.8526689259731893, "learning_rate": 9.98742515670185e-06, "loss": 0.413, "step": 600 }, { "epoch": 0.3681907348724913, "grad_norm": 0.8691355689423315, "learning_rate": 9.98481314171188e-06, "loss": 0.4147, "step": 610 }, { "epoch": 0.3742266485589256, "grad_norm": 0.7413766525933784, "learning_rate": 9.981955232090484e-06, "loss": 0.4202, "step": 620 }, { "epoch": 0.38026256224535987, "grad_norm": 0.862826800304683, "learning_rate": 9.978851568814789e-06, "loss": 0.4144, "step": 630 }, { "epoch": 0.38629847593179417, "grad_norm": 0.852995884285724, "learning_rate": 9.975502304984643e-06, "loss": 0.4159, "step": 640 }, { "epoch": 0.3923343896182285, "grad_norm": 0.8190268708459463, "learning_rate": 9.971907605815065e-06, "loss": 0.4133, "step": 650 }, { "epoch": 0.3983703033046627, "grad_norm": 0.7826738241592833, "learning_rate": 9.968067648628092e-06, "loss": 0.417, "step": 660 }, { "epoch": 0.404406216991097, "grad_norm": 0.8234056482304477, "learning_rate": 9.963982622844037e-06, "loss": 0.4151, "step": 670 }, { "epoch": 0.4104421306775313, "grad_norm": 0.8389822495874198, "learning_rate": 9.959652729972138e-06, "loss": 0.4142, "step": 680 }, { "epoch": 0.41647804436396557, "grad_norm": 0.7530220222404655, "learning_rate": 9.955078183600626e-06, "loss": 0.4135, "step": 690 }, { "epoch": 0.4225139580503999, "grad_norm": 0.8094044727188283, "learning_rate": 9.950259209386182e-06, "loss": 0.4076, "step": 700 }, { "epoch": 0.4285498717368342, "grad_norm": 0.7704390882655109, "learning_rate": 9.945196045042812e-06, "loss": 0.41, "step": 710 }, { "epoch": 0.4345857854232684, "grad_norm": 0.9003987196323937, "learning_rate": 9.93988894033011e-06, "loss": 0.4114, "step": 720 }, { "epoch": 0.4406216991097027, "grad_norm": 0.8729571471009108, "learning_rate": 9.934338157040953e-06, "loss": 0.4128, "step": 730 }, { "epoch": 0.446657612796137, "grad_norm": 0.7801434856688376, "learning_rate": 9.928543968988576e-06, "loss": 0.4103, "step": 740 }, { "epoch": 0.4526935264825713, "grad_norm": 0.9417689284475159, "learning_rate": 9.922506661993067e-06, "loss": 0.4086, "step": 750 }, { "epoch": 0.4587294401690056, "grad_norm": 0.8877368018323296, "learning_rate": 9.91622653386727e-06, "loss": 0.4139, "step": 760 }, { "epoch": 0.4647653538554399, "grad_norm": 0.7960343939884429, "learning_rate": 9.909703894402093e-06, "loss": 0.4072, "step": 770 }, { "epoch": 0.4708012675418741, "grad_norm": 0.7142525800658928, "learning_rate": 9.90293906535123e-06, "loss": 0.4069, "step": 780 }, { "epoch": 0.4768371812283084, "grad_norm": 0.8168998091378754, "learning_rate": 9.895932380415277e-06, "loss": 0.4053, "step": 790 }, { "epoch": 0.48287309491474273, "grad_norm": 0.7851582099155968, "learning_rate": 9.888684185225291e-06, "loss": 0.4096, "step": 800 }, { "epoch": 0.48890900860117703, "grad_norm": 0.7313895363802666, "learning_rate": 9.881194837325722e-06, "loss": 0.4035, "step": 810 }, { "epoch": 0.4949449222876113, "grad_norm": 0.801599057157289, "learning_rate": 9.873464706156785e-06, "loss": 0.4082, "step": 820 }, { "epoch": 0.5009808359740455, "grad_norm": 0.7959824627607599, "learning_rate": 9.865494173036238e-06, "loss": 0.4086, "step": 830 }, { "epoch": 0.5070167496604798, "grad_norm": 0.7643194639900054, "learning_rate": 9.857283631140563e-06, "loss": 0.4097, "step": 840 }, { "epoch": 0.5130526633469141, "grad_norm": 0.8141162481887632, "learning_rate": 9.848833485485577e-06, "loss": 0.4068, "step": 850 }, { "epoch": 0.5190885770333484, "grad_norm": 0.7263606575446551, "learning_rate": 9.840144152906455e-06, "loss": 0.4052, "step": 860 }, { "epoch": 0.5251244907197827, "grad_norm": 0.7326820835121685, "learning_rate": 9.831216062037163e-06, "loss": 0.403, "step": 870 }, { "epoch": 0.531160404406217, "grad_norm": 0.7722145618849807, "learning_rate": 9.822049653289318e-06, "loss": 0.4041, "step": 880 }, { "epoch": 0.5371963180926512, "grad_norm": 0.7035970302521439, "learning_rate": 9.81264537883046e-06, "loss": 0.401, "step": 890 }, { "epoch": 0.5432322317790855, "grad_norm": 0.6580207236042055, "learning_rate": 9.803003702561753e-06, "loss": 0.4057, "step": 900 }, { "epoch": 0.5492681454655198, "grad_norm": 0.6960070468306416, "learning_rate": 9.79312510009509e-06, "loss": 0.4103, "step": 910 }, { "epoch": 0.5553040591519541, "grad_norm": 0.7088936549744779, "learning_rate": 9.783010058729644e-06, "loss": 0.4024, "step": 920 }, { "epoch": 0.5613399728383884, "grad_norm": 0.8173990374915286, "learning_rate": 9.772659077427824e-06, "loss": 0.3983, "step": 930 }, { "epoch": 0.5673758865248227, "grad_norm": 0.7248588219467303, "learning_rate": 9.762072666790658e-06, "loss": 0.4042, "step": 940 }, { "epoch": 0.5734118002112569, "grad_norm": 0.6953286894486166, "learning_rate": 9.751251349032615e-06, "loss": 0.4052, "step": 950 }, { "epoch": 0.5794477138976912, "grad_norm": 0.6805775618542874, "learning_rate": 9.74019565795584e-06, "loss": 0.4028, "step": 960 }, { "epoch": 0.5854836275841255, "grad_norm": 0.7073250522342893, "learning_rate": 9.728906138923823e-06, "loss": 0.4031, "step": 970 }, { "epoch": 0.5915195412705598, "grad_norm": 0.8161486510568995, "learning_rate": 9.71738334883449e-06, "loss": 0.4012, "step": 980 }, { "epoch": 0.5975554549569941, "grad_norm": 0.7478470587664012, "learning_rate": 9.705627856092743e-06, "loss": 0.4035, "step": 990 }, { "epoch": 0.6035913686434284, "grad_norm": 1.2181648223419725, "learning_rate": 9.69364024058242e-06, "loss": 0.3994, "step": 1000 }, { "epoch": 0.6096272823298626, "grad_norm": 0.724496170506016, "learning_rate": 9.681421093637677e-06, "loss": 0.4003, "step": 1010 }, { "epoch": 0.6156631960162969, "grad_norm": 0.7245373569956688, "learning_rate": 9.668971018013835e-06, "loss": 0.3993, "step": 1020 }, { "epoch": 0.6216991097027312, "grad_norm": 1.3707555561464966, "learning_rate": 9.656290627857638e-06, "loss": 0.4031, "step": 1030 }, { "epoch": 0.6277350233891655, "grad_norm": 0.8617205371794142, "learning_rate": 9.643380548676957e-06, "loss": 0.3989, "step": 1040 }, { "epoch": 0.6337709370755998, "grad_norm": 0.7218421707442351, "learning_rate": 9.63024141730994e-06, "loss": 0.4009, "step": 1050 }, { "epoch": 0.6398068507620341, "grad_norm": 0.7919863849580143, "learning_rate": 9.616873881893593e-06, "loss": 0.402, "step": 1060 }, { "epoch": 0.6458427644484683, "grad_norm": 0.7643496416415103, "learning_rate": 9.603278601831806e-06, "loss": 0.3966, "step": 1070 }, { "epoch": 0.6518786781349026, "grad_norm": 0.8387350986976135, "learning_rate": 9.58945624776284e-06, "loss": 0.3974, "step": 1080 }, { "epoch": 0.6579145918213369, "grad_norm": 0.7195707742464319, "learning_rate": 9.575407501526218e-06, "loss": 0.4033, "step": 1090 }, { "epoch": 0.6639505055077712, "grad_norm": 0.8948583587192116, "learning_rate": 9.561133056129122e-06, "loss": 0.4005, "step": 1100 }, { "epoch": 0.6699864191942055, "grad_norm": 0.7784558611785358, "learning_rate": 9.546633615712184e-06, "loss": 0.3969, "step": 1110 }, { "epoch": 0.6760223328806398, "grad_norm": 0.7279188084081983, "learning_rate": 9.531909895514766e-06, "loss": 0.3968, "step": 1120 }, { "epoch": 0.6820582465670741, "grad_norm": 0.7707824454002812, "learning_rate": 9.516962621839667e-06, "loss": 0.3941, "step": 1130 }, { "epoch": 0.6880941602535083, "grad_norm": 0.7559246242676043, "learning_rate": 9.501792532017304e-06, "loss": 0.3935, "step": 1140 }, { "epoch": 0.6941300739399426, "grad_norm": 0.7670492895949397, "learning_rate": 9.48640037436934e-06, "loss": 0.3962, "step": 1150 }, { "epoch": 0.7001659876263769, "grad_norm": 0.7574175499302432, "learning_rate": 9.470786908171761e-06, "loss": 0.396, "step": 1160 }, { "epoch": 0.7062019013128112, "grad_norm": 1.1364368407573255, "learning_rate": 9.454952903617434e-06, "loss": 0.3987, "step": 1170 }, { "epoch": 0.7122378149992455, "grad_norm": 0.6929517509246322, "learning_rate": 9.438899141778105e-06, "loss": 0.3959, "step": 1180 }, { "epoch": 0.7182737286856798, "grad_norm": 0.7239918001848392, "learning_rate": 9.42262641456588e-06, "loss": 0.3961, "step": 1190 }, { "epoch": 0.724309642372114, "grad_norm": 0.7351627240649914, "learning_rate": 9.406135524694146e-06, "loss": 0.3946, "step": 1200 }, { "epoch": 0.7303455560585483, "grad_norm": 0.7178193311197739, "learning_rate": 9.389427285637986e-06, "loss": 0.3934, "step": 1210 }, { "epoch": 0.7363814697449826, "grad_norm": 0.7197436378060236, "learning_rate": 9.372502521594052e-06, "loss": 0.3951, "step": 1220 }, { "epoch": 0.7424173834314169, "grad_norm": 0.7020942866993558, "learning_rate": 9.355362067439899e-06, "loss": 0.3953, "step": 1230 }, { "epoch": 0.7484532971178512, "grad_norm": 0.6493652144119091, "learning_rate": 9.338006768692807e-06, "loss": 0.3976, "step": 1240 }, { "epoch": 0.7544892108042855, "grad_norm": 0.7452091082245685, "learning_rate": 9.320437481468077e-06, "loss": 0.3947, "step": 1250 }, { "epoch": 0.7605251244907197, "grad_norm": 0.7211982596336295, "learning_rate": 9.302655072436789e-06, "loss": 0.3978, "step": 1260 }, { "epoch": 0.766561038177154, "grad_norm": 0.8069527677411222, "learning_rate": 9.284660418783064e-06, "loss": 0.3961, "step": 1270 }, { "epoch": 0.7725969518635883, "grad_norm": 0.6964974366663241, "learning_rate": 9.266454408160779e-06, "loss": 0.395, "step": 1280 }, { "epoch": 0.7786328655500226, "grad_norm": 0.6951835215600591, "learning_rate": 9.248037938649792e-06, "loss": 0.3918, "step": 1290 }, { "epoch": 0.784668779236457, "grad_norm": 0.7011033108204148, "learning_rate": 9.229411918711637e-06, "loss": 0.3911, "step": 1300 }, { "epoch": 0.7907046929228913, "grad_norm": 0.6699999752789259, "learning_rate": 9.210577267144703e-06, "loss": 0.3917, "step": 1310 }, { "epoch": 0.7967406066093254, "grad_norm": 0.7952469588442095, "learning_rate": 9.191534913038926e-06, "loss": 0.393, "step": 1320 }, { "epoch": 0.8027765202957597, "grad_norm": 0.7362949625214187, "learning_rate": 9.172285795729945e-06, "loss": 0.3916, "step": 1330 }, { "epoch": 0.808812433982194, "grad_norm": 0.777349182077021, "learning_rate": 9.152830864752773e-06, "loss": 0.396, "step": 1340 }, { "epoch": 0.8148483476686283, "grad_norm": 0.6858011231159463, "learning_rate": 9.133171079794952e-06, "loss": 0.3949, "step": 1350 }, { "epoch": 0.8208842613550626, "grad_norm": 0.8252893789848457, "learning_rate": 9.113307410649222e-06, "loss": 0.3951, "step": 1360 }, { "epoch": 0.826920175041497, "grad_norm": 0.742614174317752, "learning_rate": 9.093240837165668e-06, "loss": 0.3912, "step": 1370 }, { "epoch": 0.8329560887279311, "grad_norm": 0.6712408370389595, "learning_rate": 9.072972349203401e-06, "loss": 0.3938, "step": 1380 }, { "epoch": 0.8389920024143654, "grad_norm": 0.7390425813359819, "learning_rate": 9.052502946581718e-06, "loss": 0.3902, "step": 1390 }, { "epoch": 0.8450279161007997, "grad_norm": 0.9031901060003036, "learning_rate": 9.031833639030789e-06, "loss": 0.39, "step": 1400 }, { "epoch": 0.851063829787234, "grad_norm": 0.8073830235615219, "learning_rate": 9.010965446141842e-06, "loss": 0.3907, "step": 1410 }, { "epoch": 0.8570997434736684, "grad_norm": 0.7197468777451328, "learning_rate": 8.989899397316875e-06, "loss": 0.3933, "step": 1420 }, { "epoch": 0.8631356571601027, "grad_norm": 0.7874409375571629, "learning_rate": 8.96863653171787e-06, "loss": 0.3941, "step": 1430 }, { "epoch": 0.8691715708465368, "grad_norm": 0.7047790860975574, "learning_rate": 8.947177898215538e-06, "loss": 0.3918, "step": 1440 }, { "epoch": 0.8752074845329711, "grad_norm": 0.6732410856766448, "learning_rate": 8.925524555337575e-06, "loss": 0.3948, "step": 1450 }, { "epoch": 0.8812433982194054, "grad_norm": 0.6379130166882847, "learning_rate": 8.90367757121645e-06, "loss": 0.392, "step": 1460 }, { "epoch": 0.8872793119058398, "grad_norm": 0.6453169279070088, "learning_rate": 8.881638023536715e-06, "loss": 0.3902, "step": 1470 }, { "epoch": 0.893315225592274, "grad_norm": 0.8925532684482897, "learning_rate": 8.859406999481839e-06, "loss": 0.3897, "step": 1480 }, { "epoch": 0.8993511392787084, "grad_norm": 0.7321151042406583, "learning_rate": 8.836985595680585e-06, "loss": 0.3903, "step": 1490 }, { "epoch": 0.9053870529651425, "grad_norm": 0.717542202485072, "learning_rate": 8.81437491815291e-06, "loss": 0.3907, "step": 1500 }, { "epoch": 0.9114229666515768, "grad_norm": 0.6899069830042462, "learning_rate": 8.791576082255414e-06, "loss": 0.3914, "step": 1510 }, { "epoch": 0.9174588803380112, "grad_norm": 0.7416902913208727, "learning_rate": 8.768590212626305e-06, "loss": 0.3914, "step": 1520 }, { "epoch": 0.9234947940244455, "grad_norm": 0.648187852127454, "learning_rate": 8.745418443129944e-06, "loss": 0.3878, "step": 1530 }, { "epoch": 0.9295307077108798, "grad_norm": 0.6971446829374528, "learning_rate": 8.722061916800892e-06, "loss": 0.3889, "step": 1540 }, { "epoch": 0.9355666213973141, "grad_norm": 0.6897656341763103, "learning_rate": 8.698521785787543e-06, "loss": 0.3916, "step": 1550 }, { "epoch": 0.9416025350837482, "grad_norm": 0.6707821534631215, "learning_rate": 8.674799211295272e-06, "loss": 0.3872, "step": 1560 }, { "epoch": 0.9476384487701826, "grad_norm": 0.7047440310341709, "learning_rate": 8.650895363529172e-06, "loss": 0.3893, "step": 1570 }, { "epoch": 0.9536743624566169, "grad_norm": 0.7111300925227007, "learning_rate": 8.626811421636318e-06, "loss": 0.3899, "step": 1580 }, { "epoch": 0.9597102761430512, "grad_norm": 0.742242466940292, "learning_rate": 8.602548573647603e-06, "loss": 0.3933, "step": 1590 }, { "epoch": 0.9657461898294855, "grad_norm": 0.6405514647772552, "learning_rate": 8.578108016419138e-06, "loss": 0.3886, "step": 1600 }, { "epoch": 0.9717821035159198, "grad_norm": 0.6969067995610034, "learning_rate": 8.553490955573207e-06, "loss": 0.3875, "step": 1610 }, { "epoch": 0.9778180172023541, "grad_norm": 0.6404080311189763, "learning_rate": 8.528698605438801e-06, "loss": 0.3915, "step": 1620 }, { "epoch": 0.9838539308887883, "grad_norm": 0.689314089106684, "learning_rate": 8.50373218899171e-06, "loss": 0.3897, "step": 1630 }, { "epoch": 0.9898898445752226, "grad_norm": 0.6238451440610306, "learning_rate": 8.478592937794202e-06, "loss": 0.3865, "step": 1640 }, { "epoch": 0.9959257582616569, "grad_norm": 0.6246538104726604, "learning_rate": 8.453282091934262e-06, "loss": 0.3891, "step": 1650 }, { "epoch": 1.0018107741059303, "grad_norm": 0.6650133535244673, "learning_rate": 8.427800899964438e-06, "loss": 0.3775, "step": 1660 }, { "epoch": 1.0078466877923646, "grad_norm": 0.7340465665361768, "learning_rate": 8.402150618840229e-06, "loss": 0.3658, "step": 1670 }, { "epoch": 1.013882601478799, "grad_norm": 0.8803678131362109, "learning_rate": 8.376332513858091e-06, "loss": 0.3643, "step": 1680 }, { "epoch": 1.0199185151652332, "grad_norm": 0.6784266807756097, "learning_rate": 8.350347858593035e-06, "loss": 0.3632, "step": 1690 }, { "epoch": 1.0259544288516673, "grad_norm": 0.6757297253946429, "learning_rate": 8.324197934835775e-06, "loss": 0.3611, "step": 1700 }, { "epoch": 1.0319903425381016, "grad_norm": 0.6937615226816463, "learning_rate": 8.297884032529525e-06, "loss": 0.3641, "step": 1710 }, { "epoch": 1.038026256224536, "grad_norm": 0.6656265896882699, "learning_rate": 8.271407449706347e-06, "loss": 0.3634, "step": 1720 }, { "epoch": 1.0440621699109702, "grad_norm": 0.6758693000716391, "learning_rate": 8.244769492423144e-06, "loss": 0.3651, "step": 1730 }, { "epoch": 1.0500980835974045, "grad_norm": 0.7271602756269683, "learning_rate": 8.217971474697205e-06, "loss": 0.3655, "step": 1740 }, { "epoch": 1.0561339972838388, "grad_norm": 0.7262048623607191, "learning_rate": 8.191014718441413e-06, "loss": 0.3646, "step": 1750 }, { "epoch": 1.0621699109702731, "grad_norm": 0.7594858496478063, "learning_rate": 8.163900553399022e-06, "loss": 0.3683, "step": 1760 }, { "epoch": 1.0682058246567074, "grad_norm": 0.6834326812737692, "learning_rate": 8.13663031707806e-06, "loss": 0.3657, "step": 1770 }, { "epoch": 1.0742417383431417, "grad_norm": 0.829231127715137, "learning_rate": 8.109205354685367e-06, "loss": 0.3657, "step": 1780 }, { "epoch": 1.080277652029576, "grad_norm": 0.7172584884654448, "learning_rate": 8.081627019060223e-06, "loss": 0.3612, "step": 1790 }, { "epoch": 1.0863135657160103, "grad_norm": 0.700123283944604, "learning_rate": 8.053896670607616e-06, "loss": 0.3669, "step": 1800 }, { "epoch": 1.0923494794024446, "grad_norm": 0.6802763184360072, "learning_rate": 8.026015677231137e-06, "loss": 0.36, "step": 1810 }, { "epoch": 1.0983853930888787, "grad_norm": 0.6976972839342949, "learning_rate": 7.997985414265513e-06, "loss": 0.3645, "step": 1820 }, { "epoch": 1.104421306775313, "grad_norm": 0.6892045690564895, "learning_rate": 7.969807264408745e-06, "loss": 0.3664, "step": 1830 }, { "epoch": 1.1104572204617473, "grad_norm": 0.6606374628961976, "learning_rate": 7.94148261765391e-06, "loss": 0.3611, "step": 1840 }, { "epoch": 1.1164931341481816, "grad_norm": 0.7063672325182395, "learning_rate": 7.913012871220605e-06, "loss": 0.3652, "step": 1850 }, { "epoch": 1.122529047834616, "grad_norm": 0.6353061774622171, "learning_rate": 7.884399429486e-06, "loss": 0.3619, "step": 1860 }, { "epoch": 1.1285649615210502, "grad_norm": 0.6646621743965846, "learning_rate": 7.855643703915585e-06, "loss": 0.3638, "step": 1870 }, { "epoch": 1.1346008752074845, "grad_norm": 0.6379034557335701, "learning_rate": 7.826747112993532e-06, "loss": 0.3595, "step": 1880 }, { "epoch": 1.1406367888939188, "grad_norm": 0.6995974469144366, "learning_rate": 7.797711082152726e-06, "loss": 0.3628, "step": 1890 }, { "epoch": 1.1466727025803531, "grad_norm": 0.6564170955860726, "learning_rate": 7.768537043704447e-06, "loss": 0.3637, "step": 1900 }, { "epoch": 1.1527086162667874, "grad_norm": 0.7572552114374352, "learning_rate": 7.739226436767721e-06, "loss": 0.362, "step": 1910 }, { "epoch": 1.1587445299532217, "grad_norm": 0.7571612085211564, "learning_rate": 7.709780707198328e-06, "loss": 0.3638, "step": 1920 }, { "epoch": 1.164780443639656, "grad_norm": 0.6792493024466744, "learning_rate": 7.680201307517479e-06, "loss": 0.3625, "step": 1930 }, { "epoch": 1.1708163573260904, "grad_norm": 0.664259682779261, "learning_rate": 7.650489696840164e-06, "loss": 0.3646, "step": 1940 }, { "epoch": 1.1768522710125244, "grad_norm": 0.6270149603322056, "learning_rate": 7.6206473408031775e-06, "loss": 0.3624, "step": 1950 }, { "epoch": 1.1828881846989587, "grad_norm": 0.6383894085325998, "learning_rate": 7.590675711492823e-06, "loss": 0.3643, "step": 1960 }, { "epoch": 1.188924098385393, "grad_norm": 0.6816453891866903, "learning_rate": 7.56057628737229e-06, "loss": 0.3637, "step": 1970 }, { "epoch": 1.1949600120718273, "grad_norm": 0.7133078108250313, "learning_rate": 7.530350553208726e-06, "loss": 0.3585, "step": 1980 }, { "epoch": 1.2009959257582616, "grad_norm": 0.6322767475179056, "learning_rate": 7.500000000000001e-06, "loss": 0.361, "step": 1990 }, { "epoch": 1.207031839444696, "grad_norm": 0.6987380190815154, "learning_rate": 7.469526124901149e-06, "loss": 0.3623, "step": 2000 }, { "epoch": 1.2130677531311302, "grad_norm": 0.6219916214226197, "learning_rate": 7.4389304311505195e-06, "loss": 0.3637, "step": 2010 }, { "epoch": 1.2191036668175645, "grad_norm": 0.6591583924033313, "learning_rate": 7.408214427995628e-06, "loss": 0.3644, "step": 2020 }, { "epoch": 1.2251395805039988, "grad_norm": 0.7005471225701302, "learning_rate": 7.3773796306187e-06, "loss": 0.3595, "step": 2030 }, { "epoch": 1.2311754941904332, "grad_norm": 0.6332845796820719, "learning_rate": 7.346427560061931e-06, "loss": 0.3652, "step": 2040 }, { "epoch": 1.2372114078768675, "grad_norm": 0.6778224076333697, "learning_rate": 7.315359743152464e-06, "loss": 0.3606, "step": 2050 }, { "epoch": 1.2432473215633015, "grad_norm": 0.6582665893949518, "learning_rate": 7.284177712427056e-06, "loss": 0.3599, "step": 2060 }, { "epoch": 1.2492832352497358, "grad_norm": 0.6584059931101761, "learning_rate": 7.252883006056495e-06, "loss": 0.3622, "step": 2070 }, { "epoch": 1.2553191489361701, "grad_norm": 0.6857700496450303, "learning_rate": 7.221477167769716e-06, "loss": 0.3633, "step": 2080 }, { "epoch": 1.2613550626226044, "grad_norm": 0.6856644672766703, "learning_rate": 7.189961746777657e-06, "loss": 0.363, "step": 2090 }, { "epoch": 1.2673909763090387, "grad_norm": 0.6857005736783666, "learning_rate": 7.1583382976968295e-06, "loss": 0.3618, "step": 2100 }, { "epoch": 1.273426889995473, "grad_norm": 0.6166440607694041, "learning_rate": 7.126608380472642e-06, "loss": 0.3593, "step": 2110 }, { "epoch": 1.2794628036819073, "grad_norm": 0.6673854300030073, "learning_rate": 7.094773560302438e-06, "loss": 0.3616, "step": 2120 }, { "epoch": 1.2854987173683416, "grad_norm": 0.6261609808400934, "learning_rate": 7.062835407558295e-06, "loss": 0.3623, "step": 2130 }, { "epoch": 1.291534631054776, "grad_norm": 0.6573770008704372, "learning_rate": 7.030795497709559e-06, "loss": 0.3616, "step": 2140 }, { "epoch": 1.2975705447412103, "grad_norm": 0.63175357402283, "learning_rate": 6.99865541124513e-06, "loss": 0.363, "step": 2150 }, { "epoch": 1.3036064584276446, "grad_norm": 0.7095581591416922, "learning_rate": 6.9664167335954866e-06, "loss": 0.3604, "step": 2160 }, { "epoch": 1.3096423721140789, "grad_norm": 0.6211244267814455, "learning_rate": 6.9340810550545004e-06, "loss": 0.3584, "step": 2170 }, { "epoch": 1.3156782858005132, "grad_norm": 0.6411383893721285, "learning_rate": 6.901649970700966e-06, "loss": 0.3616, "step": 2180 }, { "epoch": 1.3217141994869475, "grad_norm": 0.6508872294411808, "learning_rate": 6.869125080319934e-06, "loss": 0.3626, "step": 2190 }, { "epoch": 1.3277501131733815, "grad_norm": 0.6456129899609592, "learning_rate": 6.836507988323785e-06, "loss": 0.3612, "step": 2200 }, { "epoch": 1.3337860268598158, "grad_norm": 0.6885055595324049, "learning_rate": 6.803800303673096e-06, "loss": 0.3588, "step": 2210 }, { "epoch": 1.3398219405462501, "grad_norm": 0.6841559054058574, "learning_rate": 6.77100363979726e-06, "loss": 0.3608, "step": 2220 }, { "epoch": 1.3458578542326844, "grad_norm": 0.7229876827512576, "learning_rate": 6.738119614514913e-06, "loss": 0.3655, "step": 2230 }, { "epoch": 1.3518937679191187, "grad_norm": 0.6235312062043321, "learning_rate": 6.705149849954116e-06, "loss": 0.3607, "step": 2240 }, { "epoch": 1.357929681605553, "grad_norm": 0.6372979896414575, "learning_rate": 6.672095972472339e-06, "loss": 0.3613, "step": 2250 }, { "epoch": 1.3639655952919874, "grad_norm": 0.5943237749223176, "learning_rate": 6.638959612576243e-06, "loss": 0.3578, "step": 2260 }, { "epoch": 1.3700015089784217, "grad_norm": 0.6331473442190148, "learning_rate": 6.605742404841241e-06, "loss": 0.3606, "step": 2270 }, { "epoch": 1.376037422664856, "grad_norm": 0.6352200712052698, "learning_rate": 6.572445987830869e-06, "loss": 0.3602, "step": 2280 }, { "epoch": 1.38207333635129, "grad_norm": 0.6315011206585134, "learning_rate": 6.539072004015962e-06, "loss": 0.3585, "step": 2290 }, { "epoch": 1.3881092500377243, "grad_norm": 0.672467399271792, "learning_rate": 6.505622099693624e-06, "loss": 0.359, "step": 2300 }, { "epoch": 1.3941451637241586, "grad_norm": 0.6540330679200106, "learning_rate": 6.4720979249060245e-06, "loss": 0.357, "step": 2310 }, { "epoch": 1.400181077410593, "grad_norm": 0.6296334356002367, "learning_rate": 6.438501133359006e-06, "loss": 0.363, "step": 2320 }, { "epoch": 1.4062169910970272, "grad_norm": 0.5755292937597596, "learning_rate": 6.404833382340498e-06, "loss": 0.3579, "step": 2330 }, { "epoch": 1.4122529047834615, "grad_norm": 0.6273216809842853, "learning_rate": 6.3710963326387845e-06, "loss": 0.361, "step": 2340 }, { "epoch": 1.4182888184698959, "grad_norm": 0.659504858020357, "learning_rate": 6.337291648460554e-06, "loss": 0.3648, "step": 2350 }, { "epoch": 1.4243247321563302, "grad_norm": 0.646430703430766, "learning_rate": 6.303420997348828e-06, "loss": 0.3609, "step": 2360 }, { "epoch": 1.4303606458427645, "grad_norm": 0.70677217944382, "learning_rate": 6.269486050100692e-06, "loss": 0.3583, "step": 2370 }, { "epoch": 1.4363965595291988, "grad_norm": 0.6982928562021034, "learning_rate": 6.2354884806848825e-06, "loss": 0.3587, "step": 2380 }, { "epoch": 1.442432473215633, "grad_norm": 0.635748838083391, "learning_rate": 6.201429966159203e-06, "loss": 0.3603, "step": 2390 }, { "epoch": 1.4484683869020674, "grad_norm": 0.6591941857655591, "learning_rate": 6.167312186587813e-06, "loss": 0.3587, "step": 2400 }, { "epoch": 1.4545043005885017, "grad_norm": 0.6513018732706167, "learning_rate": 6.133136824958334e-06, "loss": 0.3583, "step": 2410 }, { "epoch": 1.460540214274936, "grad_norm": 0.6895727383237782, "learning_rate": 6.098905567098846e-06, "loss": 0.3638, "step": 2420 }, { "epoch": 1.4665761279613703, "grad_norm": 0.6281650394691185, "learning_rate": 6.064620101594715e-06, "loss": 0.3629, "step": 2430 }, { "epoch": 1.4726120416478046, "grad_norm": 0.7324490252015554, "learning_rate": 6.030282119705306e-06, "loss": 0.3621, "step": 2440 }, { "epoch": 1.4786479553342387, "grad_norm": 0.6803933740478001, "learning_rate": 5.99589331528055e-06, "loss": 0.3613, "step": 2450 }, { "epoch": 1.484683869020673, "grad_norm": 0.6535344969186776, "learning_rate": 5.961455384677393e-06, "loss": 0.3588, "step": 2460 }, { "epoch": 1.4907197827071073, "grad_norm": 0.6220530519094237, "learning_rate": 5.92697002667611e-06, "loss": 0.3614, "step": 2470 }, { "epoch": 1.4967556963935416, "grad_norm": 0.5997735782443615, "learning_rate": 5.892438942396515e-06, "loss": 0.3562, "step": 2480 }, { "epoch": 1.5027916100799759, "grad_norm": 0.5881600037112182, "learning_rate": 5.857863835214041e-06, "loss": 0.36, "step": 2490 }, { "epoch": 1.5088275237664102, "grad_norm": 0.6301732957095514, "learning_rate": 5.823246410675714e-06, "loss": 0.3602, "step": 2500 }, { "epoch": 1.5148634374528445, "grad_norm": 0.6369138058336548, "learning_rate": 5.788588376416026e-06, "loss": 0.3575, "step": 2510 }, { "epoch": 1.5208993511392785, "grad_norm": 1.8916358390305654, "learning_rate": 5.753891442072693e-06, "loss": 0.3584, "step": 2520 }, { "epoch": 1.5269352648257128, "grad_norm": 0.6400402583906231, "learning_rate": 5.719157319202325e-06, "loss": 0.3539, "step": 2530 }, { "epoch": 1.5329711785121471, "grad_norm": 0.6223661041265537, "learning_rate": 5.684387721195997e-06, "loss": 0.3595, "step": 2540 }, { "epoch": 1.5390070921985815, "grad_norm": 0.6649761362975228, "learning_rate": 5.649584363194725e-06, "loss": 0.36, "step": 2550 }, { "epoch": 1.5450430058850158, "grad_norm": 0.5989851062495032, "learning_rate": 5.6147489620048655e-06, "loss": 0.3582, "step": 2560 }, { "epoch": 1.55107891957145, "grad_norm": 0.6435791376898407, "learning_rate": 5.579883236013429e-06, "loss": 0.3559, "step": 2570 }, { "epoch": 1.5571148332578844, "grad_norm": 0.5973586913854247, "learning_rate": 5.544988905103304e-06, "loss": 0.3581, "step": 2580 }, { "epoch": 1.5631507469443187, "grad_norm": 0.6331916860819433, "learning_rate": 5.510067690568429e-06, "loss": 0.3573, "step": 2590 }, { "epoch": 1.569186660630753, "grad_norm": 0.6000249694556851, "learning_rate": 5.475121315028876e-06, "loss": 0.3574, "step": 2600 }, { "epoch": 1.5752225743171873, "grad_norm": 0.5919987411148389, "learning_rate": 5.4401515023458805e-06, "loss": 0.3622, "step": 2610 }, { "epoch": 1.5812584880036216, "grad_norm": 0.6130160505042299, "learning_rate": 5.4051599775368e-06, "loss": 0.3585, "step": 2620 }, { "epoch": 1.5872944016900559, "grad_norm": 0.6196465067482942, "learning_rate": 5.370148466690026e-06, "loss": 0.3524, "step": 2630 }, { "epoch": 1.5933303153764902, "grad_norm": 0.6396523422153624, "learning_rate": 5.335118696879836e-06, "loss": 0.3584, "step": 2640 }, { "epoch": 1.5993662290629245, "grad_norm": 0.6247037129381725, "learning_rate": 5.3000723960812e-06, "loss": 0.358, "step": 2650 }, { "epoch": 1.6054021427493588, "grad_norm": 0.6296280096461855, "learning_rate": 5.265011293084539e-06, "loss": 0.3557, "step": 2660 }, { "epoch": 1.611438056435793, "grad_norm": 0.6270649643037325, "learning_rate": 5.2299371174104505e-06, "loss": 0.3586, "step": 2670 }, { "epoch": 1.6174739701222274, "grad_norm": 0.6724245016825049, "learning_rate": 5.194851599224392e-06, "loss": 0.3563, "step": 2680 }, { "epoch": 1.6235098838086617, "grad_norm": 0.6246722692854128, "learning_rate": 5.159756469251327e-06, "loss": 0.3587, "step": 2690 }, { "epoch": 1.629545797495096, "grad_norm": 0.5856892553580461, "learning_rate": 5.1246534586903655e-06, "loss": 0.3538, "step": 2700 }, { "epoch": 1.63558171118153, "grad_norm": 0.6199649535926036, "learning_rate": 5.089544299129349e-06, "loss": 0.3552, "step": 2710 }, { "epoch": 1.6416176248679644, "grad_norm": 0.6395106688159933, "learning_rate": 5.054430722459442e-06, "loss": 0.3575, "step": 2720 }, { "epoch": 1.6476535385543987, "grad_norm": 0.6217763272730691, "learning_rate": 5.019314460789708e-06, "loss": 0.3568, "step": 2730 }, { "epoch": 1.653689452240833, "grad_norm": 0.6159996290026578, "learning_rate": 4.984197246361649e-06, "loss": 0.3565, "step": 2740 }, { "epoch": 1.6597253659272673, "grad_norm": 0.6021051813495957, "learning_rate": 4.949080811463767e-06, "loss": 0.3577, "step": 2750 }, { "epoch": 1.6657612796137016, "grad_norm": 0.6102206368388114, "learning_rate": 4.913966888346118e-06, "loss": 0.3556, "step": 2760 }, { "epoch": 1.6717971933001357, "grad_norm": 0.5968837038838994, "learning_rate": 4.8788572091348435e-06, "loss": 0.3581, "step": 2770 }, { "epoch": 1.67783310698657, "grad_norm": 0.5981355700097328, "learning_rate": 4.843753505746748e-06, "loss": 0.358, "step": 2780 }, { "epoch": 1.6838690206730043, "grad_norm": 0.6567740858768865, "learning_rate": 4.8086575098038505e-06, "loss": 0.3573, "step": 2790 }, { "epoch": 1.6899049343594386, "grad_norm": 0.6773288375423023, "learning_rate": 4.773570952547975e-06, "loss": 0.3552, "step": 2800 }, { "epoch": 1.6959408480458729, "grad_norm": 0.6202686068367487, "learning_rate": 4.738495564755345e-06, "loss": 0.3547, "step": 2810 }, { "epoch": 1.7019767617323072, "grad_norm": 0.5595337919079114, "learning_rate": 4.703433076651205e-06, "loss": 0.353, "step": 2820 }, { "epoch": 1.7080126754187415, "grad_norm": 0.6583890978208258, "learning_rate": 4.668385217824482e-06, "loss": 0.3583, "step": 2830 }, { "epoch": 1.7140485891051758, "grad_norm": 0.5898922057879373, "learning_rate": 4.633353717142448e-06, "loss": 0.3524, "step": 2840 }, { "epoch": 1.72008450279161, "grad_norm": 0.5938698503556435, "learning_rate": 4.5983403026654625e-06, "loss": 0.3554, "step": 2850 }, { "epoch": 1.7261204164780444, "grad_norm": 0.632653867195755, "learning_rate": 4.563346701561699e-06, "loss": 0.3535, "step": 2860 }, { "epoch": 1.7321563301644787, "grad_norm": 0.634481958151908, "learning_rate": 4.528374640021975e-06, "loss": 0.3548, "step": 2870 }, { "epoch": 1.738192243850913, "grad_norm": 0.6554591212571549, "learning_rate": 4.493425843174581e-06, "loss": 0.3523, "step": 2880 }, { "epoch": 1.7442281575373473, "grad_norm": 0.639030241328894, "learning_rate": 4.4585020350001885e-06, "loss": 0.3571, "step": 2890 }, { "epoch": 1.7502640712237816, "grad_norm": 0.579081823243162, "learning_rate": 4.423604938246815e-06, "loss": 0.358, "step": 2900 }, { "epoch": 1.7562999849102159, "grad_norm": 0.5786332593667859, "learning_rate": 4.38873627434483e-06, "loss": 0.3546, "step": 2910 }, { "epoch": 1.7623358985966502, "grad_norm": 0.5844630643462843, "learning_rate": 4.353897763322053e-06, "loss": 0.3557, "step": 2920 }, { "epoch": 1.7683718122830845, "grad_norm": 0.6362540824300466, "learning_rate": 4.319091123718891e-06, "loss": 0.3577, "step": 2930 }, { "epoch": 1.7744077259695188, "grad_norm": 0.6152238906869951, "learning_rate": 4.284318072503581e-06, "loss": 0.3558, "step": 2940 }, { "epoch": 1.7804436396559529, "grad_norm": 0.5871415463947245, "learning_rate": 4.249580324987482e-06, "loss": 0.3565, "step": 2950 }, { "epoch": 1.7864795533423872, "grad_norm": 0.5894304003956816, "learning_rate": 4.2148795947404664e-06, "loss": 0.3548, "step": 2960 }, { "epoch": 1.7925154670288215, "grad_norm": 0.5546376741165042, "learning_rate": 4.180217593506394e-06, "loss": 0.3545, "step": 2970 }, { "epoch": 1.7985513807152558, "grad_norm": 0.5882950021870835, "learning_rate": 4.1455960311186645e-06, "loss": 0.3578, "step": 2980 }, { "epoch": 1.80458729440169, "grad_norm": 0.6581353476419389, "learning_rate": 4.111016615415887e-06, "loss": 0.3545, "step": 2990 }, { "epoch": 1.8106232080881244, "grad_norm": 0.728199708802779, "learning_rate": 4.076481052157621e-06, "loss": 0.3567, "step": 3000 }, { "epoch": 1.8166591217745585, "grad_norm": 0.5836951966903218, "learning_rate": 4.0419910449402385e-06, "loss": 0.3541, "step": 3010 }, { "epoch": 1.8226950354609928, "grad_norm": 0.583825208842142, "learning_rate": 4.0075482951128965e-06, "loss": 0.3557, "step": 3020 }, { "epoch": 1.828730949147427, "grad_norm": 0.627394077298899, "learning_rate": 3.973154501693597e-06, "loss": 0.352, "step": 3030 }, { "epoch": 1.8347668628338614, "grad_norm": 0.6500394437203815, "learning_rate": 3.938811361285386e-06, "loss": 0.3543, "step": 3040 }, { "epoch": 1.8408027765202957, "grad_norm": 0.5787408936785984, "learning_rate": 3.904520567992655e-06, "loss": 0.3539, "step": 3050 }, { "epoch": 1.84683869020673, "grad_norm": 0.6006488260082842, "learning_rate": 3.870283813337587e-06, "loss": 0.3534, "step": 3060 }, { "epoch": 1.8528746038931643, "grad_norm": 0.6017706438925717, "learning_rate": 3.836102786176697e-06, "loss": 0.3533, "step": 3070 }, { "epoch": 1.8589105175795986, "grad_norm": 0.6160731963284618, "learning_rate": 3.8019791726175353e-06, "loss": 0.3537, "step": 3080 }, { "epoch": 1.8649464312660329, "grad_norm": 0.7394723530516694, "learning_rate": 3.767914655935513e-06, "loss": 0.3512, "step": 3090 }, { "epoch": 1.8709823449524672, "grad_norm": 0.5969802619046902, "learning_rate": 3.73391091649086e-06, "loss": 0.3514, "step": 3100 }, { "epoch": 1.8770182586389015, "grad_norm": 0.6434909203687009, "learning_rate": 3.6999696316457468e-06, "loss": 0.3525, "step": 3110 }, { "epoch": 1.8830541723253358, "grad_norm": 0.6185839002292769, "learning_rate": 3.6660924756815314e-06, "loss": 0.3516, "step": 3120 }, { "epoch": 1.88909008601177, "grad_norm": 0.5764246370880874, "learning_rate": 3.63228111971618e-06, "loss": 0.3543, "step": 3130 }, { "epoch": 1.8951259996982044, "grad_norm": 0.5724269342695871, "learning_rate": 3.5985372316218187e-06, "loss": 0.3524, "step": 3140 }, { "epoch": 1.9011619133846387, "grad_norm": 0.5893980753783277, "learning_rate": 3.5648624759424723e-06, "loss": 0.3487, "step": 3150 }, { "epoch": 1.907197827071073, "grad_norm": 0.6385286384600478, "learning_rate": 3.5312585138119503e-06, "loss": 0.353, "step": 3160 }, { "epoch": 1.9132337407575073, "grad_norm": 0.643587632906283, "learning_rate": 3.4977270028719013e-06, "loss": 0.3498, "step": 3170 }, { "epoch": 1.9192696544439416, "grad_norm": 0.6189874783125575, "learning_rate": 3.4642695971900506e-06, "loss": 0.3542, "step": 3180 }, { "epoch": 1.925305568130376, "grad_norm": 0.6320316722606764, "learning_rate": 3.4308879471785986e-06, "loss": 0.3523, "step": 3190 }, { "epoch": 1.93134148181681, "grad_norm": 0.6715762862677156, "learning_rate": 3.3975836995128176e-06, "loss": 0.3505, "step": 3200 }, { "epoch": 1.9373773955032443, "grad_norm": 0.5947951437286136, "learning_rate": 3.3643584970498166e-06, "loss": 0.356, "step": 3210 }, { "epoch": 1.9434133091896786, "grad_norm": 0.5953138896683005, "learning_rate": 3.3312139787474986e-06, "loss": 0.3552, "step": 3220 }, { "epoch": 1.9494492228761129, "grad_norm": 0.5696476474991146, "learning_rate": 3.298151779583725e-06, "loss": 0.3496, "step": 3230 }, { "epoch": 1.9554851365625472, "grad_norm": 0.6131972032987533, "learning_rate": 3.2651735304756505e-06, "loss": 0.3536, "step": 3240 }, { "epoch": 1.9615210502489815, "grad_norm": 0.6336317988993604, "learning_rate": 3.2322808581992825e-06, "loss": 0.3563, "step": 3250 }, { "epoch": 1.9675569639354156, "grad_norm": 0.6341579320490388, "learning_rate": 3.1994753853092284e-06, "loss": 0.3482, "step": 3260 }, { "epoch": 1.9735928776218499, "grad_norm": 0.5954681993221721, "learning_rate": 3.166758730058653e-06, "loss": 0.3518, "step": 3270 }, { "epoch": 1.9796287913082842, "grad_norm": 0.5893599270303087, "learning_rate": 3.134132506319467e-06, "loss": 0.3536, "step": 3280 }, { "epoch": 1.9856647049947185, "grad_norm": 0.5689301232875419, "learning_rate": 3.101598323502698e-06, "loss": 0.3537, "step": 3290 }, { "epoch": 1.9917006186811528, "grad_norm": 0.6116819898452338, "learning_rate": 3.0691577864791176e-06, "loss": 0.3515, "step": 3300 }, { "epoch": 1.997736532367587, "grad_norm": 0.5926997741101551, "learning_rate": 3.036812495500058e-06, "loss": 0.3504, "step": 3310 }, { "epoch": 2.0036215482118607, "grad_norm": 0.5928785278377309, "learning_rate": 3.0045640461184917e-06, "loss": 0.339, "step": 3320 }, { "epoch": 2.009657461898295, "grad_norm": 0.6039984062866832, "learning_rate": 2.97241402911031e-06, "loss": 0.3325, "step": 3330 }, { "epoch": 2.0156933755847293, "grad_norm": 0.6671960610879556, "learning_rate": 2.940364030395856e-06, "loss": 0.3284, "step": 3340 }, { "epoch": 2.0217292892711636, "grad_norm": 0.5808483500966948, "learning_rate": 2.908415630961702e-06, "loss": 0.3265, "step": 3350 }, { "epoch": 2.027765202957598, "grad_norm": 0.6017580883286716, "learning_rate": 2.876570406782645e-06, "loss": 0.3296, "step": 3360 }, { "epoch": 2.033801116644032, "grad_norm": 0.6067555273933171, "learning_rate": 2.844829928743987e-06, "loss": 0.3315, "step": 3370 }, { "epoch": 2.0398370303304665, "grad_norm": 0.5774545226545359, "learning_rate": 2.813195762564018e-06, "loss": 0.3268, "step": 3380 }, { "epoch": 2.0458729440169003, "grad_norm": 0.5888748284507602, "learning_rate": 2.781669468716811e-06, "loss": 0.3292, "step": 3390 }, { "epoch": 2.0519088577033346, "grad_norm": 0.6137376399757654, "learning_rate": 2.7502526023552227e-06, "loss": 0.3258, "step": 3400 }, { "epoch": 2.057944771389769, "grad_norm": 0.59390579398881, "learning_rate": 2.718946713234185e-06, "loss": 0.3295, "step": 3410 }, { "epoch": 2.0639806850762032, "grad_norm": 0.6555105104152712, "learning_rate": 2.6877533456342714e-06, "loss": 0.3301, "step": 3420 }, { "epoch": 2.0700165987626375, "grad_norm": 0.6048063575727766, "learning_rate": 2.6566740382855005e-06, "loss": 0.3289, "step": 3430 }, { "epoch": 2.076052512449072, "grad_norm": 0.6014841818951663, "learning_rate": 2.625710324291442e-06, "loss": 0.3325, "step": 3440 }, { "epoch": 2.082088426135506, "grad_norm": 0.6035697169885135, "learning_rate": 2.5948637310535886e-06, "loss": 0.3296, "step": 3450 }, { "epoch": 2.0881243398219405, "grad_norm": 0.6112233467387164, "learning_rate": 2.5641357801960186e-06, "loss": 0.3278, "step": 3460 }, { "epoch": 2.0941602535083748, "grad_norm": 0.5870217829586826, "learning_rate": 2.5335279874903185e-06, "loss": 0.3313, "step": 3470 }, { "epoch": 2.100196167194809, "grad_norm": 0.5897131296840935, "learning_rate": 2.503041862780827e-06, "loss": 0.3296, "step": 3480 }, { "epoch": 2.1062320808812434, "grad_norm": 0.5718259687035243, "learning_rate": 2.47267890991016e-06, "loss": 0.3281, "step": 3490 }, { "epoch": 2.1122679945676777, "grad_norm": 0.5777856500315681, "learning_rate": 2.4424406266450045e-06, "loss": 0.3296, "step": 3500 }, { "epoch": 2.118303908254112, "grad_norm": 0.6262457739159312, "learning_rate": 2.412328504602264e-06, "loss": 0.3336, "step": 3510 }, { "epoch": 2.1243398219405463, "grad_norm": 0.589194023665236, "learning_rate": 2.382344029175462e-06, "loss": 0.3349, "step": 3520 }, { "epoch": 2.1303757356269806, "grad_norm": 0.6140628916832596, "learning_rate": 2.3524886794614653e-06, "loss": 0.331, "step": 3530 }, { "epoch": 2.136411649313415, "grad_norm": 0.6028871935735021, "learning_rate": 2.322763928187543e-06, "loss": 0.3307, "step": 3540 }, { "epoch": 2.142447562999849, "grad_norm": 0.5798390235554982, "learning_rate": 2.293171241638698e-06, "loss": 0.3298, "step": 3550 }, { "epoch": 2.1484834766862835, "grad_norm": 0.5950496656474389, "learning_rate": 2.263712079585345e-06, "loss": 0.3305, "step": 3560 }, { "epoch": 2.154519390372718, "grad_norm": 0.5926734664470145, "learning_rate": 2.2343878952113012e-06, "loss": 0.3276, "step": 3570 }, { "epoch": 2.160555304059152, "grad_norm": 0.5877698580097848, "learning_rate": 2.2052001350421096e-06, "loss": 0.3268, "step": 3580 }, { "epoch": 2.1665912177455864, "grad_norm": 0.5888247000199527, "learning_rate": 2.1761502388736655e-06, "loss": 0.3327, "step": 3590 }, { "epoch": 2.1726271314320207, "grad_norm": 0.5807991121980183, "learning_rate": 2.14723963970121e-06, "loss": 0.3315, "step": 3600 }, { "epoch": 2.178663045118455, "grad_norm": 0.5763459777490838, "learning_rate": 2.118469763648643e-06, "loss": 0.3278, "step": 3610 }, { "epoch": 2.1846989588048893, "grad_norm": 0.5588744726618396, "learning_rate": 2.0898420298981537e-06, "loss": 0.3296, "step": 3620 }, { "epoch": 2.1907348724913236, "grad_norm": 0.6040859182215225, "learning_rate": 2.061357850620243e-06, "loss": 0.3279, "step": 3630 }, { "epoch": 2.1967707861777575, "grad_norm": 0.6083091005217864, "learning_rate": 2.0330186309040394e-06, "loss": 0.3298, "step": 3640 }, { "epoch": 2.2028066998641918, "grad_norm": 0.568667447432841, "learning_rate": 2.0048257686879997e-06, "loss": 0.3286, "step": 3650 }, { "epoch": 2.208842613550626, "grad_norm": 0.586169393672314, "learning_rate": 1.9767806546909457e-06, "loss": 0.3316, "step": 3660 }, { "epoch": 2.2148785272370604, "grad_norm": 0.5855668928973393, "learning_rate": 1.9488846723434646e-06, "loss": 0.3262, "step": 3670 }, { "epoch": 2.2209144409234947, "grad_norm": 0.5920501956876788, "learning_rate": 1.921139197719664e-06, "loss": 0.3298, "step": 3680 }, { "epoch": 2.226950354609929, "grad_norm": 0.6343784219115092, "learning_rate": 1.893545599469292e-06, "loss": 0.3316, "step": 3690 }, { "epoch": 2.2329862682963633, "grad_norm": 0.56167618088226, "learning_rate": 1.86610523875023e-06, "loss": 0.3288, "step": 3700 }, { "epoch": 2.2390221819827976, "grad_norm": 0.5937195941687996, "learning_rate": 1.8388194691613308e-06, "loss": 0.3285, "step": 3710 }, { "epoch": 2.245058095669232, "grad_norm": 0.6068056100462802, "learning_rate": 1.811689636675672e-06, "loss": 0.3295, "step": 3720 }, { "epoch": 2.251094009355666, "grad_norm": 0.5998463693882512, "learning_rate": 1.7847170795741414e-06, "loss": 0.33, "step": 3730 }, { "epoch": 2.2571299230421005, "grad_norm": 0.5707846476820784, "learning_rate": 1.7579031283794234e-06, "loss": 0.3324, "step": 3740 }, { "epoch": 2.2631658367285348, "grad_norm": 0.6070101386107148, "learning_rate": 1.7312491057903808e-06, "loss": 0.3288, "step": 3750 }, { "epoch": 2.269201750414969, "grad_norm": 0.5684370763425239, "learning_rate": 1.7047563266167888e-06, "loss": 0.3291, "step": 3760 }, { "epoch": 2.2752376641014034, "grad_norm": 0.5367883177519198, "learning_rate": 1.678426097714489e-06, "loss": 0.3265, "step": 3770 }, { "epoch": 2.2812735777878377, "grad_norm": 0.5853244396608877, "learning_rate": 1.6522597179209187e-06, "loss": 0.3259, "step": 3780 }, { "epoch": 2.287309491474272, "grad_norm": 0.5641343283784108, "learning_rate": 1.6262584779910472e-06, "loss": 0.3286, "step": 3790 }, { "epoch": 2.2933454051607063, "grad_norm": 0.5563209895809159, "learning_rate": 1.600423660533692e-06, "loss": 0.3281, "step": 3800 }, { "epoch": 2.2993813188471406, "grad_norm": 0.5805361043294971, "learning_rate": 1.5747565399482605e-06, "loss": 0.3299, "step": 3810 }, { "epoch": 2.305417232533575, "grad_norm": 0.5811803574606669, "learning_rate": 1.5492583823618878e-06, "loss": 0.3289, "step": 3820 }, { "epoch": 2.311453146220009, "grad_norm": 0.6040233888147246, "learning_rate": 1.523930445566963e-06, "loss": 0.3308, "step": 3830 }, { "epoch": 2.3174890599064435, "grad_norm": 0.6059976475921155, "learning_rate": 1.4987739789591056e-06, "loss": 0.3294, "step": 3840 }, { "epoch": 2.323524973592878, "grad_norm": 0.5905386095910952, "learning_rate": 1.4737902234755203e-06, "loss": 0.3301, "step": 3850 }, { "epoch": 2.329560887279312, "grad_norm": 0.5747067002149818, "learning_rate": 1.448980411533782e-06, "loss": 0.3278, "step": 3860 }, { "epoch": 2.335596800965746, "grad_norm": 0.5732211405787891, "learning_rate": 1.4243457669710564e-06, "loss": 0.3245, "step": 3870 }, { "epoch": 2.3416327146521807, "grad_norm": 0.6079651710560006, "learning_rate": 1.3998875049837141e-06, "loss": 0.3268, "step": 3880 }, { "epoch": 2.3476686283386146, "grad_norm": 0.5783578941572416, "learning_rate": 1.3756068320673938e-06, "loss": 0.3283, "step": 3890 }, { "epoch": 2.353704542025049, "grad_norm": 0.5532376575030373, "learning_rate": 1.3515049459574847e-06, "loss": 0.3254, "step": 3900 }, { "epoch": 2.359740455711483, "grad_norm": 0.5467274114487632, "learning_rate": 1.3275830355700519e-06, "loss": 0.3257, "step": 3910 }, { "epoch": 2.3657763693979175, "grad_norm": 0.5922264462167515, "learning_rate": 1.3038422809431733e-06, "loss": 0.3291, "step": 3920 }, { "epoch": 2.3718122830843518, "grad_norm": 0.5807751637804499, "learning_rate": 1.280283853178742e-06, "loss": 0.3281, "step": 3930 }, { "epoch": 2.377848196770786, "grad_norm": 0.5751202261036737, "learning_rate": 1.256908914384698e-06, "loss": 0.3321, "step": 3940 }, { "epoch": 2.3838841104572204, "grad_norm": 0.5829573981972134, "learning_rate": 1.233718617617689e-06, "loss": 0.3303, "step": 3950 }, { "epoch": 2.3899200241436547, "grad_norm": 0.5614143554083199, "learning_rate": 1.2107141068262119e-06, "loss": 0.3276, "step": 3960 }, { "epoch": 2.395955937830089, "grad_norm": 0.5657826082869326, "learning_rate": 1.1878965167941658e-06, "loss": 0.3279, "step": 3970 }, { "epoch": 2.4019918515165233, "grad_norm": 0.5583977788315128, "learning_rate": 1.1652669730848837e-06, "loss": 0.3259, "step": 3980 }, { "epoch": 2.4080277652029576, "grad_norm": 0.5670227130617606, "learning_rate": 1.1428265919856057e-06, "loss": 0.3319, "step": 3990 }, { "epoch": 2.414063678889392, "grad_norm": 0.5345020446470288, "learning_rate": 1.1205764804524172e-06, "loss": 0.3258, "step": 4000 }, { "epoch": 2.420099592575826, "grad_norm": 0.5742530447532448, "learning_rate": 1.0985177360556421e-06, "loss": 0.3281, "step": 4010 }, { "epoch": 2.4261355062622605, "grad_norm": 0.5681633515485598, "learning_rate": 1.0766514469257006e-06, "loss": 0.33, "step": 4020 }, { "epoch": 2.432171419948695, "grad_norm": 0.5469547021834809, "learning_rate": 1.0549786916994387e-06, "loss": 0.3271, "step": 4030 }, { "epoch": 2.438207333635129, "grad_norm": 0.5467836338693935, "learning_rate": 1.0335005394669062e-06, "loss": 0.3282, "step": 4040 }, { "epoch": 2.4442432473215634, "grad_norm": 0.5496370736783344, "learning_rate": 1.012218049718639e-06, "loss": 0.3267, "step": 4050 }, { "epoch": 2.4502791610079977, "grad_norm": 0.5532695447765059, "learning_rate": 9.911322722933825e-07, "loss": 0.3267, "step": 4060 }, { "epoch": 2.456315074694432, "grad_norm": 0.5593061519759683, "learning_rate": 9.702442473263035e-07, "loss": 0.3261, "step": 4070 }, { "epoch": 2.4623509883808663, "grad_norm": 0.5675718980431652, "learning_rate": 9.495550051976937e-07, "loss": 0.33, "step": 4080 }, { "epoch": 2.4683869020673006, "grad_norm": 0.55036807877547, "learning_rate": 9.290655664821296e-07, "loss": 0.326, "step": 4090 }, { "epoch": 2.474422815753735, "grad_norm": 0.6047393707132771, "learning_rate": 9.087769418981352e-07, "loss": 0.3294, "step": 4100 }, { "epoch": 2.480458729440169, "grad_norm": 0.5377856224781872, "learning_rate": 8.88690132258323e-07, "loss": 0.3301, "step": 4110 }, { "epoch": 2.486494643126603, "grad_norm": 0.5404023215833121, "learning_rate": 8.688061284200266e-07, "loss": 0.3308, "step": 4120 }, { "epoch": 2.492530556813038, "grad_norm": 0.5429038964087051, "learning_rate": 8.491259112364192e-07, "loss": 0.3277, "step": 4130 }, { "epoch": 2.4985664704994717, "grad_norm": 0.5556392061166345, "learning_rate": 8.296504515081333e-07, "loss": 0.328, "step": 4140 }, { "epoch": 2.5046023841859064, "grad_norm": 0.5550852188468128, "learning_rate": 8.103807099353733e-07, "loss": 0.3303, "step": 4150 }, { "epoch": 2.5106382978723403, "grad_norm": 0.5683960534884703, "learning_rate": 7.913176370705166e-07, "loss": 0.3303, "step": 4160 }, { "epoch": 2.5166742115587746, "grad_norm": 0.5647058376594801, "learning_rate": 7.724621732712373e-07, "loss": 0.3281, "step": 4170 }, { "epoch": 2.522710125245209, "grad_norm": 0.5396463872633352, "learning_rate": 7.538152486541078e-07, "loss": 0.3224, "step": 4180 }, { "epoch": 2.528746038931643, "grad_norm": 0.5769965957501234, "learning_rate": 7.353777830487247e-07, "loss": 0.3298, "step": 4190 }, { "epoch": 2.5347819526180775, "grad_norm": 0.5617546845646423, "learning_rate": 7.171506859523298e-07, "loss": 0.3284, "step": 4200 }, { "epoch": 2.540817866304512, "grad_norm": 0.5370456459767287, "learning_rate": 6.991348564849504e-07, "loss": 0.3272, "step": 4210 }, { "epoch": 2.546853779990946, "grad_norm": 0.5449920129863155, "learning_rate": 6.813311833450426e-07, "loss": 0.3244, "step": 4220 }, { "epoch": 2.5528896936773804, "grad_norm": 0.5814796250543772, "learning_rate": 6.637405447656542e-07, "loss": 0.3286, "step": 4230 }, { "epoch": 2.5589256073638147, "grad_norm": 0.5802300234417045, "learning_rate": 6.463638084711088e-07, "loss": 0.3303, "step": 4240 }, { "epoch": 2.564961521050249, "grad_norm": 0.5682016106324166, "learning_rate": 6.29201831634188e-07, "loss": 0.3275, "step": 4250 }, { "epoch": 2.5709974347366833, "grad_norm": 0.628799960343276, "learning_rate": 6.122554608338605e-07, "loss": 0.3278, "step": 4260 }, { "epoch": 2.5770333484231176, "grad_norm": 0.5261749879449605, "learning_rate": 5.955255320135195e-07, "loss": 0.3287, "step": 4270 }, { "epoch": 2.583069262109552, "grad_norm": 0.5365103226953842, "learning_rate": 5.790128704397424e-07, "loss": 0.3242, "step": 4280 }, { "epoch": 2.589105175795986, "grad_norm": 0.5482210552849281, "learning_rate": 5.627182906615825e-07, "loss": 0.3254, "step": 4290 }, { "epoch": 2.5951410894824205, "grad_norm": 0.5270093070193902, "learning_rate": 5.466425964703914e-07, "loss": 0.3268, "step": 4300 }, { "epoch": 2.601177003168855, "grad_norm": 0.5351843851712077, "learning_rate": 5.307865808601664e-07, "loss": 0.3267, "step": 4310 }, { "epoch": 2.607212916855289, "grad_norm": 0.5551045883829538, "learning_rate": 5.151510259884329e-07, "loss": 0.3261, "step": 4320 }, { "epoch": 2.6132488305417234, "grad_norm": 0.5716515174477422, "learning_rate": 4.997367031376627e-07, "loss": 0.3283, "step": 4330 }, { "epoch": 2.6192847442281577, "grad_norm": 0.5484469831279773, "learning_rate": 4.84544372677228e-07, "loss": 0.3279, "step": 4340 }, { "epoch": 2.6253206579145916, "grad_norm": 0.5627722024643765, "learning_rate": 4.6957478402589076e-07, "loss": 0.3285, "step": 4350 }, { "epoch": 2.6313565716010263, "grad_norm": 0.54030007506572, "learning_rate": 4.548286756148401e-07, "loss": 0.328, "step": 4360 }, { "epoch": 2.63739248528746, "grad_norm": 0.5688872966757411, "learning_rate": 4.4030677485125906e-07, "loss": 0.3291, "step": 4370 }, { "epoch": 2.643428398973895, "grad_norm": 0.5611453338620043, "learning_rate": 4.2600979808244627e-07, "loss": 0.3267, "step": 4380 }, { "epoch": 2.649464312660329, "grad_norm": 0.5591585705521456, "learning_rate": 4.119384505604834e-07, "loss": 0.3285, "step": 4390 }, { "epoch": 2.655500226346763, "grad_norm": 0.5403567309346599, "learning_rate": 3.980934264074393e-07, "loss": 0.3234, "step": 4400 }, { "epoch": 2.6615361400331974, "grad_norm": 0.5366841662024877, "learning_rate": 3.8447540858113197e-07, "loss": 0.3289, "step": 4410 }, { "epoch": 2.6675720537196317, "grad_norm": 0.5505493242335168, "learning_rate": 3.710850688414419e-07, "loss": 0.329, "step": 4420 }, { "epoch": 2.673607967406066, "grad_norm": 0.5572305600353893, "learning_rate": 3.579230677171702e-07, "loss": 0.326, "step": 4430 }, { "epoch": 2.6796438810925003, "grad_norm": 0.5320801899819191, "learning_rate": 3.4499005447346024e-07, "loss": 0.3272, "step": 4440 }, { "epoch": 2.6856797947789346, "grad_norm": 0.5621605698475473, "learning_rate": 3.32286667079767e-07, "loss": 0.3232, "step": 4450 }, { "epoch": 2.691715708465369, "grad_norm": 0.5535800034831663, "learning_rate": 3.1981353217838853e-07, "loss": 0.3267, "step": 4460 }, { "epoch": 2.697751622151803, "grad_norm": 0.5541989505631728, "learning_rate": 3.0757126505355284e-07, "loss": 0.3271, "step": 4470 }, { "epoch": 2.7037875358382375, "grad_norm": 0.554309743511386, "learning_rate": 2.9556046960106997e-07, "loss": 0.3275, "step": 4480 }, { "epoch": 2.709823449524672, "grad_norm": 0.5441084268121339, "learning_rate": 2.837817382985375e-07, "loss": 0.3265, "step": 4490 }, { "epoch": 2.715859363211106, "grad_norm": 0.5661752729331364, "learning_rate": 2.722356521761188e-07, "loss": 0.3251, "step": 4500 }, { "epoch": 2.7218952768975404, "grad_norm": 0.5284497315283775, "learning_rate": 2.6092278078788004e-07, "loss": 0.3249, "step": 4510 }, { "epoch": 2.7279311905839747, "grad_norm": 0.52978683625873, "learning_rate": 2.4984368218369305e-07, "loss": 0.3282, "step": 4520 }, { "epoch": 2.733967104270409, "grad_norm": 0.5435219044017648, "learning_rate": 2.389989028817108e-07, "loss": 0.3283, "step": 4530 }, { "epoch": 2.7400030179568433, "grad_norm": 0.5516780362582209, "learning_rate": 2.2838897784140612e-07, "loss": 0.3274, "step": 4540 }, { "epoch": 2.7460389316432776, "grad_norm": 0.546719555306795, "learning_rate": 2.1801443043718285e-07, "loss": 0.3298, "step": 4550 }, { "epoch": 2.752074845329712, "grad_norm": 0.5563980632574993, "learning_rate": 2.0787577243255807e-07, "loss": 0.3267, "step": 4560 }, { "epoch": 2.758110759016146, "grad_norm": 0.535638021015215, "learning_rate": 1.9797350395492077e-07, "loss": 0.3253, "step": 4570 }, { "epoch": 2.76414667270258, "grad_norm": 0.5347609121819951, "learning_rate": 1.8830811347085697e-07, "loss": 0.3252, "step": 4580 }, { "epoch": 2.770182586389015, "grad_norm": 0.5415863482391344, "learning_rate": 1.788800777620542e-07, "loss": 0.3276, "step": 4590 }, { "epoch": 2.7762185000754487, "grad_norm": 0.5466212280219622, "learning_rate": 1.6968986190178728e-07, "loss": 0.326, "step": 4600 }, { "epoch": 2.7822544137618834, "grad_norm": 0.5391843573715891, "learning_rate": 1.60737919231973e-07, "loss": 0.3265, "step": 4610 }, { "epoch": 2.7882903274483173, "grad_norm": 0.5465887305789703, "learning_rate": 1.5202469134080633e-07, "loss": 0.3291, "step": 4620 }, { "epoch": 2.794326241134752, "grad_norm": 0.5447449635613493, "learning_rate": 1.4355060804098043e-07, "loss": 0.3254, "step": 4630 }, { "epoch": 2.800362154821186, "grad_norm": 0.5376834372862567, "learning_rate": 1.3531608734848433e-07, "loss": 0.3252, "step": 4640 }, { "epoch": 2.80639806850762, "grad_norm": 0.5419447242645747, "learning_rate": 1.273215354619789e-07, "loss": 0.3277, "step": 4650 }, { "epoch": 2.8124339821940545, "grad_norm": 0.521436211709283, "learning_rate": 1.1956734674276492e-07, "loss": 0.3267, "step": 4660 }, { "epoch": 2.818469895880489, "grad_norm": 0.5443036316275357, "learning_rate": 1.1205390369532553e-07, "loss": 0.328, "step": 4670 }, { "epoch": 2.824505809566923, "grad_norm": 0.5736771187575125, "learning_rate": 1.0478157694846002e-07, "loss": 0.3269, "step": 4680 }, { "epoch": 2.8305417232533574, "grad_norm": 0.5533030963421177, "learning_rate": 9.775072523700135e-08, "loss": 0.3274, "step": 4690 }, { "epoch": 2.8365776369397917, "grad_norm": 0.5297867847542854, "learning_rate": 9.096169538411747e-08, "loss": 0.3251, "step": 4700 }, { "epoch": 2.842613550626226, "grad_norm": 0.5603590658940372, "learning_rate": 8.441482228420505e-08, "loss": 0.3261, "step": 4710 }, { "epoch": 2.8486494643126603, "grad_norm": 0.5648155137748375, "learning_rate": 7.81104288863721e-08, "loss": 0.3238, "step": 4720 }, { "epoch": 2.8546853779990946, "grad_norm": 0.5155233113764542, "learning_rate": 7.204882617850129e-08, "loss": 0.3284, "step": 4730 }, { "epoch": 2.860721291685529, "grad_norm": 0.5283055469638852, "learning_rate": 6.623031317191386e-08, "loss": 0.3243, "step": 4740 }, { "epoch": 2.866757205371963, "grad_norm": 0.5324757215458941, "learning_rate": 6.065517688661926e-08, "loss": 0.3266, "step": 4750 }, { "epoch": 2.8727931190583975, "grad_norm": 0.5270694862009192, "learning_rate": 5.532369233715418e-08, "loss": 0.3263, "step": 4760 }, { "epoch": 2.878829032744832, "grad_norm": 0.5184850936640313, "learning_rate": 5.02361225190201e-08, "loss": 0.325, "step": 4770 }, { "epoch": 2.884864946431266, "grad_norm": 0.5522807685327075, "learning_rate": 4.539271839570702e-08, "loss": 0.3303, "step": 4780 }, { "epoch": 2.8909008601177004, "grad_norm": 0.5833975533295399, "learning_rate": 4.079371888631667e-08, "loss": 0.3287, "step": 4790 }, { "epoch": 2.8969367738041347, "grad_norm": 0.5435014494666157, "learning_rate": 3.643935085377193e-08, "loss": 0.3291, "step": 4800 }, { "epoch": 2.902972687490569, "grad_norm": 0.5317951774661862, "learning_rate": 3.232982909363247e-08, "loss": 0.3302, "step": 4810 }, { "epoch": 2.9090086011770033, "grad_norm": 0.5470417295465569, "learning_rate": 2.8465356323494897e-08, "loss": 0.3293, "step": 4820 }, { "epoch": 2.915044514863437, "grad_norm": 0.5361189628769133, "learning_rate": 2.4846123172992953e-08, "loss": 0.3281, "step": 4830 }, { "epoch": 2.921080428549872, "grad_norm": 0.54177713240335, "learning_rate": 2.147230817439616e-08, "loss": 0.326, "step": 4840 }, { "epoch": 2.927116342236306, "grad_norm": 0.5323052126594137, "learning_rate": 1.834407775380187e-08, "loss": 0.3281, "step": 4850 }, { "epoch": 2.9331522559227405, "grad_norm": 0.5359903004283559, "learning_rate": 1.5461586222924596e-08, "loss": 0.3261, "step": 4860 }, { "epoch": 2.9391881696091744, "grad_norm": 0.5543427271655068, "learning_rate": 1.2824975771486558e-08, "loss": 0.3264, "step": 4870 }, { "epoch": 2.945224083295609, "grad_norm": 0.5384618149718552, "learning_rate": 1.0434376460201067e-08, "loss": 0.3271, "step": 4880 }, { "epoch": 2.951259996982043, "grad_norm": 0.5121547980752482, "learning_rate": 8.289906214358767e-09, "loss": 0.3252, "step": 4890 }, { "epoch": 2.9572959106684773, "grad_norm": 0.5126843579972032, "learning_rate": 6.391670818008955e-09, "loss": 0.3255, "step": 4900 }, { "epoch": 2.9633318243549116, "grad_norm": 0.5323524113852374, "learning_rate": 4.7397639087432e-09, "loss": 0.3267, "step": 4910 }, { "epoch": 2.969367738041346, "grad_norm": 0.5324354968490875, "learning_rate": 3.3342669730729303e-09, "loss": 0.3255, "step": 4920 }, { "epoch": 2.97540365172778, "grad_norm": 0.550825930999869, "learning_rate": 2.1752493424148647e-09, "loss": 0.328, "step": 4930 }, { "epoch": 2.9814395654142145, "grad_norm": 0.5289245768111625, "learning_rate": 1.2627681896670852e-09, "loss": 0.3265, "step": 4940 }, { "epoch": 2.987475479100649, "grad_norm": 0.5548931035545003, "learning_rate": 5.968685263885165e-10, "loss": 0.329, "step": 4950 }, { "epoch": 2.993511392787083, "grad_norm": 0.5268141741402684, "learning_rate": 1.7758320058236522e-10, "loss": 0.3264, "step": 4960 }, { "epoch": 2.9995473064735174, "grad_norm": 0.5394909098085136, "learning_rate": 4.932895071863009e-12, "loss": 0.3267, "step": 4970 }, { "epoch": 3.0, "step": 4971, "total_flos": 3906508525600768.0, "train_loss": 0.3714317911300974, "train_runtime": 271631.3463, "train_samples_per_second": 4.684, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 4971, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3906508525600768.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }