{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4394, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022762839664248113, "grad_norm": 10.595879842373915, "learning_rate": 2.0454545454545456e-07, "loss": 0.9592, "step": 10 }, { "epoch": 0.004552567932849623, "grad_norm": 8.506899480641875, "learning_rate": 4.3181818181818187e-07, "loss": 0.9341, "step": 20 }, { "epoch": 0.006828851899274435, "grad_norm": 4.048540973587426, "learning_rate": 6.590909090909091e-07, "loss": 0.8698, "step": 30 }, { "epoch": 0.009105135865699245, "grad_norm": 1.7983922183043597, "learning_rate": 8.863636363636364e-07, "loss": 0.7762, "step": 40 }, { "epoch": 0.011381419832124057, "grad_norm": 1.301643851792194, "learning_rate": 1.1136363636363637e-06, "loss": 0.7056, "step": 50 }, { "epoch": 0.01365770379854887, "grad_norm": 0.8116123655731647, "learning_rate": 1.3409090909090911e-06, "loss": 0.636, "step": 60 }, { "epoch": 0.015933987764973682, "grad_norm": 0.7273739176052791, "learning_rate": 1.5681818181818184e-06, "loss": 0.6355, "step": 70 }, { "epoch": 0.01821027173139849, "grad_norm": 0.7380885266969118, "learning_rate": 1.7954545454545456e-06, "loss": 0.6222, "step": 80 }, { "epoch": 0.020486555697823303, "grad_norm": 0.6975134603454084, "learning_rate": 2.022727272727273e-06, "loss": 0.5979, "step": 90 }, { "epoch": 0.022762839664248115, "grad_norm": 0.7291820146919704, "learning_rate": 2.25e-06, "loss": 0.5945, "step": 100 }, { "epoch": 0.025039123630672927, "grad_norm": 0.7008207721708937, "learning_rate": 2.4772727272727275e-06, "loss": 0.5705, "step": 110 }, { "epoch": 0.02731540759709774, "grad_norm": 0.6449537277719501, "learning_rate": 2.7045454545454545e-06, "loss": 0.5718, "step": 120 }, { "epoch": 0.029591691563522548, "grad_norm": 0.6058495976490267, "learning_rate": 2.931818181818182e-06, "loss": 0.5594, "step": 130 }, { "epoch": 0.031867975529947364, "grad_norm": 0.681338176363733, "learning_rate": 3.1590909090909094e-06, "loss": 0.564, "step": 140 }, { "epoch": 0.034144259496372176, "grad_norm": 0.7273342639401195, "learning_rate": 3.3863636363636364e-06, "loss": 0.5657, "step": 150 }, { "epoch": 0.03642054346279698, "grad_norm": 0.6489466886504726, "learning_rate": 3.6136363636363643e-06, "loss": 0.5484, "step": 160 }, { "epoch": 0.03869682742922179, "grad_norm": 0.6973554298115421, "learning_rate": 3.840909090909091e-06, "loss": 0.5445, "step": 170 }, { "epoch": 0.040973111395646605, "grad_norm": 0.6394889437481598, "learning_rate": 4.068181818181818e-06, "loss": 0.548, "step": 180 }, { "epoch": 0.04324939536207142, "grad_norm": 0.6430348845599891, "learning_rate": 4.295454545454546e-06, "loss": 0.5481, "step": 190 }, { "epoch": 0.04552567932849623, "grad_norm": 0.7221066666159676, "learning_rate": 4.522727272727273e-06, "loss": 0.5492, "step": 200 }, { "epoch": 0.04780196329492104, "grad_norm": 0.6814488809728508, "learning_rate": 4.75e-06, "loss": 0.5429, "step": 210 }, { "epoch": 0.050078247261345854, "grad_norm": 0.6336335480126656, "learning_rate": 4.977272727272728e-06, "loss": 0.5323, "step": 220 }, { "epoch": 0.052354531227770666, "grad_norm": 0.7343888412812761, "learning_rate": 5.204545454545455e-06, "loss": 0.5483, "step": 230 }, { "epoch": 0.05463081519419548, "grad_norm": 0.8008844122645815, "learning_rate": 5.431818181818182e-06, "loss": 0.5286, "step": 240 }, { "epoch": 0.05690709916062029, "grad_norm": 0.7508926850083371, "learning_rate": 5.65909090909091e-06, "loss": 0.5387, "step": 250 }, { "epoch": 0.059183383127045096, "grad_norm": 0.6226438437116334, "learning_rate": 5.886363636363637e-06, "loss": 0.5311, "step": 260 }, { "epoch": 0.06145966709346991, "grad_norm": 0.6829241845904538, "learning_rate": 6.113636363636364e-06, "loss": 0.5325, "step": 270 }, { "epoch": 0.06373595105989473, "grad_norm": 0.7361115947795334, "learning_rate": 6.340909090909091e-06, "loss": 0.5344, "step": 280 }, { "epoch": 0.06601223502631953, "grad_norm": 0.7763867544917724, "learning_rate": 6.568181818181819e-06, "loss": 0.5255, "step": 290 }, { "epoch": 0.06828851899274435, "grad_norm": 0.7805934170558461, "learning_rate": 6.795454545454546e-06, "loss": 0.5237, "step": 300 }, { "epoch": 0.07056480295916916, "grad_norm": 0.8252464031314231, "learning_rate": 7.022727272727273e-06, "loss": 0.5242, "step": 310 }, { "epoch": 0.07284108692559396, "grad_norm": 0.7334894747403866, "learning_rate": 7.25e-06, "loss": 0.521, "step": 320 }, { "epoch": 0.07511737089201878, "grad_norm": 0.6846204422508828, "learning_rate": 7.477272727272727e-06, "loss": 0.5229, "step": 330 }, { "epoch": 0.07739365485844359, "grad_norm": 0.8052410281091429, "learning_rate": 7.704545454545456e-06, "loss": 0.5262, "step": 340 }, { "epoch": 0.0796699388248684, "grad_norm": 0.6991957963444353, "learning_rate": 7.931818181818182e-06, "loss": 0.5335, "step": 350 }, { "epoch": 0.08194622279129321, "grad_norm": 0.7125373155196583, "learning_rate": 8.15909090909091e-06, "loss": 0.5125, "step": 360 }, { "epoch": 0.08422250675771803, "grad_norm": 0.7997954899533661, "learning_rate": 8.386363636363638e-06, "loss": 0.5121, "step": 370 }, { "epoch": 0.08649879072414283, "grad_norm": 0.7683875875855587, "learning_rate": 8.613636363636364e-06, "loss": 0.5121, "step": 380 }, { "epoch": 0.08877507469056765, "grad_norm": 0.67604209965691, "learning_rate": 8.840909090909092e-06, "loss": 0.5124, "step": 390 }, { "epoch": 0.09105135865699246, "grad_norm": 0.8371326669517628, "learning_rate": 9.06818181818182e-06, "loss": 0.5253, "step": 400 }, { "epoch": 0.09332764262341726, "grad_norm": 0.8604199705110958, "learning_rate": 9.295454545454546e-06, "loss": 0.5057, "step": 410 }, { "epoch": 0.09560392658984208, "grad_norm": 0.7001685508024218, "learning_rate": 9.522727272727274e-06, "loss": 0.512, "step": 420 }, { "epoch": 0.09788021055626689, "grad_norm": 0.746358066866932, "learning_rate": 9.75e-06, "loss": 0.5152, "step": 430 }, { "epoch": 0.10015649452269171, "grad_norm": 0.7485703568589076, "learning_rate": 9.977272727272728e-06, "loss": 0.51, "step": 440 }, { "epoch": 0.10243277848911651, "grad_norm": 0.9902309913638483, "learning_rate": 9.999872165053986e-06, "loss": 0.5156, "step": 450 }, { "epoch": 0.10470906245554133, "grad_norm": 0.9153541257496008, "learning_rate": 9.999430274867309e-06, "loss": 0.5059, "step": 460 }, { "epoch": 0.10698534642196614, "grad_norm": 0.6739494150459265, "learning_rate": 9.998672779119897e-06, "loss": 0.4995, "step": 470 }, { "epoch": 0.10926163038839096, "grad_norm": 0.7147898292959126, "learning_rate": 9.997599725631174e-06, "loss": 0.5021, "step": 480 }, { "epoch": 0.11153791435481576, "grad_norm": 0.7526289283194069, "learning_rate": 9.996211182141184e-06, "loss": 0.5113, "step": 490 }, { "epoch": 0.11381419832124058, "grad_norm": 0.8898690822057496, "learning_rate": 9.994507236306327e-06, "loss": 0.5081, "step": 500 }, { "epoch": 0.11609048228766539, "grad_norm": 0.8227051185656225, "learning_rate": 9.99248799569382e-06, "loss": 0.5034, "step": 510 }, { "epoch": 0.11836676625409019, "grad_norm": 0.6375442627559045, "learning_rate": 9.990153587774895e-06, "loss": 0.5021, "step": 520 }, { "epoch": 0.12064305022051501, "grad_norm": 0.6931479406599156, "learning_rate": 9.98750415991677e-06, "loss": 0.5044, "step": 530 }, { "epoch": 0.12291933418693982, "grad_norm": 0.8097527394823517, "learning_rate": 9.984539879373335e-06, "loss": 0.5088, "step": 540 }, { "epoch": 0.12519561815336464, "grad_norm": 0.8559434520312957, "learning_rate": 9.981260933274597e-06, "loss": 0.5111, "step": 550 }, { "epoch": 0.12747190211978945, "grad_norm": 0.6767290786737241, "learning_rate": 9.977667528614869e-06, "loss": 0.4952, "step": 560 }, { "epoch": 0.12974818608621425, "grad_norm": 0.6584609354184026, "learning_rate": 9.973759892239696e-06, "loss": 0.4916, "step": 570 }, { "epoch": 0.13202447005263906, "grad_norm": 1.0067522800390891, "learning_rate": 9.969538270831538e-06, "loss": 0.4938, "step": 580 }, { "epoch": 0.13430075401906388, "grad_norm": 0.7842725791926335, "learning_rate": 9.9650029308942e-06, "loss": 0.5031, "step": 590 }, { "epoch": 0.1365770379854887, "grad_norm": 0.7213504076724643, "learning_rate": 9.960154158736011e-06, "loss": 0.4987, "step": 600 }, { "epoch": 0.1388533219519135, "grad_norm": 0.7639458215235444, "learning_rate": 9.954992260451737e-06, "loss": 0.4916, "step": 610 }, { "epoch": 0.1411296059183383, "grad_norm": 0.6736103272836254, "learning_rate": 9.949517561903268e-06, "loss": 0.4953, "step": 620 }, { "epoch": 0.14340588988476313, "grad_norm": 0.7206570170845565, "learning_rate": 9.943730408699047e-06, "loss": 0.4949, "step": 630 }, { "epoch": 0.14568217385118792, "grad_norm": 0.7325803766368787, "learning_rate": 9.937631166172248e-06, "loss": 0.5015, "step": 640 }, { "epoch": 0.14795845781761274, "grad_norm": 0.7277234400694914, "learning_rate": 9.931220219357714e-06, "loss": 0.5065, "step": 650 }, { "epoch": 0.15023474178403756, "grad_norm": 6.746710290505448, "learning_rate": 9.924497972967652e-06, "loss": 0.4918, "step": 660 }, { "epoch": 0.15251102575046238, "grad_norm": 0.6381821044001871, "learning_rate": 9.91746485136609e-06, "loss": 0.4842, "step": 670 }, { "epoch": 0.15478730971688717, "grad_norm": 0.6894734847882242, "learning_rate": 9.91012129854207e-06, "loss": 0.4874, "step": 680 }, { "epoch": 0.157063593683312, "grad_norm": 0.7239229961207453, "learning_rate": 9.90246777808164e-06, "loss": 0.4817, "step": 690 }, { "epoch": 0.1593398776497368, "grad_norm": 0.6946825414002048, "learning_rate": 9.894504773138573e-06, "loss": 0.5014, "step": 700 }, { "epoch": 0.16161616161616163, "grad_norm": 0.7364735544657125, "learning_rate": 9.88623278640388e-06, "loss": 0.4861, "step": 710 }, { "epoch": 0.16389244558258642, "grad_norm": 0.6496771859407187, "learning_rate": 9.877652340074063e-06, "loss": 0.4892, "step": 720 }, { "epoch": 0.16616872954901124, "grad_norm": 0.6578218852464074, "learning_rate": 9.868763975818156e-06, "loss": 0.4866, "step": 730 }, { "epoch": 0.16844501351543606, "grad_norm": 0.8508879887579466, "learning_rate": 9.859568254743535e-06, "loss": 0.4986, "step": 740 }, { "epoch": 0.17072129748186085, "grad_norm": 0.783429883650226, "learning_rate": 9.850065757360485e-06, "loss": 0.4988, "step": 750 }, { "epoch": 0.17299758144828567, "grad_norm": 0.8186071721692408, "learning_rate": 9.840257083545562e-06, "loss": 0.4818, "step": 760 }, { "epoch": 0.1752738654147105, "grad_norm": 0.710423604497616, "learning_rate": 9.83014285250372e-06, "loss": 0.4888, "step": 770 }, { "epoch": 0.1775501493811353, "grad_norm": 0.6800493756535392, "learning_rate": 9.81972370272923e-06, "loss": 0.4863, "step": 780 }, { "epoch": 0.1798264333475601, "grad_norm": 0.7168752838596176, "learning_rate": 9.809000291965354e-06, "loss": 0.4911, "step": 790 }, { "epoch": 0.18210271731398492, "grad_norm": 0.7743139671538918, "learning_rate": 9.797973297162842e-06, "loss": 0.4967, "step": 800 }, { "epoch": 0.18437900128040974, "grad_norm": 0.6820565783740362, "learning_rate": 9.78664341443719e-06, "loss": 0.4766, "step": 810 }, { "epoch": 0.18665528524683453, "grad_norm": 0.745509395049587, "learning_rate": 9.775011359024692e-06, "loss": 0.4831, "step": 820 }, { "epoch": 0.18893156921325935, "grad_norm": 0.6587611779174343, "learning_rate": 9.763077865237293e-06, "loss": 0.486, "step": 830 }, { "epoch": 0.19120785317968417, "grad_norm": 0.7570644152194894, "learning_rate": 9.750843686416233e-06, "loss": 0.4876, "step": 840 }, { "epoch": 0.193484137146109, "grad_norm": 0.7438169073992149, "learning_rate": 9.738309594884489e-06, "loss": 0.498, "step": 850 }, { "epoch": 0.19576042111253378, "grad_norm": 0.8075454984835392, "learning_rate": 9.725476381898018e-06, "loss": 0.4761, "step": 860 }, { "epoch": 0.1980367050789586, "grad_norm": 0.7687120756908921, "learning_rate": 9.712344857595804e-06, "loss": 0.4735, "step": 870 }, { "epoch": 0.20031298904538342, "grad_norm": 0.6692318540483727, "learning_rate": 9.698915850948725e-06, "loss": 0.4796, "step": 880 }, { "epoch": 0.20258927301180824, "grad_norm": 0.7357514918311532, "learning_rate": 9.685190209707214e-06, "loss": 0.4881, "step": 890 }, { "epoch": 0.20486555697823303, "grad_norm": 0.7672275789241998, "learning_rate": 9.67116880034774e-06, "loss": 0.4857, "step": 900 }, { "epoch": 0.20714184094465785, "grad_norm": 0.7135300642695123, "learning_rate": 9.656852508018111e-06, "loss": 0.4791, "step": 910 }, { "epoch": 0.20941812491108266, "grad_norm": 0.8360299716625731, "learning_rate": 9.642242236481604e-06, "loss": 0.4849, "step": 920 }, { "epoch": 0.21169440887750746, "grad_norm": 0.8029294625940537, "learning_rate": 9.6273389080599e-06, "loss": 0.4797, "step": 930 }, { "epoch": 0.21397069284393228, "grad_norm": 0.7293278863253679, "learning_rate": 9.612143463574866e-06, "loss": 0.4822, "step": 940 }, { "epoch": 0.2162469768103571, "grad_norm": 0.7388020334147711, "learning_rate": 9.596656862289158e-06, "loss": 0.4918, "step": 950 }, { "epoch": 0.2185232607767819, "grad_norm": 0.6641555373860104, "learning_rate": 9.580880081845674e-06, "loss": 0.4776, "step": 960 }, { "epoch": 0.2207995447432067, "grad_norm": 0.8539043098487583, "learning_rate": 9.564814118205825e-06, "loss": 0.48, "step": 970 }, { "epoch": 0.22307582870963152, "grad_norm": 0.7736828871091971, "learning_rate": 9.548459985586668e-06, "loss": 0.4819, "step": 980 }, { "epoch": 0.22535211267605634, "grad_norm": 0.7761396197941033, "learning_rate": 9.531818716396879e-06, "loss": 0.4874, "step": 990 }, { "epoch": 0.22762839664248116, "grad_norm": 0.8766761552909957, "learning_rate": 9.514891361171584e-06, "loss": 0.477, "step": 1000 }, { "epoch": 0.22990468060890595, "grad_norm": 0.850710977709625, "learning_rate": 9.497678988506027e-06, "loss": 0.4809, "step": 1010 }, { "epoch": 0.23218096457533077, "grad_norm": 0.7569136767466481, "learning_rate": 9.480182684988128e-06, "loss": 0.4798, "step": 1020 }, { "epoch": 0.2344572485417556, "grad_norm": 1.0189056868147057, "learning_rate": 9.462403555129875e-06, "loss": 0.4799, "step": 1030 }, { "epoch": 0.23673353250818038, "grad_norm": 0.7004478108183485, "learning_rate": 9.444342721297607e-06, "loss": 0.4786, "step": 1040 }, { "epoch": 0.2390098164746052, "grad_norm": 0.8359707826058439, "learning_rate": 9.426001323641156e-06, "loss": 0.4715, "step": 1050 }, { "epoch": 0.24128610044103002, "grad_norm": 0.6888590261970808, "learning_rate": 9.40738052002187e-06, "loss": 0.4682, "step": 1060 }, { "epoch": 0.24356238440745484, "grad_norm": 0.7568931250342633, "learning_rate": 9.388481485939532e-06, "loss": 0.4746, "step": 1070 }, { "epoch": 0.24583866837387963, "grad_norm": 0.7349584513921489, "learning_rate": 9.369305414458128e-06, "loss": 0.4763, "step": 1080 }, { "epoch": 0.24811495234030445, "grad_norm": 0.8343501997237606, "learning_rate": 9.349853516130556e-06, "loss": 0.484, "step": 1090 }, { "epoch": 0.25039123630672927, "grad_norm": 0.7193295893335135, "learning_rate": 9.330127018922195e-06, "loss": 0.4834, "step": 1100 }, { "epoch": 0.2526675202731541, "grad_norm": 0.7580309140105336, "learning_rate": 9.310127168133378e-06, "loss": 0.4812, "step": 1110 }, { "epoch": 0.2549438042395789, "grad_norm": 0.7613918214537019, "learning_rate": 9.289855226320796e-06, "loss": 0.4727, "step": 1120 }, { "epoch": 0.25722008820600367, "grad_norm": 0.725239008063755, "learning_rate": 9.269312473217777e-06, "loss": 0.4803, "step": 1130 }, { "epoch": 0.2594963721724285, "grad_norm": 0.8482520214554938, "learning_rate": 9.248500205653518e-06, "loss": 0.4745, "step": 1140 }, { "epoch": 0.2617726561388533, "grad_norm": 0.7908430286997805, "learning_rate": 9.22741973747119e-06, "loss": 0.471, "step": 1150 }, { "epoch": 0.26404894010527813, "grad_norm": 0.7036666630426738, "learning_rate": 9.20607239944503e-06, "loss": 0.4676, "step": 1160 }, { "epoch": 0.26632522407170295, "grad_norm": 0.7263755674894325, "learning_rate": 9.18445953919631e-06, "loss": 0.4738, "step": 1170 }, { "epoch": 0.26860150803812777, "grad_norm": 0.7487742930473097, "learning_rate": 9.16258252110827e-06, "loss": 0.4749, "step": 1180 }, { "epoch": 0.2708777920045526, "grad_norm": 0.7931436305160696, "learning_rate": 9.140442726239986e-06, "loss": 0.4739, "step": 1190 }, { "epoch": 0.2731540759709774, "grad_norm": 0.7971178665228899, "learning_rate": 9.118041552239187e-06, "loss": 0.4715, "step": 1200 }, { "epoch": 0.27543035993740217, "grad_norm": 0.7352453908333937, "learning_rate": 9.095380413254029e-06, "loss": 0.4735, "step": 1210 }, { "epoch": 0.277706643903827, "grad_norm": 0.6686848966506087, "learning_rate": 9.072460739843807e-06, "loss": 0.4701, "step": 1220 }, { "epoch": 0.2799829278702518, "grad_norm": 0.7632913563075477, "learning_rate": 9.049283978888665e-06, "loss": 0.4709, "step": 1230 }, { "epoch": 0.2822592118366766, "grad_norm": 0.9126191629516692, "learning_rate": 9.025851593498245e-06, "loss": 0.4812, "step": 1240 }, { "epoch": 0.28453549580310145, "grad_norm": 0.7586654591086177, "learning_rate": 9.002165062919321e-06, "loss": 0.4759, "step": 1250 }, { "epoch": 0.28681177976952626, "grad_norm": 0.6020016779620824, "learning_rate": 8.978225882442431e-06, "loss": 0.4585, "step": 1260 }, { "epoch": 0.2890880637359511, "grad_norm": 0.6918094336131744, "learning_rate": 8.95403556330747e-06, "loss": 0.4734, "step": 1270 }, { "epoch": 0.29136434770237585, "grad_norm": 0.8432310682535402, "learning_rate": 8.929595632608286e-06, "loss": 0.4657, "step": 1280 }, { "epoch": 0.29364063166880067, "grad_norm": 0.7968867596780567, "learning_rate": 8.904907633196287e-06, "loss": 0.4689, "step": 1290 }, { "epoch": 0.2959169156352255, "grad_norm": 0.8975405798229933, "learning_rate": 8.879973123583041e-06, "loss": 0.4742, "step": 1300 }, { "epoch": 0.2981931996016503, "grad_norm": 0.8375787090323347, "learning_rate": 8.854793677841878e-06, "loss": 0.4679, "step": 1310 }, { "epoch": 0.3004694835680751, "grad_norm": 0.7641222076424216, "learning_rate": 8.829370885508538e-06, "loss": 0.4668, "step": 1320 }, { "epoch": 0.30274576753449994, "grad_norm": 0.7968284482238742, "learning_rate": 8.803706351480819e-06, "loss": 0.4621, "step": 1330 }, { "epoch": 0.30502205150092476, "grad_norm": 0.7504856443013833, "learning_rate": 8.777801695917257e-06, "loss": 0.4638, "step": 1340 }, { "epoch": 0.3072983354673495, "grad_norm": 0.7386127839372575, "learning_rate": 8.751658554134861e-06, "loss": 0.472, "step": 1350 }, { "epoch": 0.30957461943377435, "grad_norm": 0.7519795926557973, "learning_rate": 8.725278576505865e-06, "loss": 0.463, "step": 1360 }, { "epoch": 0.31185090340019916, "grad_norm": 0.6860694902316768, "learning_rate": 8.698663428353551e-06, "loss": 0.469, "step": 1370 }, { "epoch": 0.314127187366624, "grad_norm": 0.84686290617533, "learning_rate": 8.671814789847116e-06, "loss": 0.4727, "step": 1380 }, { "epoch": 0.3164034713330488, "grad_norm": 0.7119241552397302, "learning_rate": 8.64473435589561e-06, "loss": 0.4706, "step": 1390 }, { "epoch": 0.3186797552994736, "grad_norm": 0.7745989228725879, "learning_rate": 8.617423836040937e-06, "loss": 0.4679, "step": 1400 }, { "epoch": 0.32095603926589844, "grad_norm": 0.7227304748917153, "learning_rate": 8.589884954349928e-06, "loss": 0.4738, "step": 1410 }, { "epoch": 0.32323232323232326, "grad_norm": 1.228623655278939, "learning_rate": 8.562119449305517e-06, "loss": 0.4648, "step": 1420 }, { "epoch": 0.325508607198748, "grad_norm": 0.6887447415599827, "learning_rate": 8.534129073696984e-06, "loss": 0.4707, "step": 1430 }, { "epoch": 0.32778489116517284, "grad_norm": 0.7108435007854059, "learning_rate": 8.505915594509304e-06, "loss": 0.4633, "step": 1440 }, { "epoch": 0.33006117513159766, "grad_norm": 0.7768742017040517, "learning_rate": 8.477480792811607e-06, "loss": 0.466, "step": 1450 }, { "epoch": 0.3323374590980225, "grad_norm": 0.9552934171248785, "learning_rate": 8.448826463644733e-06, "loss": 0.4615, "step": 1460 }, { "epoch": 0.3346137430644473, "grad_norm": 0.7969014798994358, "learning_rate": 8.419954415907925e-06, "loss": 0.4685, "step": 1470 }, { "epoch": 0.3368900270308721, "grad_norm": 0.8468048307712729, "learning_rate": 8.390866472244624e-06, "loss": 0.4599, "step": 1480 }, { "epoch": 0.33916631099729694, "grad_norm": 0.8691044067047762, "learning_rate": 8.36156446892742e-06, "loss": 0.4722, "step": 1490 }, { "epoch": 0.3414425949637217, "grad_norm": 0.9380457631209852, "learning_rate": 8.332050255742126e-06, "loss": 0.4741, "step": 1500 }, { "epoch": 0.3437188789301465, "grad_norm": 0.8859250973757697, "learning_rate": 8.302325695871e-06, "loss": 0.4621, "step": 1510 }, { "epoch": 0.34599516289657134, "grad_norm": 0.7488919862243867, "learning_rate": 8.272392665775132e-06, "loss": 0.4604, "step": 1520 }, { "epoch": 0.34827144686299616, "grad_norm": 0.9715182317776861, "learning_rate": 8.242253055075989e-06, "loss": 0.463, "step": 1530 }, { "epoch": 0.350547730829421, "grad_norm": 0.9442966262864598, "learning_rate": 8.211908766436114e-06, "loss": 0.4599, "step": 1540 }, { "epoch": 0.3528240147958458, "grad_norm": 0.885636304605366, "learning_rate": 8.181361715439023e-06, "loss": 0.4753, "step": 1550 }, { "epoch": 0.3551002987622706, "grad_norm": 0.8187381110333617, "learning_rate": 8.15061383046828e-06, "loss": 0.468, "step": 1560 }, { "epoch": 0.3573765827286954, "grad_norm": 0.7537652164973274, "learning_rate": 8.119667052585753e-06, "loss": 0.4591, "step": 1570 }, { "epoch": 0.3596528666951202, "grad_norm": 0.8597815188650263, "learning_rate": 8.088523335409086e-06, "loss": 0.4562, "step": 1580 }, { "epoch": 0.361929150661545, "grad_norm": 0.745664972159197, "learning_rate": 8.057184644988363e-06, "loss": 0.4603, "step": 1590 }, { "epoch": 0.36420543462796984, "grad_norm": 0.8238816020218861, "learning_rate": 8.025652959682004e-06, "loss": 0.4677, "step": 1600 }, { "epoch": 0.36648171859439466, "grad_norm": 0.7416721413551182, "learning_rate": 7.993930270031863e-06, "loss": 0.4619, "step": 1610 }, { "epoch": 0.3687580025608195, "grad_norm": 0.7143674127418036, "learning_rate": 7.962018578637578e-06, "loss": 0.4629, "step": 1620 }, { "epoch": 0.3710342865272443, "grad_norm": 0.7454781957945812, "learning_rate": 7.929919900030147e-06, "loss": 0.4645, "step": 1630 }, { "epoch": 0.37331057049366906, "grad_norm": 0.763351971097948, "learning_rate": 7.897636260544752e-06, "loss": 0.4619, "step": 1640 }, { "epoch": 0.3755868544600939, "grad_norm": 0.7405410645904156, "learning_rate": 7.865169698192842e-06, "loss": 0.4628, "step": 1650 }, { "epoch": 0.3778631384265187, "grad_norm": 0.7347060655230769, "learning_rate": 7.832522262533481e-06, "loss": 0.4649, "step": 1660 }, { "epoch": 0.3801394223929435, "grad_norm": 0.7023247967552008, "learning_rate": 7.799696014543949e-06, "loss": 0.4593, "step": 1670 }, { "epoch": 0.38241570635936833, "grad_norm": 0.9339187019341834, "learning_rate": 7.766693026489655e-06, "loss": 0.4541, "step": 1680 }, { "epoch": 0.38469199032579315, "grad_norm": 0.7809209531769312, "learning_rate": 7.733515381793305e-06, "loss": 0.4653, "step": 1690 }, { "epoch": 0.386968274292218, "grad_norm": 0.8161647769578234, "learning_rate": 7.70016517490338e-06, "loss": 0.4643, "step": 1700 }, { "epoch": 0.3892445582586428, "grad_norm": 0.8219287065471175, "learning_rate": 7.666644511161925e-06, "loss": 0.4573, "step": 1710 }, { "epoch": 0.39152084222506756, "grad_norm": 0.730307940342026, "learning_rate": 7.632955506671633e-06, "loss": 0.4587, "step": 1720 }, { "epoch": 0.3937971261914924, "grad_norm": 0.7616849447054718, "learning_rate": 7.599100288162267e-06, "loss": 0.462, "step": 1730 }, { "epoch": 0.3960734101579172, "grad_norm": 0.8087396986104316, "learning_rate": 7.565080992856393e-06, "loss": 0.4626, "step": 1740 }, { "epoch": 0.398349694124342, "grad_norm": 0.725916588847792, "learning_rate": 7.530899768334476e-06, "loss": 0.4679, "step": 1750 }, { "epoch": 0.40062597809076683, "grad_norm": 0.8356414931081636, "learning_rate": 7.496558772399289e-06, "loss": 0.4562, "step": 1760 }, { "epoch": 0.40290226205719165, "grad_norm": 0.8552053129727125, "learning_rate": 7.462060172939711e-06, "loss": 0.4593, "step": 1770 }, { "epoch": 0.40517854602361647, "grad_norm": 0.7085366390905654, "learning_rate": 7.427406147793861e-06, "loss": 0.4641, "step": 1780 }, { "epoch": 0.40745482999004123, "grad_norm": 0.858819978106255, "learning_rate": 7.392598884611617e-06, "loss": 0.4595, "step": 1790 }, { "epoch": 0.40973111395646605, "grad_norm": 0.8915575128268939, "learning_rate": 7.357640580716516e-06, "loss": 0.4609, "step": 1800 }, { "epoch": 0.4120073979228909, "grad_norm": 0.8410874583168452, "learning_rate": 7.32253344296704e-06, "loss": 0.4519, "step": 1810 }, { "epoch": 0.4142836818893157, "grad_norm": 0.7553762047323214, "learning_rate": 7.2872796876173e-06, "loss": 0.4509, "step": 1820 }, { "epoch": 0.4165599658557405, "grad_norm": 0.7579109862136422, "learning_rate": 7.251881540177125e-06, "loss": 0.4639, "step": 1830 }, { "epoch": 0.41883624982216533, "grad_norm": 0.8596157431482481, "learning_rate": 7.2163412352715745e-06, "loss": 0.4665, "step": 1840 }, { "epoch": 0.42111253378859015, "grad_norm": 0.8318429397770328, "learning_rate": 7.180661016499868e-06, "loss": 0.46, "step": 1850 }, { "epoch": 0.4233888177550149, "grad_norm": 0.6921114545354697, "learning_rate": 7.144843136293746e-06, "loss": 0.4578, "step": 1860 }, { "epoch": 0.42566510172143973, "grad_norm": 0.6714500243037865, "learning_rate": 7.108889855775289e-06, "loss": 0.4507, "step": 1870 }, { "epoch": 0.42794138568786455, "grad_norm": 0.7710332319490364, "learning_rate": 7.0728034446141654e-06, "loss": 0.4571, "step": 1880 }, { "epoch": 0.43021766965428937, "grad_norm": 0.8446602836780411, "learning_rate": 7.036586180884357e-06, "loss": 0.4707, "step": 1890 }, { "epoch": 0.4324939536207142, "grad_norm": 0.7674212691654148, "learning_rate": 7.000240350920344e-06, "loss": 0.462, "step": 1900 }, { "epoch": 0.434770237587139, "grad_norm": 0.8556511437021528, "learning_rate": 6.96376824917278e-06, "loss": 0.4533, "step": 1910 }, { "epoch": 0.4370465215535638, "grad_norm": 0.8078502008317322, "learning_rate": 6.927172178063636e-06, "loss": 0.4608, "step": 1920 }, { "epoch": 0.43932280551998865, "grad_norm": 0.7150517188869988, "learning_rate": 6.890454447840862e-06, "loss": 0.4535, "step": 1930 }, { "epoch": 0.4415990894864134, "grad_norm": 0.7523733013230715, "learning_rate": 6.853617376432542e-06, "loss": 0.4624, "step": 1940 }, { "epoch": 0.44387537345283823, "grad_norm": 0.9728144367431893, "learning_rate": 6.816663289300567e-06, "loss": 0.4597, "step": 1950 }, { "epoch": 0.44615165741926305, "grad_norm": 0.7177459968030785, "learning_rate": 6.779594519293833e-06, "loss": 0.4607, "step": 1960 }, { "epoch": 0.44842794138568787, "grad_norm": 0.8088528853225262, "learning_rate": 6.742413406500967e-06, "loss": 0.4629, "step": 1970 }, { "epoch": 0.4507042253521127, "grad_norm": 0.9175593683030001, "learning_rate": 6.705122298102611e-06, "loss": 0.4636, "step": 1980 }, { "epoch": 0.4529805093185375, "grad_norm": 0.7621550740057926, "learning_rate": 6.667723548223241e-06, "loss": 0.4704, "step": 1990 }, { "epoch": 0.4552567932849623, "grad_norm": 0.7894831335092004, "learning_rate": 6.630219517782557e-06, "loss": 0.4665, "step": 2000 }, { "epoch": 0.4575330772513871, "grad_norm": 0.7289053283342039, "learning_rate": 6.592612574346442e-06, "loss": 0.4496, "step": 2010 }, { "epoch": 0.4598093612178119, "grad_norm": 0.7268790659257058, "learning_rate": 6.554905091977506e-06, "loss": 0.4529, "step": 2020 }, { "epoch": 0.4620856451842367, "grad_norm": 0.7436765967755353, "learning_rate": 6.5170994510852035e-06, "loss": 0.4548, "step": 2030 }, { "epoch": 0.46436192915066155, "grad_norm": 0.7695864257948672, "learning_rate": 6.479198038275578e-06, "loss": 0.4539, "step": 2040 }, { "epoch": 0.46663821311708636, "grad_norm": 0.8295487862322445, "learning_rate": 6.441203246200587e-06, "loss": 0.4634, "step": 2050 }, { "epoch": 0.4689144970835112, "grad_norm": 0.8241912559218441, "learning_rate": 6.403117473407065e-06, "loss": 0.4496, "step": 2060 }, { "epoch": 0.471190781049936, "grad_norm": 0.7076343515274734, "learning_rate": 6.364943124185308e-06, "loss": 0.4497, "step": 2070 }, { "epoch": 0.47346706501636077, "grad_norm": 0.8310935730693106, "learning_rate": 6.3266826084172835e-06, "loss": 0.4648, "step": 2080 }, { "epoch": 0.4757433489827856, "grad_norm": 0.7704280812181517, "learning_rate": 6.288338341424515e-06, "loss": 0.455, "step": 2090 }, { "epoch": 0.4780196329492104, "grad_norm": 0.8181658490570038, "learning_rate": 6.249912743815595e-06, "loss": 0.4596, "step": 2100 }, { "epoch": 0.4802959169156352, "grad_norm": 0.7431508637124334, "learning_rate": 6.211408241333379e-06, "loss": 0.4538, "step": 2110 }, { "epoch": 0.48257220088206004, "grad_norm": 0.709619776250267, "learning_rate": 6.172827264701857e-06, "loss": 0.4537, "step": 2120 }, { "epoch": 0.48484848484848486, "grad_norm": 0.8223387777060188, "learning_rate": 6.134172249472702e-06, "loss": 0.4551, "step": 2130 }, { "epoch": 0.4871247688149097, "grad_norm": 0.7768854003850929, "learning_rate": 6.095445635871516e-06, "loss": 0.4657, "step": 2140 }, { "epoch": 0.48940105278133444, "grad_norm": 0.769025632798714, "learning_rate": 6.0566498686437855e-06, "loss": 0.4557, "step": 2150 }, { "epoch": 0.49167733674775926, "grad_norm": 0.6692137861485721, "learning_rate": 6.0177873969005475e-06, "loss": 0.4563, "step": 2160 }, { "epoch": 0.4939536207141841, "grad_norm": 0.8907000112703206, "learning_rate": 5.978860673963784e-06, "loss": 0.4548, "step": 2170 }, { "epoch": 0.4962299046806089, "grad_norm": 0.8129551036581305, "learning_rate": 5.939872157211545e-06, "loss": 0.4501, "step": 2180 }, { "epoch": 0.4985061886470337, "grad_norm": 0.9050604723863666, "learning_rate": 5.900824307922819e-06, "loss": 0.4529, "step": 2190 }, { "epoch": 0.5007824726134585, "grad_norm": 0.9464291168776866, "learning_rate": 5.861719591122158e-06, "loss": 0.4597, "step": 2200 }, { "epoch": 0.5030587565798833, "grad_norm": 0.7629842734151758, "learning_rate": 5.8225604754240635e-06, "loss": 0.4547, "step": 2210 }, { "epoch": 0.5053350405463082, "grad_norm": 0.737677886868225, "learning_rate": 5.783349432877146e-06, "loss": 0.4568, "step": 2220 }, { "epoch": 0.5076113245127329, "grad_norm": 0.7440218389833005, "learning_rate": 5.744088938808068e-06, "loss": 0.4554, "step": 2230 }, { "epoch": 0.5098876084791578, "grad_norm": 0.8000074715652351, "learning_rate": 5.70478147166529e-06, "loss": 0.4671, "step": 2240 }, { "epoch": 0.5121638924455826, "grad_norm": 0.9238644399016241, "learning_rate": 5.665429512862597e-06, "loss": 0.4574, "step": 2250 }, { "epoch": 0.5144401764120073, "grad_norm": 0.758841769369074, "learning_rate": 5.626035546622457e-06, "loss": 0.4558, "step": 2260 }, { "epoch": 0.5167164603784322, "grad_norm": 0.7971224800656472, "learning_rate": 5.586602059819199e-06, "loss": 0.4496, "step": 2270 }, { "epoch": 0.518992744344857, "grad_norm": 0.9171620412959115, "learning_rate": 5.547131541822018e-06, "loss": 0.4558, "step": 2280 }, { "epoch": 0.5212690283112819, "grad_norm": 0.7842020256066858, "learning_rate": 5.5076264843378225e-06, "loss": 0.4527, "step": 2290 }, { "epoch": 0.5235453122777066, "grad_norm": 0.9188201380044063, "learning_rate": 5.4680893812539436e-06, "loss": 0.4608, "step": 2300 }, { "epoch": 0.5258215962441315, "grad_norm": 0.7861154037939578, "learning_rate": 5.428522728480697e-06, "loss": 0.4523, "step": 2310 }, { "epoch": 0.5280978802105563, "grad_norm": 0.7920300857523709, "learning_rate": 5.388929023793817e-06, "loss": 0.4568, "step": 2320 }, { "epoch": 0.5303741641769811, "grad_norm": 0.7612825596142501, "learning_rate": 5.349310766676781e-06, "loss": 0.4483, "step": 2330 }, { "epoch": 0.5326504481434059, "grad_norm": 0.7537687250775554, "learning_rate": 5.3096704581630195e-06, "loss": 0.4563, "step": 2340 }, { "epoch": 0.5349267321098307, "grad_norm": 0.751390092998076, "learning_rate": 5.270010600678034e-06, "loss": 0.4578, "step": 2350 }, { "epoch": 0.5372030160762555, "grad_norm": 0.8063126059500658, "learning_rate": 5.230333697881413e-06, "loss": 0.4424, "step": 2360 }, { "epoch": 0.5394793000426803, "grad_norm": 0.7268784420755078, "learning_rate": 5.190642254508789e-06, "loss": 0.4488, "step": 2370 }, { "epoch": 0.5417555840091052, "grad_norm": 0.8219467384704279, "learning_rate": 5.15093877621372e-06, "loss": 0.4443, "step": 2380 }, { "epoch": 0.5440318679755299, "grad_norm": 0.9341715266763854, "learning_rate": 5.111225769409505e-06, "loss": 0.4563, "step": 2390 }, { "epoch": 0.5463081519419548, "grad_norm": 0.8890086015346076, "learning_rate": 5.071505741110958e-06, "loss": 0.4531, "step": 2400 }, { "epoch": 0.5485844359083796, "grad_norm": 0.7859192247678671, "learning_rate": 5.031781198776157e-06, "loss": 0.4448, "step": 2410 }, { "epoch": 0.5508607198748043, "grad_norm": 0.8457709944734434, "learning_rate": 4.9920546501481355e-06, "loss": 0.4502, "step": 2420 }, { "epoch": 0.5531370038412292, "grad_norm": 0.777261473128808, "learning_rate": 4.952328603096588e-06, "loss": 0.4493, "step": 2430 }, { "epoch": 0.555413287807654, "grad_norm": 0.7489538278905294, "learning_rate": 4.912605565459537e-06, "loss": 0.4532, "step": 2440 }, { "epoch": 0.5576895717740789, "grad_norm": 0.7471858030987701, "learning_rate": 4.872888044885031e-06, "loss": 0.4662, "step": 2450 }, { "epoch": 0.5599658557405036, "grad_norm": 0.76997073617317, "learning_rate": 4.833178548672836e-06, "loss": 0.449, "step": 2460 }, { "epoch": 0.5622421397069285, "grad_norm": 0.8114769999661829, "learning_rate": 4.793479583616152e-06, "loss": 0.4511, "step": 2470 }, { "epoch": 0.5645184236733533, "grad_norm": 0.7887812825481647, "learning_rate": 4.753793655843362e-06, "loss": 0.4531, "step": 2480 }, { "epoch": 0.566794707639778, "grad_norm": 0.8266580905214915, "learning_rate": 4.714123270659836e-06, "loss": 0.4499, "step": 2490 }, { "epoch": 0.5690709916062029, "grad_norm": 0.9772993025496673, "learning_rate": 4.674470932389759e-06, "loss": 0.462, "step": 2500 }, { "epoch": 0.5713472755726277, "grad_norm": 0.7550741577854698, "learning_rate": 4.634839144218047e-06, "loss": 0.4424, "step": 2510 }, { "epoch": 0.5736235595390525, "grad_norm": 0.8265913157075914, "learning_rate": 4.595230408032324e-06, "loss": 0.4468, "step": 2520 }, { "epoch": 0.5758998435054773, "grad_norm": 0.7723721397391996, "learning_rate": 4.555647224264978e-06, "loss": 0.4448, "step": 2530 }, { "epoch": 0.5781761274719022, "grad_norm": 0.7438678281440869, "learning_rate": 4.516092091735324e-06, "loss": 0.4537, "step": 2540 }, { "epoch": 0.5804524114383269, "grad_norm": 0.827117915360568, "learning_rate": 4.47656750749184e-06, "loss": 0.4558, "step": 2550 }, { "epoch": 0.5827286954047517, "grad_norm": 0.7273943037424042, "learning_rate": 4.4370759666545495e-06, "loss": 0.4444, "step": 2560 }, { "epoch": 0.5850049793711766, "grad_norm": 0.7703519904997088, "learning_rate": 4.397619962257498e-06, "loss": 0.4481, "step": 2570 }, { "epoch": 0.5872812633376013, "grad_norm": 0.803886348953792, "learning_rate": 4.3582019850913796e-06, "loss": 0.4487, "step": 2580 }, { "epoch": 0.5895575473040262, "grad_norm": 0.7300835048061479, "learning_rate": 4.3188245235462865e-06, "loss": 0.4446, "step": 2590 }, { "epoch": 0.591833831270451, "grad_norm": 0.8101791772935961, "learning_rate": 4.2794900634546385e-06, "loss": 0.4553, "step": 2600 }, { "epoch": 0.5941101152368758, "grad_norm": 0.8207823650264575, "learning_rate": 4.240201087934238e-06, "loss": 0.4511, "step": 2610 }, { "epoch": 0.5963863992033006, "grad_norm": 0.8176342960234186, "learning_rate": 4.200960077231528e-06, "loss": 0.4425, "step": 2620 }, { "epoch": 0.5986626831697254, "grad_norm": 0.916935818899542, "learning_rate": 4.161769508565012e-06, "loss": 0.4442, "step": 2630 }, { "epoch": 0.6009389671361502, "grad_norm": 0.7421424964297176, "learning_rate": 4.122631855968873e-06, "loss": 0.4509, "step": 2640 }, { "epoch": 0.603215251102575, "grad_norm": 0.9115282778731496, "learning_rate": 4.0835495901367955e-06, "loss": 0.455, "step": 2650 }, { "epoch": 0.6054915350689999, "grad_norm": 0.7816827264699414, "learning_rate": 4.0445251782659875e-06, "loss": 0.4381, "step": 2660 }, { "epoch": 0.6077678190354247, "grad_norm": 2.194076497168195, "learning_rate": 4.005561083901434e-06, "loss": 0.4447, "step": 2670 }, { "epoch": 0.6100441030018495, "grad_norm": 0.7573191196887984, "learning_rate": 3.966659766780383e-06, "loss": 0.4446, "step": 2680 }, { "epoch": 0.6123203869682743, "grad_norm": 0.7687574770334604, "learning_rate": 3.927823682677057e-06, "loss": 0.4496, "step": 2690 }, { "epoch": 0.614596670934699, "grad_norm": 0.9358386363240805, "learning_rate": 3.889055283247628e-06, "loss": 0.4568, "step": 2700 }, { "epoch": 0.6168729549011239, "grad_norm": 0.7314708296261042, "learning_rate": 3.850357015875456e-06, "loss": 0.4446, "step": 2710 }, { "epoch": 0.6191492388675487, "grad_norm": 0.8070457903866413, "learning_rate": 3.8117313235165754e-06, "loss": 0.4521, "step": 2720 }, { "epoch": 0.6214255228339736, "grad_norm": 0.7257119700751845, "learning_rate": 3.7731806445454856e-06, "loss": 0.4427, "step": 2730 }, { "epoch": 0.6237018068003983, "grad_norm": 0.7975212623445046, "learning_rate": 3.7347074126012195e-06, "loss": 0.4477, "step": 2740 }, { "epoch": 0.6259780907668232, "grad_norm": 0.7714694828216863, "learning_rate": 3.6963140564337074e-06, "loss": 0.4538, "step": 2750 }, { "epoch": 0.628254374733248, "grad_norm": 0.800630933191912, "learning_rate": 3.658002999750462e-06, "loss": 0.446, "step": 2760 }, { "epoch": 0.6305306586996727, "grad_norm": 0.7408683914326505, "learning_rate": 3.6197766610635656e-06, "loss": 0.446, "step": 2770 }, { "epoch": 0.6328069426660976, "grad_norm": 0.7609135405893662, "learning_rate": 3.5816374535369934e-06, "loss": 0.4416, "step": 2780 }, { "epoch": 0.6350832266325224, "grad_norm": 0.8061635946819576, "learning_rate": 3.543587784834288e-06, "loss": 0.4385, "step": 2790 }, { "epoch": 0.6373595105989472, "grad_norm": 0.8207487764566586, "learning_rate": 3.5056300569665503e-06, "loss": 0.4443, "step": 2800 }, { "epoch": 0.639635794565372, "grad_norm": 0.7780495002255654, "learning_rate": 3.4677666661408096e-06, "loss": 0.4393, "step": 2810 }, { "epoch": 0.6419120785317969, "grad_norm": 0.820529174759461, "learning_rate": 3.4300000026087664e-06, "loss": 0.448, "step": 2820 }, { "epoch": 0.6441883624982216, "grad_norm": 0.7890561874485035, "learning_rate": 3.392332450515886e-06, "loss": 0.4489, "step": 2830 }, { "epoch": 0.6464646464646465, "grad_norm": 0.8198574552313013, "learning_rate": 3.3547663877508928e-06, "loss": 0.4496, "step": 2840 }, { "epoch": 0.6487409304310713, "grad_norm": 0.8209595374102931, "learning_rate": 3.3173041857956716e-06, "loss": 0.442, "step": 2850 }, { "epoch": 0.651017214397496, "grad_norm": 0.8174067144646198, "learning_rate": 3.2799482095755424e-06, "loss": 0.4447, "step": 2860 }, { "epoch": 0.6532934983639209, "grad_norm": 0.7098988026439182, "learning_rate": 3.242700817309976e-06, "loss": 0.4429, "step": 2870 }, { "epoch": 0.6555697823303457, "grad_norm": 0.9250724312431224, "learning_rate": 3.205564360363724e-06, "loss": 0.4508, "step": 2880 }, { "epoch": 0.6578460662967706, "grad_norm": 0.9050078757133033, "learning_rate": 3.168541183098378e-06, "loss": 0.447, "step": 2890 }, { "epoch": 0.6601223502631953, "grad_norm": 0.7647575780260846, "learning_rate": 3.131633622724377e-06, "loss": 0.4521, "step": 2900 }, { "epoch": 0.6623986342296202, "grad_norm": 0.749840221225747, "learning_rate": 3.0948440091534594e-06, "loss": 0.438, "step": 2910 }, { "epoch": 0.664674918196045, "grad_norm": 0.7184423492925078, "learning_rate": 3.058174664851582e-06, "loss": 0.4465, "step": 2920 }, { "epoch": 0.6669512021624697, "grad_norm": 0.7788843341529811, "learning_rate": 3.0216279046923084e-06, "loss": 0.4427, "step": 2930 }, { "epoch": 0.6692274861288946, "grad_norm": 0.795166169097631, "learning_rate": 2.9852060358106717e-06, "loss": 0.4438, "step": 2940 }, { "epoch": 0.6715037700953194, "grad_norm": 0.8307138251372255, "learning_rate": 2.9489113574575272e-06, "loss": 0.4467, "step": 2950 }, { "epoch": 0.6737800540617442, "grad_norm": 0.7663349953487524, "learning_rate": 2.912746160854417e-06, "loss": 0.4491, "step": 2960 }, { "epoch": 0.676056338028169, "grad_norm": 0.7838493815092107, "learning_rate": 2.8767127290489084e-06, "loss": 0.438, "step": 2970 }, { "epoch": 0.6783326219945939, "grad_norm": 0.791824009698266, "learning_rate": 2.840813336770487e-06, "loss": 0.4372, "step": 2980 }, { "epoch": 0.6806089059610186, "grad_norm": 0.73879969079544, "learning_rate": 2.805050250286949e-06, "loss": 0.4514, "step": 2990 }, { "epoch": 0.6828851899274434, "grad_norm": 0.8099909989978515, "learning_rate": 2.769425727261339e-06, "loss": 0.4537, "step": 3000 }, { "epoch": 0.6851614738938683, "grad_norm": 0.7590973946118406, "learning_rate": 2.7339420166094183e-06, "loss": 0.4463, "step": 3010 }, { "epoch": 0.687437757860293, "grad_norm": 0.796201063821709, "learning_rate": 2.6986013583577083e-06, "loss": 0.4397, "step": 3020 }, { "epoch": 0.6897140418267179, "grad_norm": 0.7050687084934512, "learning_rate": 2.6634059835020733e-06, "loss": 0.4268, "step": 3030 }, { "epoch": 0.6919903257931427, "grad_norm": 1.0187421881981598, "learning_rate": 2.628358113866881e-06, "loss": 0.4438, "step": 3040 }, { "epoch": 0.6942666097595676, "grad_norm": 0.8443744661543358, "learning_rate": 2.5934599619647495e-06, "loss": 0.4512, "step": 3050 }, { "epoch": 0.6965428937259923, "grad_norm": 0.7864339330637931, "learning_rate": 2.558713730856862e-06, "loss": 0.4372, "step": 3060 }, { "epoch": 0.6988191776924171, "grad_norm": 0.8039029994902843, "learning_rate": 2.524121614013906e-06, "loss": 0.447, "step": 3070 }, { "epoch": 0.701095461658842, "grad_norm": 0.8554560338951394, "learning_rate": 2.4896857951775973e-06, "loss": 0.4418, "step": 3080 }, { "epoch": 0.7033717456252667, "grad_norm": 0.7789235728757384, "learning_rate": 2.455408448222814e-06, "loss": 0.4428, "step": 3090 }, { "epoch": 0.7056480295916916, "grad_norm": 0.8390767483792194, "learning_rate": 2.4212917370203877e-06, "loss": 0.4513, "step": 3100 }, { "epoch": 0.7079243135581164, "grad_norm": 0.7423511083655429, "learning_rate": 2.3873378153004736e-06, "loss": 0.4415, "step": 3110 }, { "epoch": 0.7102005975245412, "grad_norm": 0.7424046071658116, "learning_rate": 2.3535488265166095e-06, "loss": 0.4293, "step": 3120 }, { "epoch": 0.712476881490966, "grad_norm": 0.7456496661301177, "learning_rate": 2.319926903710398e-06, "loss": 0.4438, "step": 3130 }, { "epoch": 0.7147531654573908, "grad_norm": 0.8278781199522129, "learning_rate": 2.2864741693768423e-06, "loss": 0.4387, "step": 3140 }, { "epoch": 0.7170294494238156, "grad_norm": 0.8116784082715538, "learning_rate": 2.253192735330371e-06, "loss": 0.4462, "step": 3150 }, { "epoch": 0.7193057333902404, "grad_norm": 0.7004955096514237, "learning_rate": 2.2200847025715142e-06, "loss": 0.4398, "step": 3160 }, { "epoch": 0.7215820173566653, "grad_norm": 0.7367447404574639, "learning_rate": 2.1871521611542705e-06, "loss": 0.4475, "step": 3170 }, { "epoch": 0.72385830132309, "grad_norm": 0.7799543504096647, "learning_rate": 2.1543971900541722e-06, "loss": 0.443, "step": 3180 }, { "epoch": 0.7261345852895149, "grad_norm": 0.8090558026400204, "learning_rate": 2.1218218570370303e-06, "loss": 0.4449, "step": 3190 }, { "epoch": 0.7284108692559397, "grad_norm": 0.8879224467129067, "learning_rate": 2.0894282185284147e-06, "loss": 0.4484, "step": 3200 }, { "epoch": 0.7306871532223644, "grad_norm": 0.7328095140462628, "learning_rate": 2.057218319483828e-06, "loss": 0.4414, "step": 3210 }, { "epoch": 0.7329634371887893, "grad_norm": 0.7572339184999409, "learning_rate": 2.0251941932596115e-06, "loss": 0.4372, "step": 3220 }, { "epoch": 0.7352397211552141, "grad_norm": 0.7491266549650365, "learning_rate": 1.9933578614845784e-06, "loss": 0.4393, "step": 3230 }, { "epoch": 0.737516005121639, "grad_norm": 0.821203801856231, "learning_rate": 1.961711333932407e-06, "loss": 0.4507, "step": 3240 }, { "epoch": 0.7397922890880637, "grad_norm": 0.8213618929354572, "learning_rate": 1.930256608394747e-06, "loss": 0.4404, "step": 3250 }, { "epoch": 0.7420685730544886, "grad_norm": 0.7809128990538117, "learning_rate": 1.898995670555112e-06, "loss": 0.4338, "step": 3260 }, { "epoch": 0.7443448570209134, "grad_norm": 0.7867605315635634, "learning_rate": 1.8679304938635373e-06, "loss": 0.4481, "step": 3270 }, { "epoch": 0.7466211409873381, "grad_norm": 0.7482999924845694, "learning_rate": 1.8370630394119742e-06, "loss": 0.4343, "step": 3280 }, { "epoch": 0.748897424953763, "grad_norm": 0.8060085030283564, "learning_rate": 1.806395255810518e-06, "loss": 0.4377, "step": 3290 }, { "epoch": 0.7511737089201878, "grad_norm": 0.8755665289100689, "learning_rate": 1.7759290790643696e-06, "loss": 0.4451, "step": 3300 }, { "epoch": 0.7534499928866126, "grad_norm": 0.8120416476683848, "learning_rate": 1.745666432451638e-06, "loss": 0.4387, "step": 3310 }, { "epoch": 0.7557262768530374, "grad_norm": 0.8156630900998141, "learning_rate": 1.7156092264019198e-06, "loss": 0.4361, "step": 3320 }, { "epoch": 0.7580025608194623, "grad_norm": 0.7998270545611499, "learning_rate": 1.6857593583756915e-06, "loss": 0.448, "step": 3330 }, { "epoch": 0.760278844785887, "grad_norm": 0.8207266172010161, "learning_rate": 1.6561187127445367e-06, "loss": 0.4417, "step": 3340 }, { "epoch": 0.7625551287523119, "grad_norm": 0.8067253437276407, "learning_rate": 1.626689160672182e-06, "loss": 0.4476, "step": 3350 }, { "epoch": 0.7648314127187367, "grad_norm": 0.8853629851991137, "learning_rate": 1.5974725599963776e-06, "loss": 0.4325, "step": 3360 }, { "epoch": 0.7671076966851614, "grad_norm": 0.921124886395691, "learning_rate": 1.5684707551116074e-06, "loss": 0.4385, "step": 3370 }, { "epoch": 0.7693839806515863, "grad_norm": 0.9059423563712878, "learning_rate": 1.5396855768526664e-06, "loss": 0.4441, "step": 3380 }, { "epoch": 0.7716602646180111, "grad_norm": 0.7726314288351178, "learning_rate": 1.5111188423790773e-06, "loss": 0.4367, "step": 3390 }, { "epoch": 0.773936548584436, "grad_norm": 0.8371458674118885, "learning_rate": 1.4827723550603706e-06, "loss": 0.4494, "step": 3400 }, { "epoch": 0.7762128325508607, "grad_norm": 0.9280929650984211, "learning_rate": 1.4546479043622592e-06, "loss": 0.4363, "step": 3410 }, { "epoch": 0.7784891165172856, "grad_norm": 0.8794307651372741, "learning_rate": 1.4267472657336473e-06, "loss": 0.4398, "step": 3420 }, { "epoch": 0.7807654004837103, "grad_norm": 0.8239130127708325, "learning_rate": 1.3990722004945705e-06, "loss": 0.4418, "step": 3430 }, { "epoch": 0.7830416844501351, "grad_norm": 0.790534002165955, "learning_rate": 1.371624455724998e-06, "loss": 0.4457, "step": 3440 }, { "epoch": 0.78531796841656, "grad_norm": 0.821515070101423, "learning_rate": 1.3444057641545377e-06, "loss": 0.4446, "step": 3450 }, { "epoch": 0.7875942523829847, "grad_norm": 0.7821344445262979, "learning_rate": 1.317417844053066e-06, "loss": 0.4326, "step": 3460 }, { "epoch": 0.7898705363494096, "grad_norm": 0.7672443240766755, "learning_rate": 1.2906623991222384e-06, "loss": 0.4392, "step": 3470 }, { "epoch": 0.7921468203158344, "grad_norm": 0.8085724525323776, "learning_rate": 1.2641411183879527e-06, "loss": 0.4368, "step": 3480 }, { "epoch": 0.7944231042822593, "grad_norm": 0.8245898899691535, "learning_rate": 1.2378556760937172e-06, "loss": 0.4383, "step": 3490 }, { "epoch": 0.796699388248684, "grad_norm": 0.9491912069759774, "learning_rate": 1.2118077315949555e-06, "loss": 0.4433, "step": 3500 }, { "epoch": 0.7989756722151088, "grad_norm": 0.766723308181965, "learning_rate": 1.1859989292542617e-06, "loss": 0.4391, "step": 3510 }, { "epoch": 0.8012519561815337, "grad_norm": 0.9223119446048714, "learning_rate": 1.16043089833759e-06, "loss": 0.4353, "step": 3520 }, { "epoch": 0.8035282401479584, "grad_norm": 0.8683331011804727, "learning_rate": 1.1351052529114031e-06, "loss": 0.4481, "step": 3530 }, { "epoch": 0.8058045241143833, "grad_norm": 0.8456488216541104, "learning_rate": 1.1100235917407749e-06, "loss": 0.4423, "step": 3540 }, { "epoch": 0.8080808080808081, "grad_norm": 0.8435393816637614, "learning_rate": 1.0851874981884703e-06, "loss": 0.4392, "step": 3550 }, { "epoch": 0.8103570920472329, "grad_norm": 0.8551429696631416, "learning_rate": 1.0605985401149854e-06, "loss": 0.4373, "step": 3560 }, { "epoch": 0.8126333760136577, "grad_norm": 0.8326619193733407, "learning_rate": 1.0362582697795736e-06, "loss": 0.4403, "step": 3570 }, { "epoch": 0.8149096599800825, "grad_norm": 0.7750783611846149, "learning_rate": 1.012168223742252e-06, "loss": 0.4358, "step": 3580 }, { "epoch": 0.8171859439465073, "grad_norm": 0.74086430166713, "learning_rate": 9.883299227667997e-07, "loss": 0.4376, "step": 3590 }, { "epoch": 0.8194622279129321, "grad_norm": 0.8021836339934405, "learning_rate": 9.647448717247598e-07, "loss": 0.446, "step": 3600 }, { "epoch": 0.821738511879357, "grad_norm": 0.9512917372490248, "learning_rate": 9.414145595004365e-07, "loss": 0.4342, "step": 3610 }, { "epoch": 0.8240147958457817, "grad_norm": 0.8218941611192476, "learning_rate": 9.183404588968981e-07, "loss": 0.4389, "step": 3620 }, { "epoch": 0.8262910798122066, "grad_norm": 0.8452290036325213, "learning_rate": 8.955240265430182e-07, "loss": 0.4352, "step": 3630 }, { "epoch": 0.8285673637786314, "grad_norm": 0.8478009707657702, "learning_rate": 8.729667028014999e-07, "loss": 0.4512, "step": 3640 }, { "epoch": 0.8308436477450561, "grad_norm": 0.9736536719337533, "learning_rate": 8.506699116779643e-07, "loss": 0.4359, "step": 3650 }, { "epoch": 0.833119931711481, "grad_norm": 0.7223652613461555, "learning_rate": 8.286350607310506e-07, "loss": 0.434, "step": 3660 }, { "epoch": 0.8353962156779058, "grad_norm": 0.7773708358623372, "learning_rate": 8.068635409835541e-07, "loss": 0.4367, "step": 3670 }, { "epoch": 0.8376724996443307, "grad_norm": 0.8191260289584852, "learning_rate": 7.853567268346212e-07, "loss": 0.4427, "step": 3680 }, { "epoch": 0.8399487836107554, "grad_norm": 0.8476046937498816, "learning_rate": 7.641159759729821e-07, "loss": 0.439, "step": 3690 }, { "epoch": 0.8422250675771803, "grad_norm": 0.854034845886361, "learning_rate": 7.431426292912414e-07, "loss": 0.4458, "step": 3700 }, { "epoch": 0.8445013515436051, "grad_norm": 0.8815331913866332, "learning_rate": 7.224380108012325e-07, "loss": 0.4299, "step": 3710 }, { "epoch": 0.8467776355100298, "grad_norm": 0.7558898242505027, "learning_rate": 7.020034275504329e-07, "loss": 0.4363, "step": 3720 }, { "epoch": 0.8490539194764547, "grad_norm": 0.8092910834412402, "learning_rate": 6.81840169539451e-07, "loss": 0.4374, "step": 3730 }, { "epoch": 0.8513302034428795, "grad_norm": 0.7669287363471868, "learning_rate": 6.619495096405959e-07, "loss": 0.4405, "step": 3740 }, { "epoch": 0.8536064874093043, "grad_norm": 0.794026413586518, "learning_rate": 6.423327035175186e-07, "loss": 0.447, "step": 3750 }, { "epoch": 0.8558827713757291, "grad_norm": 0.7408546983770723, "learning_rate": 6.229909895459429e-07, "loss": 0.4315, "step": 3760 }, { "epoch": 0.858159055342154, "grad_norm": 0.7237210059574698, "learning_rate": 6.039255887354966e-07, "loss": 0.4391, "step": 3770 }, { "epoch": 0.8604353393085787, "grad_norm": 0.9595044766938607, "learning_rate": 5.851377046526208e-07, "loss": 0.4427, "step": 3780 }, { "epoch": 0.8627116232750035, "grad_norm": 0.8802588662536729, "learning_rate": 5.666285233445978e-07, "loss": 0.4447, "step": 3790 }, { "epoch": 0.8649879072414284, "grad_norm": 0.8249439161204124, "learning_rate": 5.483992132646781e-07, "loss": 0.4433, "step": 3800 }, { "epoch": 0.8672641912078531, "grad_norm": 0.7832517731345501, "learning_rate": 5.304509251983103e-07, "loss": 0.4358, "step": 3810 }, { "epoch": 0.869540475174278, "grad_norm": 0.7248468716835277, "learning_rate": 5.127847921905076e-07, "loss": 0.4449, "step": 3820 }, { "epoch": 0.8718167591407028, "grad_norm": 0.7513659638517941, "learning_rate": 4.954019294743045e-07, "loss": 0.4448, "step": 3830 }, { "epoch": 0.8740930431071277, "grad_norm": 0.9357495352200004, "learning_rate": 4.783034344003673e-07, "loss": 0.4398, "step": 3840 }, { "epoch": 0.8763693270735524, "grad_norm": 0.7702794579988504, "learning_rate": 4.6149038636771337e-07, "loss": 0.4396, "step": 3850 }, { "epoch": 0.8786456110399773, "grad_norm": 0.7237585600700893, "learning_rate": 4.449638467555706e-07, "loss": 0.4369, "step": 3860 }, { "epoch": 0.8809218950064021, "grad_norm": 0.7689094322016036, "learning_rate": 4.2872485885637803e-07, "loss": 0.4419, "step": 3870 }, { "epoch": 0.8831981789728268, "grad_norm": 0.6768771986551186, "learning_rate": 4.1277444780992215e-07, "loss": 0.4337, "step": 3880 }, { "epoch": 0.8854744629392517, "grad_norm": 0.805045121729404, "learning_rate": 3.9711362053862115e-07, "loss": 0.4284, "step": 3890 }, { "epoch": 0.8877507469056765, "grad_norm": 0.9172147345939642, "learning_rate": 3.817433656839586e-07, "loss": 0.4446, "step": 3900 }, { "epoch": 0.8900270308721013, "grad_norm": 0.817219220444952, "learning_rate": 3.6666465354407766e-07, "loss": 0.4378, "step": 3910 }, { "epoch": 0.8923033148385261, "grad_norm": 0.8368161427901388, "learning_rate": 3.5187843601252157e-07, "loss": 0.4396, "step": 3920 }, { "epoch": 0.894579598804951, "grad_norm": 0.8185450548355138, "learning_rate": 3.373856465181424e-07, "loss": 0.4364, "step": 3930 }, { "epoch": 0.8968558827713757, "grad_norm": 0.7618894593947647, "learning_rate": 3.231871999661845e-07, "loss": 0.4383, "step": 3940 }, { "epoch": 0.8991321667378005, "grad_norm": 0.956580014268259, "learning_rate": 3.0928399268051247e-07, "loss": 0.442, "step": 3950 }, { "epoch": 0.9014084507042254, "grad_norm": 0.800968878170199, "learning_rate": 2.9567690234704295e-07, "loss": 0.4395, "step": 3960 }, { "epoch": 0.9036847346706501, "grad_norm": 0.8086633377655379, "learning_rate": 2.8236678795832863e-07, "loss": 0.4347, "step": 3970 }, { "epoch": 0.905961018637075, "grad_norm": 0.7394969702194686, "learning_rate": 2.693544897593325e-07, "loss": 0.4359, "step": 3980 }, { "epoch": 0.9082373026034998, "grad_norm": 0.786442599221493, "learning_rate": 2.566408291943906e-07, "loss": 0.4483, "step": 3990 }, { "epoch": 0.9105135865699246, "grad_norm": 0.7730250654140295, "learning_rate": 2.4422660885534635e-07, "loss": 0.4506, "step": 4000 }, { "epoch": 0.9127898705363494, "grad_norm": 0.7971290026825717, "learning_rate": 2.3211261243089255e-07, "loss": 0.4351, "step": 4010 }, { "epoch": 0.9150661545027742, "grad_norm": 0.7956171103006121, "learning_rate": 2.2029960465709433e-07, "loss": 0.4358, "step": 4020 }, { "epoch": 0.917342438469199, "grad_norm": 0.7975596769757733, "learning_rate": 2.0878833126911135e-07, "loss": 0.4429, "step": 4030 }, { "epoch": 0.9196187224356238, "grad_norm": 0.7720277466201635, "learning_rate": 1.9757951895412576e-07, "loss": 0.4352, "step": 4040 }, { "epoch": 0.9218950064020487, "grad_norm": 0.7853839927041535, "learning_rate": 1.866738753054631e-07, "loss": 0.4551, "step": 4050 }, { "epoch": 0.9241712903684735, "grad_norm": 0.7825373294798991, "learning_rate": 1.7607208877792604e-07, "loss": 0.4417, "step": 4060 }, { "epoch": 0.9264475743348983, "grad_norm": 0.8184549696249286, "learning_rate": 1.6577482864432946e-07, "loss": 0.4399, "step": 4070 }, { "epoch": 0.9287238583013231, "grad_norm": 0.7713360616227353, "learning_rate": 1.5578274495325618e-07, "loss": 0.4329, "step": 4080 }, { "epoch": 0.9310001422677479, "grad_norm": 0.7499304967763705, "learning_rate": 1.4609646848801561e-07, "loss": 0.4378, "step": 4090 }, { "epoch": 0.9332764262341727, "grad_norm": 0.8556735326094547, "learning_rate": 1.3671661072682585e-07, "loss": 0.4463, "step": 4100 }, { "epoch": 0.9355527102005975, "grad_norm": 0.7950794399913543, "learning_rate": 1.276437638042116e-07, "loss": 0.4324, "step": 4110 }, { "epoch": 0.9378289941670224, "grad_norm": 0.8029386901058022, "learning_rate": 1.1887850047362315e-07, "loss": 0.4418, "step": 4120 }, { "epoch": 0.9401052781334471, "grad_norm": 0.8040559392859772, "learning_rate": 1.104213740712795e-07, "loss": 0.4432, "step": 4130 }, { "epoch": 0.942381562099872, "grad_norm": 0.7956315398376556, "learning_rate": 1.0227291848123932e-07, "loss": 0.4443, "step": 4140 }, { "epoch": 0.9446578460662968, "grad_norm": 0.8316381693144498, "learning_rate": 9.443364810169331e-08, "loss": 0.4479, "step": 4150 }, { "epoch": 0.9469341300327215, "grad_norm": 0.7160374953454977, "learning_rate": 8.690405781249745e-08, "loss": 0.4394, "step": 4160 }, { "epoch": 0.9492104139991464, "grad_norm": 1.1619693324211688, "learning_rate": 7.96846229439241e-08, "loss": 0.4391, "step": 4170 }, { "epoch": 0.9514866979655712, "grad_norm": 0.7807355700189923, "learning_rate": 7.277579924666322e-08, "loss": 0.4431, "step": 4180 }, { "epoch": 0.953762981931996, "grad_norm": 0.799461080581998, "learning_rate": 6.617802286304597e-08, "loss": 0.4465, "step": 4190 }, { "epoch": 0.9560392658984208, "grad_norm": 0.8615155608414043, "learning_rate": 5.989171029951446e-08, "loss": 0.4545, "step": 4200 }, { "epoch": 0.9583155498648457, "grad_norm": 1.7829375204373485, "learning_rate": 5.391725840032724e-08, "loss": 0.4361, "step": 4210 }, { "epoch": 0.9605918338312704, "grad_norm": 0.8158290333293733, "learning_rate": 4.8255044322507714e-08, "loss": 0.4319, "step": 4220 }, { "epoch": 0.9628681177976952, "grad_norm": 0.7479808786166098, "learning_rate": 4.290542551203536e-08, "loss": 0.4452, "step": 4230 }, { "epoch": 0.9651444017641201, "grad_norm": 0.823075917374607, "learning_rate": 3.7868739681278796e-08, "loss": 0.4395, "step": 4240 }, { "epoch": 0.9674206857305448, "grad_norm": 0.8193285422862143, "learning_rate": 3.314530478768008e-08, "loss": 0.4378, "step": 4250 }, { "epoch": 0.9696969696969697, "grad_norm": 0.7169988301586265, "learning_rate": 2.8735419013677934e-08, "loss": 0.4368, "step": 4260 }, { "epoch": 0.9719732536633945, "grad_norm": 0.867983394784437, "learning_rate": 2.4639360747888974e-08, "loss": 0.4433, "step": 4270 }, { "epoch": 0.9742495376298194, "grad_norm": 0.8926574534891935, "learning_rate": 2.0857388567529502e-08, "loss": 0.4366, "step": 4280 }, { "epoch": 0.9765258215962441, "grad_norm": 0.8314945073143262, "learning_rate": 1.738974122209358e-08, "loss": 0.4472, "step": 4290 }, { "epoch": 0.9788021055626689, "grad_norm": 0.8241979804740377, "learning_rate": 1.4236637618282312e-08, "loss": 0.4496, "step": 4300 }, { "epoch": 0.9810783895290938, "grad_norm": 0.7393166267679743, "learning_rate": 1.1398276806182107e-08, "loss": 0.4315, "step": 4310 }, { "epoch": 0.9833546734955185, "grad_norm": 0.7415102824589282, "learning_rate": 8.874837966700855e-09, "loss": 0.433, "step": 4320 }, { "epoch": 0.9856309574619434, "grad_norm": 0.7946710429947246, "learning_rate": 6.6664804002564145e-09, "loss": 0.4364, "step": 4330 }, { "epoch": 0.9879072414283682, "grad_norm": 0.8084200476976278, "learning_rate": 4.773343516718543e-09, "loss": 0.4312, "step": 4340 }, { "epoch": 0.990183525394793, "grad_norm": 0.8535152486368869, "learning_rate": 3.1955468266120505e-09, "loss": 0.4462, "step": 4350 }, { "epoch": 0.9924598093612178, "grad_norm": 0.7974643794010233, "learning_rate": 1.9331899335661708e-09, "loss": 0.4357, "step": 4360 }, { "epoch": 0.9947360933276427, "grad_norm": 0.7710537576596679, "learning_rate": 9.863525280340292e-10, "loss": 0.4337, "step": 4370 }, { "epoch": 0.9970123772940674, "grad_norm": 0.7708116142325829, "learning_rate": 3.550943822550057e-10, "loss": 0.4338, "step": 4380 }, { "epoch": 0.9992886612604922, "grad_norm": 0.8061784317503751, "learning_rate": 3.9455346487193846e-11, "loss": 0.4427, "step": 4390 }, { "epoch": 1.0, "step": 4394, "total_flos": 9.741193804139987e+18, "train_loss": 0.29473522613414266, "train_runtime": 212859.8204, "train_samples_per_second": 1.981, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 4394, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.741193804139987e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }