{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014992644233922732, "grad_norm": 1.3515625, "learning_rate": 3.6e-08, "loss": 1.6646936416625977, "step": 10 }, { "epoch": 0.0029985288467845464, "grad_norm": 1.0859375, "learning_rate": 7.599999999999999e-08, "loss": 1.642629623413086, "step": 20 }, { "epoch": 0.004497793270176819, "grad_norm": 0.953125, "learning_rate": 1.16e-07, "loss": 1.6638397216796874, "step": 30 }, { "epoch": 0.005997057693569093, "grad_norm": 0.99609375, "learning_rate": 1.56e-07, "loss": 1.648602294921875, "step": 40 }, { "epoch": 0.007496322116961366, "grad_norm": 1.1796875, "learning_rate": 1.96e-07, "loss": 1.6447210311889648, "step": 50 }, { "epoch": 0.008995586540353638, "grad_norm": 1.125, "learning_rate": 2.3599999999999997e-07, "loss": 1.6022998809814453, "step": 60 }, { "epoch": 0.010494850963745913, "grad_norm": 1.1171875, "learning_rate": 2.7600000000000004e-07, "loss": 1.5853511810302734, "step": 70 }, { "epoch": 0.011994115387138186, "grad_norm": 1.078125, "learning_rate": 3.1599999999999997e-07, "loss": 1.512193775177002, "step": 80 }, { "epoch": 0.013493379810530459, "grad_norm": 1.0703125, "learning_rate": 3.5599999999999996e-07, "loss": 1.4463014602661133, "step": 90 }, { "epoch": 0.014992644233922731, "grad_norm": 1.21875, "learning_rate": 3.96e-07, "loss": 1.3526597023010254, "step": 100 }, { "epoch": 0.016491908657315004, "grad_norm": 1.71875, "learning_rate": 4.36e-07, "loss": 1.1787323951721191, "step": 110 }, { "epoch": 0.017991173080707277, "grad_norm": 0.9765625, "learning_rate": 4.76e-07, "loss": 0.9612628936767578, "step": 120 }, { "epoch": 0.01949043750409955, "grad_norm": 1.2578125, "learning_rate": 5.16e-07, "loss": 0.6817054748535156, "step": 130 }, { "epoch": 0.020989701927491826, "grad_norm": 0.703125, "learning_rate": 5.560000000000001e-07, "loss": 0.4010798454284668, "step": 140 }, { "epoch": 0.0224889663508841, "grad_norm": 0.44921875, "learning_rate": 5.96e-07, "loss": 0.33456034660339357, "step": 150 }, { "epoch": 0.02398823077427637, "grad_norm": 0.447265625, "learning_rate": 6.36e-07, "loss": 0.351082444190979, "step": 160 }, { "epoch": 0.025487495197668644, "grad_norm": 0.41015625, "learning_rate": 6.76e-07, "loss": 0.3510997772216797, "step": 170 }, { "epoch": 0.026986759621060917, "grad_norm": 0.2734375, "learning_rate": 7.159999999999999e-07, "loss": 0.27989749908447265, "step": 180 }, { "epoch": 0.02848602404445319, "grad_norm": 0.484375, "learning_rate": 7.559999999999999e-07, "loss": 0.25770542621612547, "step": 190 }, { "epoch": 0.029985288467845463, "grad_norm": 0.361328125, "learning_rate": 7.96e-07, "loss": 0.24579532146453859, "step": 200 }, { "epoch": 0.031484552891237735, "grad_norm": 0.357421875, "learning_rate": 8.359999999999999e-07, "loss": 0.19708189964294434, "step": 210 }, { "epoch": 0.03298381731463001, "grad_norm": 0.337890625, "learning_rate": 8.76e-07, "loss": 0.19723033905029297, "step": 220 }, { "epoch": 0.03448308173802228, "grad_norm": 0.447265625, "learning_rate": 9.16e-07, "loss": 0.2601869821548462, "step": 230 }, { "epoch": 0.035982346161414554, "grad_norm": 0.2431640625, "learning_rate": 9.559999999999998e-07, "loss": 0.22059969902038573, "step": 240 }, { "epoch": 0.037481610584806827, "grad_norm": 0.291015625, "learning_rate": 9.959999999999999e-07, "loss": 0.23547093868255614, "step": 250 }, { "epoch": 0.0389808750081991, "grad_norm": 0.396484375, "learning_rate": 1.036e-06, "loss": 0.23184664249420167, "step": 260 }, { "epoch": 0.04048013943159137, "grad_norm": 0.2373046875, "learning_rate": 1.076e-06, "loss": 0.1775584936141968, "step": 270 }, { "epoch": 0.04197940385498365, "grad_norm": 0.3046875, "learning_rate": 1.116e-06, "loss": 0.1842654228210449, "step": 280 }, { "epoch": 0.043478668278375925, "grad_norm": 0.29296875, "learning_rate": 1.1559999999999998e-06, "loss": 0.1996519684791565, "step": 290 }, { "epoch": 0.0449779327017682, "grad_norm": 0.26171875, "learning_rate": 1.1959999999999999e-06, "loss": 0.2009890556335449, "step": 300 }, { "epoch": 0.04647719712516047, "grad_norm": 0.359375, "learning_rate": 1.236e-06, "loss": 0.20845539569854737, "step": 310 }, { "epoch": 0.04797646154855274, "grad_norm": 0.474609375, "learning_rate": 1.276e-06, "loss": 0.18711086511611938, "step": 320 }, { "epoch": 0.049475725971945016, "grad_norm": 0.287109375, "learning_rate": 1.316e-06, "loss": 0.17694923877716065, "step": 330 }, { "epoch": 0.05097499039533729, "grad_norm": 0.4921875, "learning_rate": 1.356e-06, "loss": 0.2112422227859497, "step": 340 }, { "epoch": 0.05247425481872956, "grad_norm": 0.49609375, "learning_rate": 1.3959999999999998e-06, "loss": 0.16929364204406738, "step": 350 }, { "epoch": 0.053973519242121834, "grad_norm": 0.216796875, "learning_rate": 1.4359999999999999e-06, "loss": 0.14373246431350709, "step": 360 }, { "epoch": 0.05547278366551411, "grad_norm": 0.251953125, "learning_rate": 1.476e-06, "loss": 0.19208487272262573, "step": 370 }, { "epoch": 0.05697204808890638, "grad_norm": 0.490234375, "learning_rate": 1.516e-06, "loss": 0.21074600219726564, "step": 380 }, { "epoch": 0.05847131251229865, "grad_norm": 0.404296875, "learning_rate": 1.556e-06, "loss": 0.23820483684539795, "step": 390 }, { "epoch": 0.059970576935690925, "grad_norm": 0.3828125, "learning_rate": 1.596e-06, "loss": 0.1559612274169922, "step": 400 }, { "epoch": 0.0614698413590832, "grad_norm": 0.263671875, "learning_rate": 1.6359999999999999e-06, "loss": 0.17463357448577882, "step": 410 }, { "epoch": 0.06296910578247547, "grad_norm": 0.2294921875, "learning_rate": 1.676e-06, "loss": 0.17741453647613525, "step": 420 }, { "epoch": 0.06446837020586775, "grad_norm": 0.37890625, "learning_rate": 1.716e-06, "loss": 0.16474447250366211, "step": 430 }, { "epoch": 0.06596763462926002, "grad_norm": 0.1826171875, "learning_rate": 1.756e-06, "loss": 0.15805156230926515, "step": 440 }, { "epoch": 0.0674668990526523, "grad_norm": 0.357421875, "learning_rate": 1.796e-06, "loss": 0.1870645046234131, "step": 450 }, { "epoch": 0.06896616347604456, "grad_norm": 0.3203125, "learning_rate": 1.836e-06, "loss": 0.17467626333236694, "step": 460 }, { "epoch": 0.07046542789943684, "grad_norm": 0.296875, "learning_rate": 1.8759999999999997e-06, "loss": 0.1839710831642151, "step": 470 }, { "epoch": 0.07196469232282911, "grad_norm": 0.271484375, "learning_rate": 1.916e-06, "loss": 0.15121291875839232, "step": 480 }, { "epoch": 0.07346395674622139, "grad_norm": 0.1962890625, "learning_rate": 1.956e-06, "loss": 0.16356420516967773, "step": 490 }, { "epoch": 0.07496322116961365, "grad_norm": 0.2294921875, "learning_rate": 1.996e-06, "loss": 0.18490909337997435, "step": 500 }, { "epoch": 0.07646248559300593, "grad_norm": 0.306640625, "learning_rate": 1.9999895001358395e-06, "loss": 0.17003339529037476, "step": 510 }, { "epoch": 0.0779617500163982, "grad_norm": 0.314453125, "learning_rate": 1.9999532045921925e-06, "loss": 0.14626400470733641, "step": 520 }, { "epoch": 0.07946101443979048, "grad_norm": 0.1728515625, "learning_rate": 1.9998909846818658e-06, "loss": 0.1461304545402527, "step": 530 }, { "epoch": 0.08096027886318274, "grad_norm": 0.1572265625, "learning_rate": 1.9998028420179468e-06, "loss": 0.1631840229034424, "step": 540 }, { "epoch": 0.08245954328657502, "grad_norm": 0.255859375, "learning_rate": 1.9996887788855846e-06, "loss": 0.14891616106033326, "step": 550 }, { "epoch": 0.0839588077099673, "grad_norm": 0.341796875, "learning_rate": 1.999548798241933e-06, "loss": 0.1451740264892578, "step": 560 }, { "epoch": 0.08545807213335957, "grad_norm": 0.26953125, "learning_rate": 1.9993829037160704e-06, "loss": 0.13687235116958618, "step": 570 }, { "epoch": 0.08695733655675185, "grad_norm": 0.19921875, "learning_rate": 1.9991910996089085e-06, "loss": 0.15143134593963622, "step": 580 }, { "epoch": 0.08845660098014411, "grad_norm": 0.322265625, "learning_rate": 1.998973390893081e-06, "loss": 0.15538901090621948, "step": 590 }, { "epoch": 0.0899558654035364, "grad_norm": 0.267578125, "learning_rate": 1.998729783212812e-06, "loss": 0.17548735141754152, "step": 600 }, { "epoch": 0.09145512982692866, "grad_norm": 0.234375, "learning_rate": 1.998460282883772e-06, "loss": 0.1454736351966858, "step": 610 }, { "epoch": 0.09295439425032094, "grad_norm": 0.392578125, "learning_rate": 1.998164896892913e-06, "loss": 0.13865782022476197, "step": 620 }, { "epoch": 0.0944536586737132, "grad_norm": 0.259765625, "learning_rate": 1.9978436328982882e-06, "loss": 0.16720572710037232, "step": 630 }, { "epoch": 0.09595292309710549, "grad_norm": 0.2197265625, "learning_rate": 1.997496499228853e-06, "loss": 0.14800021648406983, "step": 640 }, { "epoch": 0.09745218752049775, "grad_norm": 0.1708984375, "learning_rate": 1.9971235048842495e-06, "loss": 0.14826395511627197, "step": 650 }, { "epoch": 0.09895145194389003, "grad_norm": 0.1396484375, "learning_rate": 1.996724659534572e-06, "loss": 0.12433024644851684, "step": 660 }, { "epoch": 0.1004507163672823, "grad_norm": 0.28125, "learning_rate": 1.9962999735201173e-06, "loss": 0.1702478051185608, "step": 670 }, { "epoch": 0.10194998079067458, "grad_norm": 0.251953125, "learning_rate": 1.9958494578511167e-06, "loss": 0.1259335994720459, "step": 680 }, { "epoch": 0.10344924521406684, "grad_norm": 0.2353515625, "learning_rate": 1.99537312420745e-06, "loss": 0.20034666061401368, "step": 690 }, { "epoch": 0.10494850963745912, "grad_norm": 0.25390625, "learning_rate": 1.994870984938344e-06, "loss": 0.12428268194198608, "step": 700 }, { "epoch": 0.10644777406085139, "grad_norm": 0.2578125, "learning_rate": 1.9943430530620497e-06, "loss": 0.11142982244491577, "step": 710 }, { "epoch": 0.10794703848424367, "grad_norm": 0.361328125, "learning_rate": 1.993789342265507e-06, "loss": 0.1445391893386841, "step": 720 }, { "epoch": 0.10944630290763595, "grad_norm": 0.353515625, "learning_rate": 1.99320986690399e-06, "loss": 0.1293397307395935, "step": 730 }, { "epoch": 0.11094556733102821, "grad_norm": 0.26171875, "learning_rate": 1.9926046420007326e-06, "loss": 0.11696268320083618, "step": 740 }, { "epoch": 0.1124448317544205, "grad_norm": 0.2490234375, "learning_rate": 1.9919736832465417e-06, "loss": 0.12922875881195067, "step": 750 }, { "epoch": 0.11394409617781276, "grad_norm": 0.3046875, "learning_rate": 1.9913170069993896e-06, "loss": 0.13306174278259278, "step": 760 }, { "epoch": 0.11544336060120504, "grad_norm": 0.1982421875, "learning_rate": 1.9906346302839882e-06, "loss": 0.13486032485961913, "step": 770 }, { "epoch": 0.1169426250245973, "grad_norm": 0.1982421875, "learning_rate": 1.9899265707913492e-06, "loss": 0.13135333061218263, "step": 780 }, { "epoch": 0.11844188944798958, "grad_norm": 0.294921875, "learning_rate": 1.989192846878326e-06, "loss": 0.12307331562042237, "step": 790 }, { "epoch": 0.11994115387138185, "grad_norm": 0.29296875, "learning_rate": 1.988433477567137e-06, "loss": 0.11497733592987061, "step": 800 }, { "epoch": 0.12144041829477413, "grad_norm": 0.1865234375, "learning_rate": 1.9876484825448706e-06, "loss": 0.13883528709411622, "step": 810 }, { "epoch": 0.1229396827181664, "grad_norm": 0.37109375, "learning_rate": 1.9868378821629795e-06, "loss": 0.13286290168762208, "step": 820 }, { "epoch": 0.12443894714155868, "grad_norm": 0.3046875, "learning_rate": 1.9860016974367474e-06, "loss": 0.1608394503593445, "step": 830 }, { "epoch": 0.12593821156495094, "grad_norm": 0.27734375, "learning_rate": 1.985139950044749e-06, "loss": 0.1350063681602478, "step": 840 }, { "epoch": 0.1274374759883432, "grad_norm": 0.2236328125, "learning_rate": 1.9842526623282844e-06, "loss": 0.14678356647491456, "step": 850 }, { "epoch": 0.1289367404117355, "grad_norm": 0.1455078125, "learning_rate": 1.9833398572908027e-06, "loss": 0.13124724626541137, "step": 860 }, { "epoch": 0.13043600483512777, "grad_norm": 0.22265625, "learning_rate": 1.9824015585973037e-06, "loss": 0.1295769214630127, "step": 870 }, { "epoch": 0.13193526925852003, "grad_norm": 0.1455078125, "learning_rate": 1.9814377905737253e-06, "loss": 0.14678038358688356, "step": 880 }, { "epoch": 0.1334345336819123, "grad_norm": 0.27734375, "learning_rate": 1.980448578206312e-06, "loss": 0.12379497289657593, "step": 890 }, { "epoch": 0.1349337981053046, "grad_norm": 0.322265625, "learning_rate": 1.9794339471409684e-06, "loss": 0.1308390736579895, "step": 900 }, { "epoch": 0.13643306252869686, "grad_norm": 0.322265625, "learning_rate": 1.978393923682593e-06, "loss": 0.1078214168548584, "step": 910 }, { "epoch": 0.13793232695208912, "grad_norm": 0.203125, "learning_rate": 1.9773285347943975e-06, "loss": 0.12421451807022095, "step": 920 }, { "epoch": 0.13943159137548142, "grad_norm": 0.3125, "learning_rate": 1.976237808097206e-06, "loss": 0.11592028141021729, "step": 930 }, { "epoch": 0.14093085579887368, "grad_norm": 0.2158203125, "learning_rate": 1.975121771868741e-06, "loss": 0.11567631959915162, "step": 940 }, { "epoch": 0.14243012022226595, "grad_norm": 0.1904296875, "learning_rate": 1.9739804550428887e-06, "loss": 0.13639799356460572, "step": 950 }, { "epoch": 0.14392938464565821, "grad_norm": 0.349609375, "learning_rate": 1.9728138872089495e-06, "loss": 0.12592445611953734, "step": 960 }, { "epoch": 0.1454286490690505, "grad_norm": 0.1826171875, "learning_rate": 1.9716220986108715e-06, "loss": 0.12377442121505737, "step": 970 }, { "epoch": 0.14692791349244277, "grad_norm": 0.2578125, "learning_rate": 1.9704051201464644e-06, "loss": 0.14418370723724366, "step": 980 }, { "epoch": 0.14842717791583504, "grad_norm": 0.2109375, "learning_rate": 1.9691629833666016e-06, "loss": 0.1573760986328125, "step": 990 }, { "epoch": 0.1499264423392273, "grad_norm": 0.275390625, "learning_rate": 1.9678957204743986e-06, "loss": 0.1386464238166809, "step": 1000 }, { "epoch": 0.1514257067626196, "grad_norm": 0.287109375, "learning_rate": 1.966603364324381e-06, "loss": 0.13971794843673707, "step": 1010 }, { "epoch": 0.15292497118601187, "grad_norm": 0.1669921875, "learning_rate": 1.965285948421631e-06, "loss": 0.13169209957122802, "step": 1020 }, { "epoch": 0.15442423560940413, "grad_norm": 0.31640625, "learning_rate": 1.963943506920921e-06, "loss": 0.1507979989051819, "step": 1030 }, { "epoch": 0.1559235000327964, "grad_norm": 0.1748046875, "learning_rate": 1.962576074625824e-06, "loss": 0.11561447381973267, "step": 1040 }, { "epoch": 0.1574227644561887, "grad_norm": 0.25390625, "learning_rate": 1.961183686987816e-06, "loss": 0.14605475664138795, "step": 1050 }, { "epoch": 0.15892202887958096, "grad_norm": 0.421875, "learning_rate": 1.9597663801053534e-06, "loss": 0.13819440603256225, "step": 1060 }, { "epoch": 0.16042129330297322, "grad_norm": 0.16015625, "learning_rate": 1.9583241907229395e-06, "loss": 0.14112586975097657, "step": 1070 }, { "epoch": 0.1619205577263655, "grad_norm": 0.158203125, "learning_rate": 1.95685715623017e-06, "loss": 0.1168364405632019, "step": 1080 }, { "epoch": 0.16341982214975778, "grad_norm": 0.390625, "learning_rate": 1.955365314660765e-06, "loss": 0.11267675161361694, "step": 1090 }, { "epoch": 0.16491908657315005, "grad_norm": 0.30078125, "learning_rate": 1.9538487046915824e-06, "loss": 0.12178796529769897, "step": 1100 }, { "epoch": 0.1664183509965423, "grad_norm": 0.2353515625, "learning_rate": 1.952307365641615e-06, "loss": 0.10850706100463867, "step": 1110 }, { "epoch": 0.1679176154199346, "grad_norm": 0.248046875, "learning_rate": 1.950741337470971e-06, "loss": 0.12071930170059204, "step": 1120 }, { "epoch": 0.16941687984332687, "grad_norm": 0.291015625, "learning_rate": 1.949150660779839e-06, "loss": 0.12768586874008178, "step": 1130 }, { "epoch": 0.17091614426671914, "grad_norm": 0.2041015625, "learning_rate": 1.9475353768074354e-06, "loss": 0.12366677522659301, "step": 1140 }, { "epoch": 0.1724154086901114, "grad_norm": 0.216796875, "learning_rate": 1.9458955274309334e-06, "loss": 0.12472466230392457, "step": 1150 }, { "epoch": 0.1739146731135037, "grad_norm": 0.61328125, "learning_rate": 1.944231155164378e-06, "loss": 0.10178214311599731, "step": 1160 }, { "epoch": 0.17541393753689596, "grad_norm": 0.1884765625, "learning_rate": 1.942542303157587e-06, "loss": 0.10434643030166627, "step": 1170 }, { "epoch": 0.17691320196028823, "grad_norm": 0.1728515625, "learning_rate": 1.940829015195027e-06, "loss": 0.12654454708099366, "step": 1180 }, { "epoch": 0.1784124663836805, "grad_norm": 0.2099609375, "learning_rate": 1.939091335694682e-06, "loss": 0.14714936017990113, "step": 1190 }, { "epoch": 0.1799117308070728, "grad_norm": 0.220703125, "learning_rate": 1.9373293097069006e-06, "loss": 0.12481101751327514, "step": 1200 }, { "epoch": 0.18141099523046506, "grad_norm": 0.177734375, "learning_rate": 1.935542982913229e-06, "loss": 0.126925528049469, "step": 1210 }, { "epoch": 0.18291025965385732, "grad_norm": 0.216796875, "learning_rate": 1.9337324016252246e-06, "loss": 0.12335828542709351, "step": 1220 }, { "epoch": 0.1844095240772496, "grad_norm": 0.255859375, "learning_rate": 1.931897612783257e-06, "loss": 0.1198701024055481, "step": 1230 }, { "epoch": 0.18590878850064188, "grad_norm": 0.1884765625, "learning_rate": 1.9300386639552917e-06, "loss": 0.10855865478515625, "step": 1240 }, { "epoch": 0.18740805292403415, "grad_norm": 0.169921875, "learning_rate": 1.928155603335654e-06, "loss": 0.11242524385452271, "step": 1250 }, { "epoch": 0.1889073173474264, "grad_norm": 0.2021484375, "learning_rate": 1.9262484797437835e-06, "loss": 0.10338661670684815, "step": 1260 }, { "epoch": 0.1904065817708187, "grad_norm": 0.275390625, "learning_rate": 1.924317342622964e-06, "loss": 0.13085209131240844, "step": 1270 }, { "epoch": 0.19190584619421097, "grad_norm": 0.228515625, "learning_rate": 1.922362242039046e-06, "loss": 0.13100965023040773, "step": 1280 }, { "epoch": 0.19340511061760324, "grad_norm": 0.318359375, "learning_rate": 1.920383228679146e-06, "loss": 0.11286605596542358, "step": 1290 }, { "epoch": 0.1949043750409955, "grad_norm": 0.177734375, "learning_rate": 1.9183803538503325e-06, "loss": 0.10787241458892823, "step": 1300 }, { "epoch": 0.1964036394643878, "grad_norm": 0.2041015625, "learning_rate": 1.916353669478297e-06, "loss": 0.12694379091262817, "step": 1310 }, { "epoch": 0.19790290388778006, "grad_norm": 0.275390625, "learning_rate": 1.914303228106007e-06, "loss": 0.12459377050399781, "step": 1320 }, { "epoch": 0.19940216831117233, "grad_norm": 0.1953125, "learning_rate": 1.912229082892344e-06, "loss": 0.11015371084213257, "step": 1330 }, { "epoch": 0.2009014327345646, "grad_norm": 0.166015625, "learning_rate": 1.910131287610726e-06, "loss": 0.10224473476409912, "step": 1340 }, { "epoch": 0.2024006971579569, "grad_norm": 0.453125, "learning_rate": 1.9080098966477114e-06, "loss": 0.1472551107406616, "step": 1350 }, { "epoch": 0.20389996158134915, "grad_norm": 0.28515625, "learning_rate": 1.9058649650015913e-06, "loss": 0.12049105167388915, "step": 1360 }, { "epoch": 0.20539922600474142, "grad_norm": 0.2236328125, "learning_rate": 1.9036965482809624e-06, "loss": 0.10829113721847534, "step": 1370 }, { "epoch": 0.20689849042813369, "grad_norm": 0.291015625, "learning_rate": 1.9015047027032858e-06, "loss": 0.09630746841430664, "step": 1380 }, { "epoch": 0.20839775485152598, "grad_norm": 0.25, "learning_rate": 1.8992894850934288e-06, "loss": 0.10639712810516358, "step": 1390 }, { "epoch": 0.20989701927491825, "grad_norm": 0.251953125, "learning_rate": 1.8970509528821933e-06, "loss": 0.1108583927154541, "step": 1400 }, { "epoch": 0.2113962836983105, "grad_norm": 0.267578125, "learning_rate": 1.8947891641048236e-06, "loss": 0.1440010905265808, "step": 1410 }, { "epoch": 0.21289554812170278, "grad_norm": 0.2734375, "learning_rate": 1.8925041773995066e-06, "loss": 0.11479418277740479, "step": 1420 }, { "epoch": 0.21439481254509507, "grad_norm": 0.30078125, "learning_rate": 1.8901960520058466e-06, "loss": 0.1372006893157959, "step": 1430 }, { "epoch": 0.21589407696848734, "grad_norm": 0.130859375, "learning_rate": 1.8878648477633338e-06, "loss": 0.1048818826675415, "step": 1440 }, { "epoch": 0.2173933413918796, "grad_norm": 0.2353515625, "learning_rate": 1.8855106251097893e-06, "loss": 0.11379430294036866, "step": 1450 }, { "epoch": 0.2188926058152719, "grad_norm": 0.158203125, "learning_rate": 1.8831334450798008e-06, "loss": 0.11848256587982178, "step": 1460 }, { "epoch": 0.22039187023866416, "grad_norm": 0.2080078125, "learning_rate": 1.8807333693031394e-06, "loss": 0.11757129430770874, "step": 1470 }, { "epoch": 0.22189113466205643, "grad_norm": 0.255859375, "learning_rate": 1.8783104600031608e-06, "loss": 0.1077274203300476, "step": 1480 }, { "epoch": 0.2233903990854487, "grad_norm": 0.1875, "learning_rate": 1.8758647799951936e-06, "loss": 0.13631620407104492, "step": 1490 }, { "epoch": 0.224889663508841, "grad_norm": 0.1787109375, "learning_rate": 1.8733963926849108e-06, "loss": 0.11129487752914428, "step": 1500 }, { "epoch": 0.22638892793223325, "grad_norm": 0.234375, "learning_rate": 1.870905362066684e-06, "loss": 0.10358604192733764, "step": 1510 }, { "epoch": 0.22788819235562552, "grad_norm": 0.2275390625, "learning_rate": 1.8683917527219274e-06, "loss": 0.10696442127227783, "step": 1520 }, { "epoch": 0.22938745677901778, "grad_norm": 0.244140625, "learning_rate": 1.86585562981742e-06, "loss": 0.1079567551612854, "step": 1530 }, { "epoch": 0.23088672120241008, "grad_norm": 0.19140625, "learning_rate": 1.863297059103619e-06, "loss": 0.08297246098518371, "step": 1540 }, { "epoch": 0.23238598562580234, "grad_norm": 0.2314453125, "learning_rate": 1.860716106912954e-06, "loss": 0.11826142072677612, "step": 1550 }, { "epoch": 0.2338852500491946, "grad_norm": 0.1689453125, "learning_rate": 1.858112840158107e-06, "loss": 0.11677643060684204, "step": 1560 }, { "epoch": 0.23538451447258688, "grad_norm": 0.220703125, "learning_rate": 1.8554873263302783e-06, "loss": 0.10421488285064698, "step": 1570 }, { "epoch": 0.23688377889597917, "grad_norm": 0.33203125, "learning_rate": 1.8528396334974364e-06, "loss": 0.10596433877944947, "step": 1580 }, { "epoch": 0.23838304331937143, "grad_norm": 0.150390625, "learning_rate": 1.850169830302553e-06, "loss": 0.09852623343467712, "step": 1590 }, { "epoch": 0.2398823077427637, "grad_norm": 0.2392578125, "learning_rate": 1.8474779859618245e-06, "loss": 0.13672434091567992, "step": 1600 }, { "epoch": 0.24138157216615597, "grad_norm": 0.177734375, "learning_rate": 1.8447641702628762e-06, "loss": 0.11511225700378418, "step": 1610 }, { "epoch": 0.24288083658954826, "grad_norm": 0.27734375, "learning_rate": 1.8420284535629539e-06, "loss": 0.11240946054458618, "step": 1620 }, { "epoch": 0.24438010101294053, "grad_norm": 0.1484375, "learning_rate": 1.839270906787099e-06, "loss": 0.07973622083663941, "step": 1630 }, { "epoch": 0.2458793654363328, "grad_norm": 0.23046875, "learning_rate": 1.8364916014263115e-06, "loss": 0.10506463050842285, "step": 1640 }, { "epoch": 0.24737862985972509, "grad_norm": 0.287109375, "learning_rate": 1.8336906095356937e-06, "loss": 0.1416532278060913, "step": 1650 }, { "epoch": 0.24887789428311735, "grad_norm": 0.294921875, "learning_rate": 1.830868003732585e-06, "loss": 0.10021046400070191, "step": 1660 }, { "epoch": 0.2503771587065096, "grad_norm": 0.2158203125, "learning_rate": 1.8280238571946773e-06, "loss": 0.09624313712120056, "step": 1670 }, { "epoch": 0.2518764231299019, "grad_norm": 0.19921875, "learning_rate": 1.8251582436581193e-06, "loss": 0.09360762238502503, "step": 1680 }, { "epoch": 0.25337568755329415, "grad_norm": 0.248046875, "learning_rate": 1.8222712374156038e-06, "loss": 0.10825358629226685, "step": 1690 }, { "epoch": 0.2548749519766864, "grad_norm": 0.2578125, "learning_rate": 1.8193629133144412e-06, "loss": 0.09739000201225281, "step": 1700 }, { "epoch": 0.25637421640007874, "grad_norm": 0.2734375, "learning_rate": 1.8164333467546205e-06, "loss": 0.13052973747253419, "step": 1710 }, { "epoch": 0.257873480823471, "grad_norm": 0.314453125, "learning_rate": 1.8134826136868533e-06, "loss": 0.1281905174255371, "step": 1720 }, { "epoch": 0.25937274524686327, "grad_norm": 0.1611328125, "learning_rate": 1.810510790610606e-06, "loss": 0.1224624514579773, "step": 1730 }, { "epoch": 0.26087200967025553, "grad_norm": 0.2236328125, "learning_rate": 1.8075179545721148e-06, "loss": 0.11144398450851441, "step": 1740 }, { "epoch": 0.2623712740936478, "grad_norm": 0.3125, "learning_rate": 1.8045041831623892e-06, "loss": 0.07502882480621338, "step": 1750 }, { "epoch": 0.26387053851704007, "grad_norm": 0.2578125, "learning_rate": 1.8014695545152014e-06, "loss": 0.11576559543609619, "step": 1760 }, { "epoch": 0.26536980294043233, "grad_norm": 0.16015625, "learning_rate": 1.7984141473050583e-06, "loss": 0.10232355594635009, "step": 1770 }, { "epoch": 0.2668690673638246, "grad_norm": 0.267578125, "learning_rate": 1.7953380407451632e-06, "loss": 0.10430169105529785, "step": 1780 }, { "epoch": 0.2683683317872169, "grad_norm": 0.279296875, "learning_rate": 1.7922413145853632e-06, "loss": 0.10129927396774292, "step": 1790 }, { "epoch": 0.2698675962106092, "grad_norm": 0.1787109375, "learning_rate": 1.7891240491100794e-06, "loss": 0.1479990601539612, "step": 1800 }, { "epoch": 0.27136686063400145, "grad_norm": 0.265625, "learning_rate": 1.7859863251362268e-06, "loss": 0.09153670072555542, "step": 1810 }, { "epoch": 0.2728661250573937, "grad_norm": 0.185546875, "learning_rate": 1.7828282240111188e-06, "loss": 0.10302189588546753, "step": 1820 }, { "epoch": 0.274365389480786, "grad_norm": 0.279296875, "learning_rate": 1.779649827610359e-06, "loss": 0.10783896446228028, "step": 1830 }, { "epoch": 0.27586465390417825, "grad_norm": 0.2099609375, "learning_rate": 1.7764512183357161e-06, "loss": 0.10202981233596801, "step": 1840 }, { "epoch": 0.2773639183275705, "grad_norm": 0.279296875, "learning_rate": 1.7732324791129914e-06, "loss": 0.09905132055282592, "step": 1850 }, { "epoch": 0.27886318275096283, "grad_norm": 0.1611328125, "learning_rate": 1.769993693389865e-06, "loss": 0.10531445741653442, "step": 1860 }, { "epoch": 0.2803624471743551, "grad_norm": 0.2138671875, "learning_rate": 1.7667349451337353e-06, "loss": 0.08846319317817689, "step": 1870 }, { "epoch": 0.28186171159774737, "grad_norm": 0.2392578125, "learning_rate": 1.7634563188295403e-06, "loss": 0.0975230872631073, "step": 1880 }, { "epoch": 0.28336097602113963, "grad_norm": 0.31640625, "learning_rate": 1.7601578994775684e-06, "loss": 0.09964791536331177, "step": 1890 }, { "epoch": 0.2848602404445319, "grad_norm": 0.2373046875, "learning_rate": 1.756839772591254e-06, "loss": 0.1272280693054199, "step": 1900 }, { "epoch": 0.28635950486792416, "grad_norm": 0.353515625, "learning_rate": 1.7535020241949598e-06, "loss": 0.11281530857086182, "step": 1910 }, { "epoch": 0.28785876929131643, "grad_norm": 0.19921875, "learning_rate": 1.7501447408217497e-06, "loss": 0.12100661993026733, "step": 1920 }, { "epoch": 0.2893580337147087, "grad_norm": 0.16015625, "learning_rate": 1.7467680095111414e-06, "loss": 0.10090996026992798, "step": 1930 }, { "epoch": 0.290857298138101, "grad_norm": 0.28125, "learning_rate": 1.7433719178068524e-06, "loss": 0.13152073621749877, "step": 1940 }, { "epoch": 0.2923565625614933, "grad_norm": 0.193359375, "learning_rate": 1.739956553754529e-06, "loss": 0.09162830114364624, "step": 1950 }, { "epoch": 0.29385582698488555, "grad_norm": 0.396484375, "learning_rate": 1.7365220058994655e-06, "loss": 0.1236315131187439, "step": 1960 }, { "epoch": 0.2953550914082778, "grad_norm": 0.2119140625, "learning_rate": 1.7330683632843059e-06, "loss": 0.09788467288017273, "step": 1970 }, { "epoch": 0.2968543558316701, "grad_norm": 0.25390625, "learning_rate": 1.7295957154467382e-06, "loss": 0.09465370178222657, "step": 1980 }, { "epoch": 0.29835362025506235, "grad_norm": 0.1279296875, "learning_rate": 1.726104152417171e-06, "loss": 0.1005245327949524, "step": 1990 }, { "epoch": 0.2998528846784546, "grad_norm": 0.28515625, "learning_rate": 1.722593764716401e-06, "loss": 0.11565471887588501, "step": 2000 }, { "epoch": 0.30135214910184693, "grad_norm": 0.1865234375, "learning_rate": 1.7190646433532644e-06, "loss": 0.10114152431488037, "step": 2010 }, { "epoch": 0.3028514135252392, "grad_norm": 0.349609375, "learning_rate": 1.7155168798222789e-06, "loss": 0.11758486032485962, "step": 2020 }, { "epoch": 0.30435067794863147, "grad_norm": 0.26953125, "learning_rate": 1.7119505661012718e-06, "loss": 0.12670440673828126, "step": 2030 }, { "epoch": 0.30584994237202373, "grad_norm": 0.2138671875, "learning_rate": 1.7083657946489941e-06, "loss": 0.09111065268516541, "step": 2040 }, { "epoch": 0.307349206795416, "grad_norm": 0.244140625, "learning_rate": 1.7047626584027248e-06, "loss": 0.10659761428833008, "step": 2050 }, { "epoch": 0.30884847121880826, "grad_norm": 0.203125, "learning_rate": 1.7011412507758598e-06, "loss": 0.09141663908958435, "step": 2060 }, { "epoch": 0.31034773564220053, "grad_norm": 0.2158203125, "learning_rate": 1.6975016656554924e-06, "loss": 0.1156761646270752, "step": 2070 }, { "epoch": 0.3118470000655928, "grad_norm": 0.1796875, "learning_rate": 1.693843997399977e-06, "loss": 0.1171414852142334, "step": 2080 }, { "epoch": 0.3133462644889851, "grad_norm": 0.158203125, "learning_rate": 1.690168340836484e-06, "loss": 0.10372446775436402, "step": 2090 }, { "epoch": 0.3148455289123774, "grad_norm": 0.3515625, "learning_rate": 1.6864747912585416e-06, "loss": 0.11128904819488525, "step": 2100 }, { "epoch": 0.31634479333576965, "grad_norm": 0.2138671875, "learning_rate": 1.6827634444235643e-06, "loss": 0.11956160068511963, "step": 2110 }, { "epoch": 0.3178440577591619, "grad_norm": 0.1640625, "learning_rate": 1.6790343965503709e-06, "loss": 0.08641130924224853, "step": 2120 }, { "epoch": 0.3193433221825542, "grad_norm": 0.318359375, "learning_rate": 1.67528774431669e-06, "loss": 0.11216531991958618, "step": 2130 }, { "epoch": 0.32084258660594644, "grad_norm": 0.2890625, "learning_rate": 1.6715235848566533e-06, "loss": 0.09440256357192993, "step": 2140 }, { "epoch": 0.3223418510293387, "grad_norm": 0.2373046875, "learning_rate": 1.6677420157582774e-06, "loss": 0.08534490466117858, "step": 2150 }, { "epoch": 0.323841115452731, "grad_norm": 0.30859375, "learning_rate": 1.663943135060934e-06, "loss": 0.0956838846206665, "step": 2160 }, { "epoch": 0.3253403798761233, "grad_norm": 0.1767578125, "learning_rate": 1.6601270412528084e-06, "loss": 0.1049761414527893, "step": 2170 }, { "epoch": 0.32683964429951556, "grad_norm": 0.189453125, "learning_rate": 1.6562938332683454e-06, "loss": 0.10431164503097534, "step": 2180 }, { "epoch": 0.32833890872290783, "grad_norm": 0.169921875, "learning_rate": 1.6524436104856845e-06, "loss": 0.09506284594535827, "step": 2190 }, { "epoch": 0.3298381731463001, "grad_norm": 0.23828125, "learning_rate": 1.648576472724084e-06, "loss": 0.1192029595375061, "step": 2200 }, { "epoch": 0.33133743756969236, "grad_norm": 0.201171875, "learning_rate": 1.6446925202413331e-06, "loss": 0.09638182520866394, "step": 2210 }, { "epoch": 0.3328367019930846, "grad_norm": 0.19921875, "learning_rate": 1.640791853731152e-06, "loss": 0.090701824426651, "step": 2220 }, { "epoch": 0.3343359664164769, "grad_norm": 0.220703125, "learning_rate": 1.6368745743205821e-06, "loss": 0.09149349331855774, "step": 2230 }, { "epoch": 0.3358352308398692, "grad_norm": 0.310546875, "learning_rate": 1.6329407835673635e-06, "loss": 0.13018569946289063, "step": 2240 }, { "epoch": 0.3373344952632615, "grad_norm": 0.296875, "learning_rate": 1.628990583457302e-06, "loss": 0.1057326078414917, "step": 2250 }, { "epoch": 0.33883375968665375, "grad_norm": 0.212890625, "learning_rate": 1.6250240764016272e-06, "loss": 0.1026038646697998, "step": 2260 }, { "epoch": 0.340333024110046, "grad_norm": 0.32421875, "learning_rate": 1.6210413652343338e-06, "loss": 0.08930633664131164, "step": 2270 }, { "epoch": 0.3418322885334383, "grad_norm": 0.38671875, "learning_rate": 1.6170425532095187e-06, "loss": 0.10358338356018067, "step": 2280 }, { "epoch": 0.34333155295683054, "grad_norm": 0.271484375, "learning_rate": 1.6130277439987022e-06, "loss": 0.09695777893066407, "step": 2290 }, { "epoch": 0.3448308173802228, "grad_norm": 0.296875, "learning_rate": 1.6089970416881414e-06, "loss": 0.10922973155975342, "step": 2300 }, { "epoch": 0.3463300818036151, "grad_norm": 0.3046875, "learning_rate": 1.6049505507761309e-06, "loss": 0.10175033807754516, "step": 2310 }, { "epoch": 0.3478293462270074, "grad_norm": 0.2890625, "learning_rate": 1.600888376170294e-06, "loss": 0.10103652477264405, "step": 2320 }, { "epoch": 0.34932861065039966, "grad_norm": 0.1904296875, "learning_rate": 1.5968106231848632e-06, "loss": 0.07333493828773499, "step": 2330 }, { "epoch": 0.35082787507379193, "grad_norm": 0.1875, "learning_rate": 1.5927173975379488e-06, "loss": 0.08524224758148194, "step": 2340 }, { "epoch": 0.3523271394971842, "grad_norm": 0.220703125, "learning_rate": 1.5886088053488e-06, "loss": 0.09646062850952149, "step": 2350 }, { "epoch": 0.35382640392057646, "grad_norm": 0.265625, "learning_rate": 1.584484953135051e-06, "loss": 0.0860047996044159, "step": 2360 }, { "epoch": 0.3553256683439687, "grad_norm": 0.177734375, "learning_rate": 1.580345947809962e-06, "loss": 0.09231213331222535, "step": 2370 }, { "epoch": 0.356824932767361, "grad_norm": 0.1845703125, "learning_rate": 1.5761918966796462e-06, "loss": 0.08510161638259887, "step": 2380 }, { "epoch": 0.3583241971907533, "grad_norm": 0.171875, "learning_rate": 1.5720229074402883e-06, "loss": 0.10984573364257813, "step": 2390 }, { "epoch": 0.3598234616141456, "grad_norm": 0.26171875, "learning_rate": 1.5678390881753512e-06, "loss": 0.11594033241271973, "step": 2400 }, { "epoch": 0.36132272603753784, "grad_norm": 0.376953125, "learning_rate": 1.5636405473527763e-06, "loss": 0.09002584218978882, "step": 2410 }, { "epoch": 0.3628219904609301, "grad_norm": 0.216796875, "learning_rate": 1.5594273938221683e-06, "loss": 0.09397087097167969, "step": 2420 }, { "epoch": 0.3643212548843224, "grad_norm": 0.3515625, "learning_rate": 1.5551997368119758e-06, "loss": 0.10535862445831298, "step": 2430 }, { "epoch": 0.36582051930771464, "grad_norm": 0.1787109375, "learning_rate": 1.5509576859266589e-06, "loss": 0.09418719410896301, "step": 2440 }, { "epoch": 0.3673197837311069, "grad_norm": 0.19921875, "learning_rate": 1.5467013511438455e-06, "loss": 0.10402942895889282, "step": 2450 }, { "epoch": 0.3688190481544992, "grad_norm": 0.171875, "learning_rate": 1.5424308428114842e-06, "loss": 0.09072368144989014, "step": 2460 }, { "epoch": 0.3703183125778915, "grad_norm": 0.2177734375, "learning_rate": 1.5381462716449793e-06, "loss": 0.12782552242279052, "step": 2470 }, { "epoch": 0.37181757700128376, "grad_norm": 0.271484375, "learning_rate": 1.5338477487243229e-06, "loss": 0.12468627691268921, "step": 2480 }, { "epoch": 0.373316841424676, "grad_norm": 0.19921875, "learning_rate": 1.5295353854912142e-06, "loss": 0.08745025396347046, "step": 2490 }, { "epoch": 0.3748161058480683, "grad_norm": 0.177734375, "learning_rate": 1.5252092937461708e-06, "loss": 0.11175857782363892, "step": 2500 }, { "epoch": 0.37631537027146056, "grad_norm": 0.2412109375, "learning_rate": 1.52086958564563e-06, "loss": 0.09319526553153992, "step": 2510 }, { "epoch": 0.3778146346948528, "grad_norm": 0.2890625, "learning_rate": 1.5165163736990402e-06, "loss": 0.09921846985816955, "step": 2520 }, { "epoch": 0.3793138991182451, "grad_norm": 0.228515625, "learning_rate": 1.5121497707659459e-06, "loss": 0.13923016786575318, "step": 2530 }, { "epoch": 0.3808131635416374, "grad_norm": 0.177734375, "learning_rate": 1.5077698900530605e-06, "loss": 0.09786847829818726, "step": 2540 }, { "epoch": 0.3823124279650297, "grad_norm": 0.185546875, "learning_rate": 1.5033768451113309e-06, "loss": 0.09633988738059998, "step": 2550 }, { "epoch": 0.38381169238842194, "grad_norm": 0.2578125, "learning_rate": 1.4989707498329943e-06, "loss": 0.14051291942596436, "step": 2560 }, { "epoch": 0.3853109568118142, "grad_norm": 0.1962890625, "learning_rate": 1.4945517184486266e-06, "loss": 0.09283372163772582, "step": 2570 }, { "epoch": 0.3868102212352065, "grad_norm": 0.18359375, "learning_rate": 1.4901198655241784e-06, "loss": 0.09845755696296692, "step": 2580 }, { "epoch": 0.38830948565859874, "grad_norm": 0.2216796875, "learning_rate": 1.4856753059580065e-06, "loss": 0.09300137758255005, "step": 2590 }, { "epoch": 0.389808750081991, "grad_norm": 0.2138671875, "learning_rate": 1.4812181549778956e-06, "loss": 0.0833775520324707, "step": 2600 }, { "epoch": 0.3913080145053833, "grad_norm": 0.24609375, "learning_rate": 1.4767485281380694e-06, "loss": 0.09278824925422668, "step": 2610 }, { "epoch": 0.3928072789287756, "grad_norm": 0.2353515625, "learning_rate": 1.4722665413161948e-06, "loss": 0.09754594564437866, "step": 2620 }, { "epoch": 0.39430654335216786, "grad_norm": 0.26953125, "learning_rate": 1.46777231071038e-06, "loss": 0.1008460521697998, "step": 2630 }, { "epoch": 0.3958058077755601, "grad_norm": 0.28515625, "learning_rate": 1.4632659528361591e-06, "loss": 0.0745955765247345, "step": 2640 }, { "epoch": 0.3973050721989524, "grad_norm": 0.2470703125, "learning_rate": 1.4587475845234729e-06, "loss": 0.11444522142410278, "step": 2650 }, { "epoch": 0.39880433662234466, "grad_norm": 0.23046875, "learning_rate": 1.454217322913641e-06, "loss": 0.09638299942016601, "step": 2660 }, { "epoch": 0.4003036010457369, "grad_norm": 0.16015625, "learning_rate": 1.4496752854563217e-06, "loss": 0.0774892508983612, "step": 2670 }, { "epoch": 0.4018028654691292, "grad_norm": 0.263671875, "learning_rate": 1.4451215899064699e-06, "loss": 0.10078433752059937, "step": 2680 }, { "epoch": 0.40330212989252145, "grad_norm": 0.2001953125, "learning_rate": 1.4405563543212841e-06, "loss": 0.0878619134426117, "step": 2690 }, { "epoch": 0.4048013943159138, "grad_norm": 0.1982421875, "learning_rate": 1.4359796970571434e-06, "loss": 0.08299956321716309, "step": 2700 }, { "epoch": 0.40630065873930604, "grad_norm": 0.244140625, "learning_rate": 1.4313917367665414e-06, "loss": 0.11845102310180664, "step": 2710 }, { "epoch": 0.4077999231626983, "grad_norm": 0.26953125, "learning_rate": 1.4267925923950094e-06, "loss": 0.1439320921897888, "step": 2720 }, { "epoch": 0.4092991875860906, "grad_norm": 0.248046875, "learning_rate": 1.422182383178032e-06, "loss": 0.09109203219413757, "step": 2730 }, { "epoch": 0.41079845200948284, "grad_norm": 0.2099609375, "learning_rate": 1.4175612286379562e-06, "loss": 0.07972334623336792, "step": 2740 }, { "epoch": 0.4122977164328751, "grad_norm": 0.1748046875, "learning_rate": 1.412929248580894e-06, "loss": 0.08981594443321228, "step": 2750 }, { "epoch": 0.41379698085626737, "grad_norm": 0.201171875, "learning_rate": 1.4082865630936134e-06, "loss": 0.10788861513137818, "step": 2760 }, { "epoch": 0.4152962452796597, "grad_norm": 0.251953125, "learning_rate": 1.4036332925404283e-06, "loss": 0.08774803280830383, "step": 2770 }, { "epoch": 0.41679550970305196, "grad_norm": 0.1806640625, "learning_rate": 1.3989695575600763e-06, "loss": 0.0800628900527954, "step": 2780 }, { "epoch": 0.4182947741264442, "grad_norm": 0.216796875, "learning_rate": 1.3942954790625904e-06, "loss": 0.11887997388839722, "step": 2790 }, { "epoch": 0.4197940385498365, "grad_norm": 0.1650390625, "learning_rate": 1.3896111782261668e-06, "loss": 0.09116448163986206, "step": 2800 }, { "epoch": 0.42129330297322876, "grad_norm": 0.400390625, "learning_rate": 1.3849167764940211e-06, "loss": 0.11099686622619628, "step": 2810 }, { "epoch": 0.422792567396621, "grad_norm": 0.228515625, "learning_rate": 1.38021239557124e-06, "loss": 0.09188846349716187, "step": 2820 }, { "epoch": 0.4242918318200133, "grad_norm": 0.1748046875, "learning_rate": 1.3754981574216267e-06, "loss": 0.09292811751365662, "step": 2830 }, { "epoch": 0.42579109624340555, "grad_norm": 0.30078125, "learning_rate": 1.3707741842645392e-06, "loss": 0.0990601897239685, "step": 2840 }, { "epoch": 0.4272903606667979, "grad_norm": 0.21875, "learning_rate": 1.3660405985717212e-06, "loss": 0.0773146092891693, "step": 2850 }, { "epoch": 0.42878962509019014, "grad_norm": 0.224609375, "learning_rate": 1.361297523064126e-06, "loss": 0.09871623516082764, "step": 2860 }, { "epoch": 0.4302888895135824, "grad_norm": 0.224609375, "learning_rate": 1.3565450807087373e-06, "loss": 0.09449006915092469, "step": 2870 }, { "epoch": 0.4317881539369747, "grad_norm": 0.2265625, "learning_rate": 1.3517833947153782e-06, "loss": 0.09626795053482055, "step": 2880 }, { "epoch": 0.43328741836036694, "grad_norm": 0.26953125, "learning_rate": 1.34701258853352e-06, "loss": 0.07917786836624145, "step": 2890 }, { "epoch": 0.4347866827837592, "grad_norm": 0.2578125, "learning_rate": 1.3422327858490792e-06, "loss": 0.10537385940551758, "step": 2900 }, { "epoch": 0.43628594720715147, "grad_norm": 0.1923828125, "learning_rate": 1.337444110581212e-06, "loss": 0.07042791247367859, "step": 2910 }, { "epoch": 0.4377852116305438, "grad_norm": 0.2119140625, "learning_rate": 1.3326466868791013e-06, "loss": 0.0855652630329132, "step": 2920 }, { "epoch": 0.43928447605393606, "grad_norm": 0.205078125, "learning_rate": 1.3278406391187391e-06, "loss": 0.09092465043067932, "step": 2930 }, { "epoch": 0.4407837404773283, "grad_norm": 0.216796875, "learning_rate": 1.3230260918997004e-06, "loss": 0.10829230546951293, "step": 2940 }, { "epoch": 0.4422830049007206, "grad_norm": 0.31640625, "learning_rate": 1.3182031700419129e-06, "loss": 0.09212432503700256, "step": 2950 }, { "epoch": 0.44378226932411285, "grad_norm": 0.1708984375, "learning_rate": 1.3133719985824237e-06, "loss": 0.06796190738677979, "step": 2960 }, { "epoch": 0.4452815337475051, "grad_norm": 0.2080078125, "learning_rate": 1.3085327027721536e-06, "loss": 0.08660737872123718, "step": 2970 }, { "epoch": 0.4467807981708974, "grad_norm": 0.1943359375, "learning_rate": 1.3036854080726525e-06, "loss": 0.07199004888534546, "step": 2980 }, { "epoch": 0.44828006259428965, "grad_norm": 0.197265625, "learning_rate": 1.298830240152847e-06, "loss": 0.11634057760238647, "step": 2990 }, { "epoch": 0.449779327017682, "grad_norm": 0.240234375, "learning_rate": 1.2939673248857805e-06, "loss": 0.11802215576171875, "step": 3000 }, { "epoch": 0.45127859144107424, "grad_norm": 0.21875, "learning_rate": 1.2890967883453509e-06, "loss": 0.10256350040435791, "step": 3010 }, { "epoch": 0.4527778558644665, "grad_norm": 0.2470703125, "learning_rate": 1.2842187568030431e-06, "loss": 0.08822081089019776, "step": 3020 }, { "epoch": 0.45427712028785877, "grad_norm": 0.205078125, "learning_rate": 1.2793333567246526e-06, "loss": 0.08067854046821595, "step": 3030 }, { "epoch": 0.45577638471125104, "grad_norm": 0.2099609375, "learning_rate": 1.2744407147670098e-06, "loss": 0.09741014242172241, "step": 3040 }, { "epoch": 0.4572756491346433, "grad_norm": 0.234375, "learning_rate": 1.269540957774695e-06, "loss": 0.07846143245697021, "step": 3050 }, { "epoch": 0.45877491355803557, "grad_norm": 0.353515625, "learning_rate": 1.2646342127767486e-06, "loss": 0.10557938814163208, "step": 3060 }, { "epoch": 0.46027417798142783, "grad_norm": 0.251953125, "learning_rate": 1.2597206069833805e-06, "loss": 0.0840741217136383, "step": 3070 }, { "epoch": 0.46177344240482016, "grad_norm": 0.205078125, "learning_rate": 1.2548002677826704e-06, "loss": 0.09562651515007019, "step": 3080 }, { "epoch": 0.4632727068282124, "grad_norm": 0.2236328125, "learning_rate": 1.2498733227372648e-06, "loss": 0.09925270080566406, "step": 3090 }, { "epoch": 0.4647719712516047, "grad_norm": 0.2255859375, "learning_rate": 1.2449398995810709e-06, "loss": 0.10337086915969848, "step": 3100 }, { "epoch": 0.46627123567499695, "grad_norm": 0.3671875, "learning_rate": 1.2400001262159458e-06, "loss": 0.07978419065475464, "step": 3110 }, { "epoch": 0.4677705000983892, "grad_norm": 0.30859375, "learning_rate": 1.2350541307083776e-06, "loss": 0.07110666632652282, "step": 3120 }, { "epoch": 0.4692697645217815, "grad_norm": 0.2197265625, "learning_rate": 1.2301020412861675e-06, "loss": 0.07428762912750245, "step": 3130 }, { "epoch": 0.47076902894517375, "grad_norm": 0.36328125, "learning_rate": 1.2251439863351068e-06, "loss": 0.09102022051811218, "step": 3140 }, { "epoch": 0.47226829336856607, "grad_norm": 0.302734375, "learning_rate": 1.220180094395644e-06, "loss": 0.08342552185058594, "step": 3150 }, { "epoch": 0.47376755779195834, "grad_norm": 0.24609375, "learning_rate": 1.2152104941595562e-06, "loss": 0.12274667024612426, "step": 3160 }, { "epoch": 0.4752668222153506, "grad_norm": 0.19921875, "learning_rate": 1.2102353144666117e-06, "loss": 0.09014168381690979, "step": 3170 }, { "epoch": 0.47676608663874287, "grad_norm": 0.17578125, "learning_rate": 1.205254684301229e-06, "loss": 0.07782111167907715, "step": 3180 }, { "epoch": 0.47826535106213514, "grad_norm": 0.2001953125, "learning_rate": 1.2002687327891328e-06, "loss": 0.07985667586326599, "step": 3190 }, { "epoch": 0.4797646154855274, "grad_norm": 0.2578125, "learning_rate": 1.1952775891940082e-06, "loss": 0.09129717350006103, "step": 3200 }, { "epoch": 0.48126387990891967, "grad_norm": 0.234375, "learning_rate": 1.190281382914146e-06, "loss": 0.1002733588218689, "step": 3210 }, { "epoch": 0.48276314433231193, "grad_norm": 0.23046875, "learning_rate": 1.185280243479092e-06, "loss": 0.08630979657173157, "step": 3220 }, { "epoch": 0.48426240875570425, "grad_norm": 0.1982421875, "learning_rate": 1.1802743005462862e-06, "loss": 0.08386391997337342, "step": 3230 }, { "epoch": 0.4857616731790965, "grad_norm": 0.23828125, "learning_rate": 1.1752636838977013e-06, "loss": 0.08188863396644593, "step": 3240 }, { "epoch": 0.4872609376024888, "grad_norm": 0.298828125, "learning_rate": 1.1702485234364797e-06, "loss": 0.10928175449371338, "step": 3250 }, { "epoch": 0.48876020202588105, "grad_norm": 0.1923828125, "learning_rate": 1.165228949183565e-06, "loss": 0.09540101885795593, "step": 3260 }, { "epoch": 0.4902594664492733, "grad_norm": 0.2265625, "learning_rate": 1.16020509127433e-06, "loss": 0.092869633436203, "step": 3270 }, { "epoch": 0.4917587308726656, "grad_norm": 0.259765625, "learning_rate": 1.1551770799552039e-06, "loss": 0.09745745658874512, "step": 3280 }, { "epoch": 0.49325799529605785, "grad_norm": 0.19921875, "learning_rate": 1.1501450455802968e-06, "loss": 0.09029659032821655, "step": 3290 }, { "epoch": 0.49475725971945017, "grad_norm": 0.228515625, "learning_rate": 1.145109118608017e-06, "loss": 0.09824432134628296, "step": 3300 }, { "epoch": 0.49625652414284244, "grad_norm": 0.26171875, "learning_rate": 1.1400694295976915e-06, "loss": 0.08436204195022583, "step": 3310 }, { "epoch": 0.4977557885662347, "grad_norm": 0.2158203125, "learning_rate": 1.135026109206181e-06, "loss": 0.10501574277877808, "step": 3320 }, { "epoch": 0.49925505298962697, "grad_norm": 0.337890625, "learning_rate": 1.1299792881844906e-06, "loss": 0.09339694380760193, "step": 3330 }, { "epoch": 0.5007543174130192, "grad_norm": 0.224609375, "learning_rate": 1.1249290973743814e-06, "loss": 0.07747515439987182, "step": 3340 }, { "epoch": 0.5022535818364116, "grad_norm": 0.232421875, "learning_rate": 1.1198756677049796e-06, "loss": 0.09033283591270447, "step": 3350 }, { "epoch": 0.5037528462598038, "grad_norm": 0.234375, "learning_rate": 1.1148191301893795e-06, "loss": 0.06604780554771424, "step": 3360 }, { "epoch": 0.5052521106831961, "grad_norm": 0.220703125, "learning_rate": 1.1097596159212475e-06, "loss": 0.08669602274894714, "step": 3370 }, { "epoch": 0.5067513751065883, "grad_norm": 0.255859375, "learning_rate": 1.104697256071426e-06, "loss": 0.11573494672775268, "step": 3380 }, { "epoch": 0.5082506395299806, "grad_norm": 0.255859375, "learning_rate": 1.0996321818845294e-06, "loss": 0.09091781973838806, "step": 3390 }, { "epoch": 0.5097499039533728, "grad_norm": 0.244140625, "learning_rate": 1.0945645246755424e-06, "loss": 0.0938392698764801, "step": 3400 }, { "epoch": 0.5112491683767652, "grad_norm": 0.2158203125, "learning_rate": 1.089494415826418e-06, "loss": 0.08227325677871704, "step": 3410 }, { "epoch": 0.5127484328001575, "grad_norm": 0.2138671875, "learning_rate": 1.084421986782667e-06, "loss": 0.07320802211761475, "step": 3420 }, { "epoch": 0.5142476972235497, "grad_norm": 0.1953125, "learning_rate": 1.079347369049954e-06, "loss": 0.08411517143249511, "step": 3430 }, { "epoch": 0.515746961646942, "grad_norm": 0.2451171875, "learning_rate": 1.0742706941906873e-06, "loss": 0.1013220191001892, "step": 3440 }, { "epoch": 0.5172462260703342, "grad_norm": 0.2255859375, "learning_rate": 1.0691920938206052e-06, "loss": 0.08412815928459168, "step": 3450 }, { "epoch": 0.5187454904937265, "grad_norm": 0.21484375, "learning_rate": 1.0641116996053678e-06, "loss": 0.08085081577301026, "step": 3460 }, { "epoch": 0.5202447549171187, "grad_norm": 0.291015625, "learning_rate": 1.0590296432571414e-06, "loss": 0.08313990831375122, "step": 3470 }, { "epoch": 0.5217440193405111, "grad_norm": 0.275390625, "learning_rate": 1.0539460565311836e-06, "loss": 0.0919266939163208, "step": 3480 }, { "epoch": 0.5232432837639034, "grad_norm": 0.2470703125, "learning_rate": 1.048861071222428e-06, "loss": 0.09890375733375549, "step": 3490 }, { "epoch": 0.5247425481872956, "grad_norm": 0.2451171875, "learning_rate": 1.0437748191620678e-06, "loss": 0.08521285653114319, "step": 3500 }, { "epoch": 0.5262418126106879, "grad_norm": 0.2275390625, "learning_rate": 1.0386874322141365e-06, "loss": 0.08201659321784974, "step": 3510 }, { "epoch": 0.5277410770340801, "grad_norm": 0.419921875, "learning_rate": 1.0335990422720908e-06, "loss": 0.08876433968544006, "step": 3520 }, { "epoch": 0.5292403414574725, "grad_norm": 0.2099609375, "learning_rate": 1.0285097812553916e-06, "loss": 0.08933233618736267, "step": 3530 }, { "epoch": 0.5307396058808647, "grad_norm": 0.240234375, "learning_rate": 1.0234197811060808e-06, "loss": 0.07142494320869446, "step": 3540 }, { "epoch": 0.532238870304257, "grad_norm": 0.220703125, "learning_rate": 1.0183291737853636e-06, "loss": 0.07216275334358216, "step": 3550 }, { "epoch": 0.5337381347276492, "grad_norm": 0.2353515625, "learning_rate": 1.0132380912701884e-06, "loss": 0.09240591526031494, "step": 3560 }, { "epoch": 0.5352373991510415, "grad_norm": 0.1962890625, "learning_rate": 1.0081466655498198e-06, "loss": 0.08051929473876954, "step": 3570 }, { "epoch": 0.5367366635744338, "grad_norm": 0.2451171875, "learning_rate": 1.0030550286224228e-06, "loss": 0.06649044156074524, "step": 3580 }, { "epoch": 0.538235927997826, "grad_norm": 0.2158203125, "learning_rate": 9.979633124916373e-07, "loss": 0.09150764346122742, "step": 3590 }, { "epoch": 0.5397351924212184, "grad_norm": 0.212890625, "learning_rate": 9.928716491631568e-07, "loss": 0.09035595655441284, "step": 3600 }, { "epoch": 0.5412344568446106, "grad_norm": 0.1806640625, "learning_rate": 9.877801706413051e-07, "loss": 0.09294023513793945, "step": 3610 }, { "epoch": 0.5427337212680029, "grad_norm": 0.2265625, "learning_rate": 9.826890089256157e-07, "loss": 0.1178174376487732, "step": 3620 }, { "epoch": 0.5442329856913951, "grad_norm": 0.2490234375, "learning_rate": 9.775982960074077e-07, "loss": 0.10003062486648559, "step": 3630 }, { "epoch": 0.5457322501147874, "grad_norm": 0.333984375, "learning_rate": 9.725081638663661e-07, "loss": 0.10663024187088013, "step": 3640 }, { "epoch": 0.5472315145381798, "grad_norm": 0.2421875, "learning_rate": 9.674187444671184e-07, "loss": 0.09378329515457154, "step": 3650 }, { "epoch": 0.548730778961572, "grad_norm": 0.244140625, "learning_rate": 9.623301697558134e-07, "loss": 0.0637846291065216, "step": 3660 }, { "epoch": 0.5502300433849643, "grad_norm": 0.185546875, "learning_rate": 9.572425716567015e-07, "loss": 0.0605103075504303, "step": 3670 }, { "epoch": 0.5517293078083565, "grad_norm": 0.2236328125, "learning_rate": 9.521560820687135e-07, "loss": 0.09556649327278137, "step": 3680 }, { "epoch": 0.5532285722317488, "grad_norm": 0.24609375, "learning_rate": 9.470708328620413e-07, "loss": 0.09757782220840454, "step": 3690 }, { "epoch": 0.554727836655141, "grad_norm": 0.197265625, "learning_rate": 9.419869558747198e-07, "loss": 0.09097603559494019, "step": 3700 }, { "epoch": 0.5562271010785333, "grad_norm": 0.234375, "learning_rate": 9.369045829092076e-07, "loss": 0.089606112241745, "step": 3710 }, { "epoch": 0.5577263655019257, "grad_norm": 0.2158203125, "learning_rate": 9.318238457289711e-07, "loss": 0.09462766051292419, "step": 3720 }, { "epoch": 0.5592256299253179, "grad_norm": 0.1513671875, "learning_rate": 9.267448760550683e-07, "loss": 0.06713712811470032, "step": 3730 }, { "epoch": 0.5607248943487102, "grad_norm": 0.2109375, "learning_rate": 9.216678055627325e-07, "loss": 0.08841444849967957, "step": 3740 }, { "epoch": 0.5622241587721024, "grad_norm": 0.2373046875, "learning_rate": 9.165927658779603e-07, "loss": 0.07210164666175842, "step": 3750 }, { "epoch": 0.5637234231954947, "grad_norm": 0.2373046875, "learning_rate": 9.11519888574099e-07, "loss": 0.09946097731590271, "step": 3760 }, { "epoch": 0.5652226876188869, "grad_norm": 0.2373046875, "learning_rate": 9.064493051684341e-07, "loss": 0.07101974487304688, "step": 3770 }, { "epoch": 0.5667219520422793, "grad_norm": 0.2236328125, "learning_rate": 9.013811471187807e-07, "loss": 0.10910413265228272, "step": 3780 }, { "epoch": 0.5682212164656715, "grad_norm": 0.25, "learning_rate": 8.963155458200753e-07, "loss": 0.07558327913284302, "step": 3790 }, { "epoch": 0.5697204808890638, "grad_norm": 0.2392578125, "learning_rate": 8.912526326009686e-07, "loss": 0.08378031253814697, "step": 3800 }, { "epoch": 0.5712197453124561, "grad_norm": 0.291015625, "learning_rate": 8.861925387204217e-07, "loss": 0.0926354169845581, "step": 3810 }, { "epoch": 0.5727190097358483, "grad_norm": 0.2421875, "learning_rate": 8.811353953643031e-07, "loss": 0.0765921413898468, "step": 3820 }, { "epoch": 0.5742182741592406, "grad_norm": 0.197265625, "learning_rate": 8.760813336419868e-07, "loss": 0.09550715684890747, "step": 3830 }, { "epoch": 0.5757175385826329, "grad_norm": 0.2109375, "learning_rate": 8.710304845829533e-07, "loss": 0.07235878109931945, "step": 3840 }, { "epoch": 0.5772168030060252, "grad_norm": 0.1943359375, "learning_rate": 8.65982979133394e-07, "loss": 0.08240407705307007, "step": 3850 }, { "epoch": 0.5787160674294174, "grad_norm": 0.212890625, "learning_rate": 8.609389481528138e-07, "loss": 0.0828467309474945, "step": 3860 }, { "epoch": 0.5802153318528097, "grad_norm": 0.2080078125, "learning_rate": 8.558985224106409e-07, "loss": 0.06905397176742553, "step": 3870 }, { "epoch": 0.581714596276202, "grad_norm": 0.1953125, "learning_rate": 8.508618325828361e-07, "loss": 0.08870742321014405, "step": 3880 }, { "epoch": 0.5832138606995942, "grad_norm": 0.32421875, "learning_rate": 8.458290092485034e-07, "loss": 0.08924266099929809, "step": 3890 }, { "epoch": 0.5847131251229866, "grad_norm": 0.265625, "learning_rate": 8.408001828865064e-07, "loss": 0.08538001179695129, "step": 3900 }, { "epoch": 0.5862123895463788, "grad_norm": 0.21875, "learning_rate": 8.357754838720846e-07, "loss": 0.05365139842033386, "step": 3910 }, { "epoch": 0.5877116539697711, "grad_norm": 0.197265625, "learning_rate": 8.307550424734735e-07, "loss": 0.07388515472412109, "step": 3920 }, { "epoch": 0.5892109183931633, "grad_norm": 0.1875, "learning_rate": 8.257389888485274e-07, "loss": 0.09646939039230347, "step": 3930 }, { "epoch": 0.5907101828165556, "grad_norm": 0.3046875, "learning_rate": 8.207274530413457e-07, "loss": 0.09254279732704163, "step": 3940 }, { "epoch": 0.592209447239948, "grad_norm": 0.2109375, "learning_rate": 8.157205649789001e-07, "loss": 0.06844722628593444, "step": 3950 }, { "epoch": 0.5937087116633402, "grad_norm": 0.2080078125, "learning_rate": 8.107184544676671e-07, "loss": 0.07432733774185181, "step": 3960 }, { "epoch": 0.5952079760867325, "grad_norm": 0.271484375, "learning_rate": 8.057212511902623e-07, "loss": 0.08080208897590638, "step": 3970 }, { "epoch": 0.5967072405101247, "grad_norm": 0.189453125, "learning_rate": 8.007290847020783e-07, "loss": 0.10689427852630615, "step": 3980 }, { "epoch": 0.598206504933517, "grad_norm": 0.203125, "learning_rate": 7.957420844279256e-07, "loss": 0.0826223611831665, "step": 3990 }, { "epoch": 0.5997057693569092, "grad_norm": 0.330078125, "learning_rate": 7.907603796586793e-07, "loss": 0.08745207786560058, "step": 4000 }, { "epoch": 0.6012050337803015, "grad_norm": 0.205078125, "learning_rate": 7.857840995479237e-07, "loss": 0.06742951273918152, "step": 4010 }, { "epoch": 0.6027042982036939, "grad_norm": 0.296875, "learning_rate": 7.808133731086063e-07, "loss": 0.10504342317581176, "step": 4020 }, { "epoch": 0.6042035626270861, "grad_norm": 0.34765625, "learning_rate": 7.758483292096928e-07, "loss": 0.10398197174072266, "step": 4030 }, { "epoch": 0.6057028270504784, "grad_norm": 0.28515625, "learning_rate": 7.708890965728249e-07, "loss": 0.11235659122467041, "step": 4040 }, { "epoch": 0.6072020914738706, "grad_norm": 0.28515625, "learning_rate": 7.659358037689845e-07, "loss": 0.10213931798934936, "step": 4050 }, { "epoch": 0.6087013558972629, "grad_norm": 0.2314453125, "learning_rate": 7.609885792151602e-07, "loss": 0.09277363419532776, "step": 4060 }, { "epoch": 0.6102006203206551, "grad_norm": 0.279296875, "learning_rate": 7.560475511710174e-07, "loss": 0.08845908641815185, "step": 4070 }, { "epoch": 0.6116998847440475, "grad_norm": 0.2275390625, "learning_rate": 7.511128477355728e-07, "loss": 0.06152995824813843, "step": 4080 }, { "epoch": 0.6131991491674397, "grad_norm": 0.1982421875, "learning_rate": 7.461845968438753e-07, "loss": 0.0993484079837799, "step": 4090 }, { "epoch": 0.614698413590832, "grad_norm": 0.232421875, "learning_rate": 7.412629262636861e-07, "loss": 0.08685197830200195, "step": 4100 }, { "epoch": 0.6161976780142243, "grad_norm": 0.203125, "learning_rate": 7.363479635921693e-07, "loss": 0.10489131212234497, "step": 4110 }, { "epoch": 0.6176969424376165, "grad_norm": 0.2265625, "learning_rate": 7.314398362525827e-07, "loss": 0.0976183295249939, "step": 4120 }, { "epoch": 0.6191962068610088, "grad_norm": 0.318359375, "learning_rate": 7.265386714909732e-07, "loss": 0.10362049341201782, "step": 4130 }, { "epoch": 0.6206954712844011, "grad_norm": 0.21875, "learning_rate": 7.216445963728795e-07, "loss": 0.09439095258712768, "step": 4140 }, { "epoch": 0.6221947357077934, "grad_norm": 0.20703125, "learning_rate": 7.167577377800372e-07, "loss": 0.07266764044761657, "step": 4150 }, { "epoch": 0.6236940001311856, "grad_norm": 0.2021484375, "learning_rate": 7.118782224070886e-07, "loss": 0.08935718536376953, "step": 4160 }, { "epoch": 0.6251932645545779, "grad_norm": 0.27734375, "learning_rate": 7.070061767582993e-07, "loss": 0.09530102014541626, "step": 4170 }, { "epoch": 0.6266925289779702, "grad_norm": 0.205078125, "learning_rate": 7.021417271442786e-07, "loss": 0.08460386395454407, "step": 4180 }, { "epoch": 0.6281917934013624, "grad_norm": 0.25390625, "learning_rate": 6.972849996787029e-07, "loss": 0.09141365885734558, "step": 4190 }, { "epoch": 0.6296910578247548, "grad_norm": 0.18359375, "learning_rate": 6.924361202750484e-07, "loss": 0.09532070755958558, "step": 4200 }, { "epoch": 0.631190322248147, "grad_norm": 0.2158203125, "learning_rate": 6.875952146433252e-07, "loss": 0.09375123977661133, "step": 4210 }, { "epoch": 0.6326895866715393, "grad_norm": 0.2158203125, "learning_rate": 6.827624082868191e-07, "loss": 0.07426313161849976, "step": 4220 }, { "epoch": 0.6341888510949315, "grad_norm": 0.267578125, "learning_rate": 6.779378264988369e-07, "loss": 0.09327669143676758, "step": 4230 }, { "epoch": 0.6356881155183238, "grad_norm": 0.3046875, "learning_rate": 6.731215943594597e-07, "loss": 0.08692552447319031, "step": 4240 }, { "epoch": 0.6371873799417161, "grad_norm": 0.283203125, "learning_rate": 6.683138367322982e-07, "loss": 0.0770199477672577, "step": 4250 }, { "epoch": 0.6386866443651084, "grad_norm": 0.220703125, "learning_rate": 6.635146782612568e-07, "loss": 0.07209202647209167, "step": 4260 }, { "epoch": 0.6401859087885007, "grad_norm": 0.2060546875, "learning_rate": 6.587242433673023e-07, "loss": 0.07247981429100037, "step": 4270 }, { "epoch": 0.6416851732118929, "grad_norm": 0.19921875, "learning_rate": 6.539426562452364e-07, "loss": 0.07441559433937073, "step": 4280 }, { "epoch": 0.6431844376352852, "grad_norm": 0.2021484375, "learning_rate": 6.491700408604781e-07, "loss": 0.0830713927745819, "step": 4290 }, { "epoch": 0.6446837020586774, "grad_norm": 0.1845703125, "learning_rate": 6.444065209458494e-07, "loss": 0.0942071557044983, "step": 4300 }, { "epoch": 0.6461829664820697, "grad_norm": 0.259765625, "learning_rate": 6.396522199983659e-07, "loss": 0.08134819865226746, "step": 4310 }, { "epoch": 0.647682230905462, "grad_norm": 0.236328125, "learning_rate": 6.349072612760366e-07, "loss": 0.10018385648727417, "step": 4320 }, { "epoch": 0.6491814953288543, "grad_norm": 0.228515625, "learning_rate": 6.301717677946678e-07, "loss": 0.09734719395637512, "step": 4330 }, { "epoch": 0.6506807597522466, "grad_norm": 0.2431640625, "learning_rate": 6.254458623246745e-07, "loss": 0.0996459424495697, "step": 4340 }, { "epoch": 0.6521800241756388, "grad_norm": 0.2236328125, "learning_rate": 6.207296673878957e-07, "loss": 0.070529043674469, "step": 4350 }, { "epoch": 0.6536792885990311, "grad_norm": 0.20703125, "learning_rate": 6.160233052544206e-07, "loss": 0.07517372369766236, "step": 4360 }, { "epoch": 0.6551785530224233, "grad_norm": 0.25390625, "learning_rate": 6.113268979394162e-07, "loss": 0.08323991298675537, "step": 4370 }, { "epoch": 0.6566778174458157, "grad_norm": 0.2294921875, "learning_rate": 6.066405671999657e-07, "loss": 0.09829720854759216, "step": 4380 }, { "epoch": 0.6581770818692079, "grad_norm": 0.30859375, "learning_rate": 6.019644345319108e-07, "loss": 0.06705747246742248, "step": 4390 }, { "epoch": 0.6596763462926002, "grad_norm": 0.326171875, "learning_rate": 5.972986211667032e-07, "loss": 0.08918554186820984, "step": 4400 }, { "epoch": 0.6611756107159925, "grad_norm": 0.193359375, "learning_rate": 5.92643248068259e-07, "loss": 0.0527131199836731, "step": 4410 }, { "epoch": 0.6626748751393847, "grad_norm": 0.2109375, "learning_rate": 5.87998435929826e-07, "loss": 0.061626529693603514, "step": 4420 }, { "epoch": 0.664174139562777, "grad_norm": 0.2314453125, "learning_rate": 5.83364305170852e-07, "loss": 0.10371142625808716, "step": 4430 }, { "epoch": 0.6656734039861693, "grad_norm": 0.2236328125, "learning_rate": 5.787409759338644e-07, "loss": 0.08246560096740722, "step": 4440 }, { "epoch": 0.6671726684095616, "grad_norm": 0.2099609375, "learning_rate": 5.741285680813544e-07, "loss": 0.07695434689521789, "step": 4450 }, { "epoch": 0.6686719328329538, "grad_norm": 0.1982421875, "learning_rate": 5.695272011926701e-07, "loss": 0.06416907906532288, "step": 4460 }, { "epoch": 0.6701711972563461, "grad_norm": 0.310546875, "learning_rate": 5.649369945609169e-07, "loss": 0.05495827198028565, "step": 4470 }, { "epoch": 0.6716704616797384, "grad_norm": 0.224609375, "learning_rate": 5.603580671898629e-07, "loss": 0.07965745329856873, "step": 4480 }, { "epoch": 0.6731697261031306, "grad_norm": 0.322265625, "learning_rate": 5.557905377908558e-07, "loss": 0.10348300933837891, "step": 4490 }, { "epoch": 0.674668990526523, "grad_norm": 0.337890625, "learning_rate": 5.512345247797437e-07, "loss": 0.11305124759674072, "step": 4500 }, { "epoch": 0.6761682549499152, "grad_norm": 0.2119140625, "learning_rate": 5.466901462738057e-07, "loss": 0.06318964958190917, "step": 4510 }, { "epoch": 0.6776675193733075, "grad_norm": 0.23046875, "learning_rate": 5.421575200886899e-07, "loss": 0.10519200563430786, "step": 4520 }, { "epoch": 0.6791667837966997, "grad_norm": 0.1787109375, "learning_rate": 5.376367637353586e-07, "loss": 0.08189275860786438, "step": 4530 }, { "epoch": 0.680666048220092, "grad_norm": 0.498046875, "learning_rate": 5.331279944170417e-07, "loss": 0.09210953116416931, "step": 4540 }, { "epoch": 0.6821653126434843, "grad_norm": 0.189453125, "learning_rate": 5.286313290261982e-07, "loss": 0.07461657524108886, "step": 4550 }, { "epoch": 0.6836645770668766, "grad_norm": 0.30859375, "learning_rate": 5.24146884141486e-07, "loss": 0.09393454194068909, "step": 4560 }, { "epoch": 0.6851638414902689, "grad_norm": 0.22265625, "learning_rate": 5.19674776024739e-07, "loss": 0.08053632378578186, "step": 4570 }, { "epoch": 0.6866631059136611, "grad_norm": 0.294921875, "learning_rate": 5.152151206179538e-07, "loss": 0.07931421399116516, "step": 4580 }, { "epoch": 0.6881623703370534, "grad_norm": 0.236328125, "learning_rate": 5.107680335402824e-07, "loss": 0.09329952597618103, "step": 4590 }, { "epoch": 0.6896616347604456, "grad_norm": 0.240234375, "learning_rate": 5.063336300850362e-07, "loss": 0.07256720066070557, "step": 4600 }, { "epoch": 0.6911608991838379, "grad_norm": 0.255859375, "learning_rate": 5.019120252166966e-07, "loss": 0.07386515140533448, "step": 4610 }, { "epoch": 0.6926601636072302, "grad_norm": 0.1904296875, "learning_rate": 4.975033335679332e-07, "loss": 0.0855524480342865, "step": 4620 }, { "epoch": 0.6941594280306225, "grad_norm": 0.220703125, "learning_rate": 4.931076694366337e-07, "loss": 0.08902753591537475, "step": 4630 }, { "epoch": 0.6956586924540148, "grad_norm": 0.2236328125, "learning_rate": 4.887251467829398e-07, "loss": 0.09814743995666504, "step": 4640 }, { "epoch": 0.697157956877407, "grad_norm": 0.294921875, "learning_rate": 4.843558792262924e-07, "loss": 0.09769907593727112, "step": 4650 }, { "epoch": 0.6986572213007993, "grad_norm": 0.294921875, "learning_rate": 4.799999800424867e-07, "loss": 0.12376710176467895, "step": 4660 }, { "epoch": 0.7001564857241915, "grad_norm": 0.2158203125, "learning_rate": 4.7565756216073505e-07, "loss": 0.07605620622634887, "step": 4670 }, { "epoch": 0.7016557501475839, "grad_norm": 0.296875, "learning_rate": 4.713287381607389e-07, "loss": 0.09146468043327331, "step": 4680 }, { "epoch": 0.7031550145709761, "grad_norm": 0.2001953125, "learning_rate": 4.670136202697706e-07, "loss": 0.11566205024719238, "step": 4690 }, { "epoch": 0.7046542789943684, "grad_norm": 0.2099609375, "learning_rate": 4.6271232035976395e-07, "loss": 0.07541021108627319, "step": 4700 }, { "epoch": 0.7061535434177607, "grad_norm": 0.2255859375, "learning_rate": 4.5842494994441315e-07, "loss": 0.10867191553115844, "step": 4710 }, { "epoch": 0.7076528078411529, "grad_norm": 0.298828125, "learning_rate": 4.541516201762824e-07, "loss": 0.08358562588691712, "step": 4720 }, { "epoch": 0.7091520722645452, "grad_norm": 0.2158203125, "learning_rate": 4.4989244184392405e-07, "loss": 0.10019409656524658, "step": 4730 }, { "epoch": 0.7106513366879375, "grad_norm": 0.2353515625, "learning_rate": 4.456475253690061e-07, "loss": 0.08848651647567748, "step": 4740 }, { "epoch": 0.7121506011113298, "grad_norm": 0.201171875, "learning_rate": 4.414169808034496e-07, "loss": 0.07086822390556335, "step": 4750 }, { "epoch": 0.713649865534722, "grad_norm": 0.255859375, "learning_rate": 4.3720091782657574e-07, "loss": 0.1078036069869995, "step": 4760 }, { "epoch": 0.7151491299581143, "grad_norm": 0.2314453125, "learning_rate": 4.32999445742262e-07, "loss": 0.09499780535697937, "step": 4770 }, { "epoch": 0.7166483943815066, "grad_norm": 0.2431640625, "learning_rate": 4.2881267347610837e-07, "loss": 0.08308950662612916, "step": 4780 }, { "epoch": 0.7181476588048988, "grad_norm": 0.2890625, "learning_rate": 4.2464070957261375e-07, "loss": 0.08044061660766602, "step": 4790 }, { "epoch": 0.7196469232282912, "grad_norm": 0.2001953125, "learning_rate": 4.204836621923618e-07, "loss": 0.06061916947364807, "step": 4800 }, { "epoch": 0.7211461876516834, "grad_norm": 0.2490234375, "learning_rate": 4.1634163910921606e-07, "loss": 0.10452162027359009, "step": 4810 }, { "epoch": 0.7226454520750757, "grad_norm": 0.2158203125, "learning_rate": 4.1221474770752696e-07, "loss": 0.0969232976436615, "step": 4820 }, { "epoch": 0.7241447164984679, "grad_norm": 0.1728515625, "learning_rate": 4.081030949793471e-07, "loss": 0.07360079884529114, "step": 4830 }, { "epoch": 0.7256439809218602, "grad_norm": 0.1943359375, "learning_rate": 4.0400678752165807e-07, "loss": 0.08355346322059631, "step": 4840 }, { "epoch": 0.7271432453452524, "grad_norm": 0.279296875, "learning_rate": 3.9992593153360563e-07, "loss": 0.07457499504089356, "step": 4850 }, { "epoch": 0.7286425097686448, "grad_norm": 0.314453125, "learning_rate": 3.9586063281374796e-07, "loss": 0.0845346987247467, "step": 4860 }, { "epoch": 0.7301417741920371, "grad_norm": 0.2275390625, "learning_rate": 3.9181099675731154e-07, "loss": 0.07429866194725036, "step": 4870 }, { "epoch": 0.7316410386154293, "grad_norm": 0.2255859375, "learning_rate": 3.8777712835345966e-07, "loss": 0.05976992845535278, "step": 4880 }, { "epoch": 0.7331403030388216, "grad_norm": 0.1884765625, "learning_rate": 3.837591321825696e-07, "loss": 0.07514649033546447, "step": 4890 }, { "epoch": 0.7346395674622138, "grad_norm": 0.22265625, "learning_rate": 3.7975711241352224e-07, "loss": 0.0838453233242035, "step": 4900 }, { "epoch": 0.7361388318856061, "grad_norm": 0.28125, "learning_rate": 3.757711728010007e-07, "loss": 0.08041094541549683, "step": 4910 }, { "epoch": 0.7376380963089983, "grad_norm": 0.271484375, "learning_rate": 3.7180141668280065e-07, "loss": 0.0707211971282959, "step": 4920 }, { "epoch": 0.7391373607323907, "grad_norm": 0.2109375, "learning_rate": 3.678479469771516e-07, "loss": 0.09502058625221252, "step": 4930 }, { "epoch": 0.740636625155783, "grad_norm": 0.25390625, "learning_rate": 3.639108661800482e-07, "loss": 0.09508728384971618, "step": 4940 }, { "epoch": 0.7421358895791752, "grad_norm": 0.26953125, "learning_rate": 3.59990276362593e-07, "loss": 0.07535126805305481, "step": 4950 }, { "epoch": 0.7436351540025675, "grad_norm": 0.271484375, "learning_rate": 3.5608627916835077e-07, "loss": 0.07866016626358033, "step": 4960 }, { "epoch": 0.7451344184259597, "grad_norm": 0.177734375, "learning_rate": 3.521989758107122e-07, "loss": 0.10100013017654419, "step": 4970 }, { "epoch": 0.746633682849352, "grad_norm": 0.365234375, "learning_rate": 3.4832846707027144e-07, "loss": 0.08256787061691284, "step": 4980 }, { "epoch": 0.7481329472727443, "grad_norm": 0.185546875, "learning_rate": 3.444748532922116e-07, "loss": 0.08142110109329223, "step": 4990 }, { "epoch": 0.7496322116961366, "grad_norm": 0.1806640625, "learning_rate": 3.4063823438370477e-07, "loss": 0.09730502367019653, "step": 5000 }, { "epoch": 0.7511314761195289, "grad_norm": 0.2578125, "learning_rate": 3.3681870981132076e-07, "loss": 0.051060861349105834, "step": 5010 }, { "epoch": 0.7526307405429211, "grad_norm": 0.29296875, "learning_rate": 3.330163785984491e-07, "loss": 0.07702358365058899, "step": 5020 }, { "epoch": 0.7541300049663134, "grad_norm": 0.25, "learning_rate": 3.292313393227313e-07, "loss": 0.07249666452407837, "step": 5030 }, { "epoch": 0.7556292693897056, "grad_norm": 0.2119140625, "learning_rate": 3.254636901135055e-07, "loss": 0.08777963519096374, "step": 5040 }, { "epoch": 0.757128533813098, "grad_norm": 0.26171875, "learning_rate": 3.2171352864926216e-07, "loss": 0.09629991054534912, "step": 5050 }, { "epoch": 0.7586277982364902, "grad_norm": 0.400390625, "learning_rate": 3.179809521551119e-07, "loss": 0.07828204035758972, "step": 5060 }, { "epoch": 0.7601270626598825, "grad_norm": 0.2236328125, "learning_rate": 3.142660574002648e-07, "loss": 0.06039868593215943, "step": 5070 }, { "epoch": 0.7616263270832748, "grad_norm": 0.26171875, "learning_rate": 3.1056894069552154e-07, "loss": 0.06850762367248535, "step": 5080 }, { "epoch": 0.763125591506667, "grad_norm": 0.25390625, "learning_rate": 3.0688969789077656e-07, "loss": 0.07535871863365173, "step": 5090 }, { "epoch": 0.7646248559300594, "grad_norm": 0.2275390625, "learning_rate": 3.0322842437253303e-07, "loss": 0.0845901370048523, "step": 5100 }, { "epoch": 0.7661241203534516, "grad_norm": 0.267578125, "learning_rate": 2.9958521506143006e-07, "loss": 0.09275015592575073, "step": 5110 }, { "epoch": 0.7676233847768439, "grad_norm": 0.24609375, "learning_rate": 2.9596016440978175e-07, "loss": 0.10449213981628418, "step": 5120 }, { "epoch": 0.7691226492002361, "grad_norm": 0.2060546875, "learning_rate": 2.923533663991282e-07, "loss": 0.08837388157844543, "step": 5130 }, { "epoch": 0.7706219136236284, "grad_norm": 0.330078125, "learning_rate": 2.8876491453779936e-07, "loss": 0.09125276803970336, "step": 5140 }, { "epoch": 0.7721211780470206, "grad_norm": 0.2734375, "learning_rate": 2.851949018584906e-07, "loss": 0.0870974063873291, "step": 5150 }, { "epoch": 0.773620442470413, "grad_norm": 1.109375, "learning_rate": 2.816434209158508e-07, "loss": 0.11278444528579712, "step": 5160 }, { "epoch": 0.7751197068938053, "grad_norm": 0.2431640625, "learning_rate": 2.781105637840829e-07, "loss": 0.11417597532272339, "step": 5170 }, { "epoch": 0.7766189713171975, "grad_norm": 0.2080078125, "learning_rate": 2.7459642205455657e-07, "loss": 0.0695708453655243, "step": 5180 }, { "epoch": 0.7781182357405898, "grad_norm": 0.294921875, "learning_rate": 2.71101086833434e-07, "loss": 0.07352896332740784, "step": 5190 }, { "epoch": 0.779617500163982, "grad_norm": 0.265625, "learning_rate": 2.6762464873930754e-07, "loss": 0.09707750678062439, "step": 5200 }, { "epoch": 0.7811167645873743, "grad_norm": 0.1865234375, "learning_rate": 2.6416719790085084e-07, "loss": 0.09525392651557922, "step": 5210 }, { "epoch": 0.7826160290107665, "grad_norm": 0.291015625, "learning_rate": 2.607288239544817e-07, "loss": 0.10324461460113525, "step": 5220 }, { "epoch": 0.7841152934341589, "grad_norm": 0.2490234375, "learning_rate": 2.573096160420386e-07, "loss": 0.056819206476211546, "step": 5230 }, { "epoch": 0.7856145578575512, "grad_norm": 0.1875, "learning_rate": 2.5390966280846925e-07, "loss": 0.07321354150772094, "step": 5240 }, { "epoch": 0.7871138222809434, "grad_norm": 0.2177734375, "learning_rate": 2.505290523995329e-07, "loss": 0.05529284477233887, "step": 5250 }, { "epoch": 0.7886130867043357, "grad_norm": 0.28515625, "learning_rate": 2.4716787245951465e-07, "loss": 0.08749927282333374, "step": 5260 }, { "epoch": 0.7901123511277279, "grad_norm": 0.251953125, "learning_rate": 2.4382621012895367e-07, "loss": 0.10226259231567383, "step": 5270 }, { "epoch": 0.7916116155511203, "grad_norm": 0.369140625, "learning_rate": 2.405041520423835e-07, "loss": 0.08864956498146057, "step": 5280 }, { "epoch": 0.7931108799745125, "grad_norm": 0.2197265625, "learning_rate": 2.372017843260864e-07, "loss": 0.10684455633163452, "step": 5290 }, { "epoch": 0.7946101443979048, "grad_norm": 0.1884765625, "learning_rate": 2.3391919259586057e-07, "loss": 0.09059134125709534, "step": 5300 }, { "epoch": 0.7961094088212971, "grad_norm": 0.2158203125, "learning_rate": 2.3065646195479992e-07, "loss": 0.07700026631355286, "step": 5310 }, { "epoch": 0.7976086732446893, "grad_norm": 0.37890625, "learning_rate": 2.2741367699108839e-07, "loss": 0.08473354578018188, "step": 5320 }, { "epoch": 0.7991079376680816, "grad_norm": 0.2265625, "learning_rate": 2.2419092177580666e-07, "loss": 0.07873227596282958, "step": 5330 }, { "epoch": 0.8006072020914738, "grad_norm": 0.26953125, "learning_rate": 2.209882798607523e-07, "loss": 0.09732807874679565, "step": 5340 }, { "epoch": 0.8021064665148662, "grad_norm": 0.26953125, "learning_rate": 2.178058342762743e-07, "loss": 0.10025830268859863, "step": 5350 }, { "epoch": 0.8036057309382584, "grad_norm": 0.263671875, "learning_rate": 2.1464366752911979e-07, "loss": 0.09230310916900634, "step": 5360 }, { "epoch": 0.8051049953616507, "grad_norm": 0.2353515625, "learning_rate": 2.1150186160029525e-07, "loss": 0.06340540051460267, "step": 5370 }, { "epoch": 0.8066042597850429, "grad_norm": 0.26953125, "learning_rate": 2.0838049794294132e-07, "loss": 0.10046428442001343, "step": 5380 }, { "epoch": 0.8081035242084352, "grad_norm": 0.220703125, "learning_rate": 2.052796574802209e-07, "loss": 0.06854251027107239, "step": 5390 }, { "epoch": 0.8096027886318276, "grad_norm": 0.2216796875, "learning_rate": 2.0219942060322114e-07, "loss": 0.08301514387130737, "step": 5400 }, { "epoch": 0.8111020530552198, "grad_norm": 0.2734375, "learning_rate": 1.99139867168869e-07, "loss": 0.06499930620193481, "step": 5410 }, { "epoch": 0.8126013174786121, "grad_norm": 0.275390625, "learning_rate": 1.9610107649786167e-07, "loss": 0.08899691700935364, "step": 5420 }, { "epoch": 0.8141005819020043, "grad_norm": 0.1923828125, "learning_rate": 1.9308312737260934e-07, "loss": 0.06367949843406677, "step": 5430 }, { "epoch": 0.8155998463253966, "grad_norm": 0.2578125, "learning_rate": 1.9008609803519304e-07, "loss": 0.09109672904014587, "step": 5440 }, { "epoch": 0.8170991107487888, "grad_norm": 0.2373046875, "learning_rate": 1.871100661853363e-07, "loss": 0.0652251660823822, "step": 5450 }, { "epoch": 0.8185983751721811, "grad_norm": 0.263671875, "learning_rate": 1.841551089783907e-07, "loss": 0.10543818473815918, "step": 5460 }, { "epoch": 0.8200976395955735, "grad_norm": 0.2333984375, "learning_rate": 1.8122130302333517e-07, "loss": 0.07551140189170838, "step": 5470 }, { "epoch": 0.8215969040189657, "grad_norm": 0.255859375, "learning_rate": 1.7830872438079048e-07, "loss": 0.07271650433540344, "step": 5480 }, { "epoch": 0.823096168442358, "grad_norm": 0.21484375, "learning_rate": 1.7541744856104667e-07, "loss": 0.07429500818252563, "step": 5490 }, { "epoch": 0.8245954328657502, "grad_norm": 0.287109375, "learning_rate": 1.7254755052210624e-07, "loss": 0.06771766543388366, "step": 5500 }, { "epoch": 0.8260946972891425, "grad_norm": 0.3046875, "learning_rate": 1.6969910466773973e-07, "loss": 0.11255881786346436, "step": 5510 }, { "epoch": 0.8275939617125347, "grad_norm": 0.2080078125, "learning_rate": 1.66872184845558e-07, "loss": 0.07378043532371521, "step": 5520 }, { "epoch": 0.8290932261359271, "grad_norm": 0.2236328125, "learning_rate": 1.6406686434509644e-07, "loss": 0.06890552639961242, "step": 5530 }, { "epoch": 0.8305924905593194, "grad_norm": 0.2060546875, "learning_rate": 1.6128321589591587e-07, "loss": 0.08552584648132325, "step": 5540 }, { "epoch": 0.8320917549827116, "grad_norm": 0.326171875, "learning_rate": 1.5852131166571648e-07, "loss": 0.08140406608581544, "step": 5550 }, { "epoch": 0.8335910194061039, "grad_norm": 0.251953125, "learning_rate": 1.55781223258467e-07, "loss": 0.09987716674804688, "step": 5560 }, { "epoch": 0.8350902838294961, "grad_norm": 0.1982421875, "learning_rate": 1.5306302171254836e-07, "loss": 0.0620901346206665, "step": 5570 }, { "epoch": 0.8365895482528884, "grad_norm": 0.263671875, "learning_rate": 1.503667774989119e-07, "loss": 0.07742155194282532, "step": 5580 }, { "epoch": 0.8380888126762807, "grad_norm": 0.27734375, "learning_rate": 1.4769256051925228e-07, "loss": 0.09683317542076111, "step": 5590 }, { "epoch": 0.839588077099673, "grad_norm": 0.2177734375, "learning_rate": 1.4504044010419513e-07, "loss": 0.10250561237335205, "step": 5600 }, { "epoch": 0.8410873415230652, "grad_norm": 0.2314453125, "learning_rate": 1.4241048501150088e-07, "loss": 0.0593035876750946, "step": 5610 }, { "epoch": 0.8425866059464575, "grad_norm": 0.33203125, "learning_rate": 1.3980276342427966e-07, "loss": 0.07098089456558228, "step": 5620 }, { "epoch": 0.8440858703698498, "grad_norm": 0.25, "learning_rate": 1.3721734294922594e-07, "loss": 0.08620147705078125, "step": 5630 }, { "epoch": 0.845585134793242, "grad_norm": 0.2138671875, "learning_rate": 1.346542906148649e-07, "loss": 0.08298314213752747, "step": 5640 }, { "epoch": 0.8470843992166344, "grad_norm": 0.34765625, "learning_rate": 1.3211367286981458e-07, "loss": 0.1136427640914917, "step": 5650 }, { "epoch": 0.8485836636400266, "grad_norm": 0.25, "learning_rate": 1.2959555558106282e-07, "loss": 0.0708082675933838, "step": 5660 }, { "epoch": 0.8500829280634189, "grad_norm": 0.25390625, "learning_rate": 1.271000040322614e-07, "loss": 0.09266042709350586, "step": 5670 }, { "epoch": 0.8515821924868111, "grad_norm": 0.298828125, "learning_rate": 1.2462708292203062e-07, "loss": 0.09188313484191894, "step": 5680 }, { "epoch": 0.8530814569102034, "grad_norm": 0.3046875, "learning_rate": 1.2217685636228447e-07, "loss": 0.11194919347763062, "step": 5690 }, { "epoch": 0.8545807213335957, "grad_norm": 0.259765625, "learning_rate": 1.1974938787656742e-07, "loss": 0.0845366358757019, "step": 5700 }, { "epoch": 0.856079985756988, "grad_norm": 0.28515625, "learning_rate": 1.1734474039840737e-07, "loss": 0.07923954129219055, "step": 5710 }, { "epoch": 0.8575792501803803, "grad_norm": 0.306640625, "learning_rate": 1.1496297626968465e-07, "loss": 0.09228439927101136, "step": 5720 }, { "epoch": 0.8590785146037725, "grad_norm": 0.2314453125, "learning_rate": 1.1260415723901584e-07, "loss": 0.08742096424102783, "step": 5730 }, { "epoch": 0.8605777790271648, "grad_norm": 0.2353515625, "learning_rate": 1.1026834446015177e-07, "loss": 0.07722960710525513, "step": 5740 }, { "epoch": 0.862077043450557, "grad_norm": 0.2060546875, "learning_rate": 1.0795559849039315e-07, "loss": 0.08857112526893615, "step": 5750 }, { "epoch": 0.8635763078739493, "grad_norm": 0.205078125, "learning_rate": 1.0566597928902043e-07, "loss": 0.06474360227584838, "step": 5760 }, { "epoch": 0.8650755722973417, "grad_norm": 0.29296875, "learning_rate": 1.033995462157392e-07, "loss": 0.09699549674987792, "step": 5770 }, { "epoch": 0.8665748367207339, "grad_norm": 0.2451171875, "learning_rate": 1.0115635802914101e-07, "loss": 0.07245502471923829, "step": 5780 }, { "epoch": 0.8680741011441262, "grad_norm": 0.26171875, "learning_rate": 9.89364728851807e-08, "loss": 0.07710716128349304, "step": 5790 }, { "epoch": 0.8695733655675184, "grad_norm": 0.294921875, "learning_rate": 9.673994833566746e-08, "loss": 0.07985681295394897, "step": 5800 }, { "epoch": 0.8710726299909107, "grad_norm": 0.212890625, "learning_rate": 9.456684132677418e-08, "loss": 0.07051183581352234, "step": 5810 }, { "epoch": 0.8725718944143029, "grad_norm": 0.2392578125, "learning_rate": 9.241720819756016e-08, "loss": 0.09385765790939331, "step": 5820 }, { "epoch": 0.8740711588376953, "grad_norm": 0.302734375, "learning_rate": 9.029110467851076e-08, "loss": 0.07226101160049439, "step": 5830 }, { "epoch": 0.8755704232610876, "grad_norm": 0.224609375, "learning_rate": 8.818858589009248e-08, "loss": 0.07575808763504029, "step": 5840 }, { "epoch": 0.8770696876844798, "grad_norm": 0.1962890625, "learning_rate": 8.610970634132465e-08, "loss": 0.07295922040939332, "step": 5850 }, { "epoch": 0.8785689521078721, "grad_norm": 0.291015625, "learning_rate": 8.405451992836442e-08, "loss": 0.08540709614753723, "step": 5860 }, { "epoch": 0.8800682165312643, "grad_norm": 0.240234375, "learning_rate": 8.202307993311153e-08, "loss": 0.08457719087600708, "step": 5870 }, { "epoch": 0.8815674809546566, "grad_norm": 0.224609375, "learning_rate": 8.001543902182594e-08, "loss": 0.06852260828018189, "step": 5880 }, { "epoch": 0.8830667453780489, "grad_norm": 0.208984375, "learning_rate": 7.803164924376248e-08, "loss": 0.0945811927318573, "step": 5890 }, { "epoch": 0.8845660098014412, "grad_norm": 0.2734375, "learning_rate": 7.607176202982112e-08, "loss": 0.07205227017402649, "step": 5900 }, { "epoch": 0.8860652742248334, "grad_norm": 0.25390625, "learning_rate": 7.413582819121511e-08, "loss": 0.08640796542167664, "step": 5910 }, { "epoch": 0.8875645386482257, "grad_norm": 0.2060546875, "learning_rate": 7.22238979181512e-08, "loss": 0.0951160728931427, "step": 5920 }, { "epoch": 0.889063803071618, "grad_norm": 0.21484375, "learning_rate": 7.033602077853052e-08, "loss": 0.07211223244667053, "step": 5930 }, { "epoch": 0.8905630674950102, "grad_norm": 0.2373046875, "learning_rate": 6.847224571666277e-08, "loss": 0.07400254607200622, "step": 5940 }, { "epoch": 0.8920623319184026, "grad_norm": 0.298828125, "learning_rate": 6.663262105199718e-08, "loss": 0.09436286687850952, "step": 5950 }, { "epoch": 0.8935615963417948, "grad_norm": 0.255859375, "learning_rate": 6.481719447786971e-08, "loss": 0.07624666690826416, "step": 5960 }, { "epoch": 0.8950608607651871, "grad_norm": 0.25, "learning_rate": 6.302601306026755e-08, "loss": 0.08409606218338013, "step": 5970 }, { "epoch": 0.8965601251885793, "grad_norm": 0.2265625, "learning_rate": 6.125912323660709e-08, "loss": 0.07607480883598328, "step": 5980 }, { "epoch": 0.8980593896119716, "grad_norm": 0.2412109375, "learning_rate": 5.951657081453176e-08, "loss": 0.08595433235168456, "step": 5990 }, { "epoch": 0.899558654035364, "grad_norm": 0.181640625, "learning_rate": 5.7798400970723634e-08, "loss": 0.0745903193950653, "step": 6000 }, { "epoch": 0.9010579184587562, "grad_norm": 0.2392578125, "learning_rate": 5.610465824973232e-08, "loss": 0.07999681830406188, "step": 6010 }, { "epoch": 0.9025571828821485, "grad_norm": 0.205078125, "learning_rate": 5.443538656281954e-08, "loss": 0.08919501900672913, "step": 6020 }, { "epoch": 0.9040564473055407, "grad_norm": 0.20703125, "learning_rate": 5.279062918682253e-08, "loss": 0.07325602769851684, "step": 6030 }, { "epoch": 0.905555711728933, "grad_norm": 0.2236328125, "learning_rate": 5.117042876302946e-08, "loss": 0.07375933527946472, "step": 6040 }, { "epoch": 0.9070549761523252, "grad_norm": 0.30859375, "learning_rate": 4.9574827296075986e-08, "loss": 0.09143089056015015, "step": 6050 }, { "epoch": 0.9085542405757175, "grad_norm": 0.205078125, "learning_rate": 4.800386615285534e-08, "loss": 0.06721729636192322, "step": 6060 }, { "epoch": 0.9100535049991099, "grad_norm": 0.2314453125, "learning_rate": 4.645758606144623e-08, "loss": 0.0724267840385437, "step": 6070 }, { "epoch": 0.9115527694225021, "grad_norm": 0.263671875, "learning_rate": 4.49360271100564e-08, "loss": 0.09417140483856201, "step": 6080 }, { "epoch": 0.9130520338458944, "grad_norm": 0.2138671875, "learning_rate": 4.3439228745984493e-08, "loss": 0.10223345756530762, "step": 6090 }, { "epoch": 0.9145512982692866, "grad_norm": 0.259765625, "learning_rate": 4.196722977459566e-08, "loss": 0.08283578753471374, "step": 6100 }, { "epoch": 0.9160505626926789, "grad_norm": 0.3359375, "learning_rate": 4.0520068358317e-08, "loss": 0.11019489765167237, "step": 6110 }, { "epoch": 0.9175498271160711, "grad_norm": 0.21484375, "learning_rate": 3.9097782015647286e-08, "loss": 0.07297813296318054, "step": 6120 }, { "epoch": 0.9190490915394635, "grad_norm": 0.2060546875, "learning_rate": 3.7700407620184674e-08, "loss": 0.07638216018676758, "step": 6130 }, { "epoch": 0.9205483559628557, "grad_norm": 0.1962890625, "learning_rate": 3.632798139967064e-08, "loss": 0.09769478440284729, "step": 6140 }, { "epoch": 0.922047620386248, "grad_norm": 0.345703125, "learning_rate": 3.498053893505126e-08, "loss": 0.07059162259101867, "step": 6150 }, { "epoch": 0.9235468848096403, "grad_norm": 0.2080078125, "learning_rate": 3.365811515955319e-08, "loss": 0.10193029642105103, "step": 6160 }, { "epoch": 0.9250461492330325, "grad_norm": 0.2099609375, "learning_rate": 3.236074435777991e-08, "loss": 0.08877017498016357, "step": 6170 }, { "epoch": 0.9265454136564248, "grad_norm": 0.19921875, "learning_rate": 3.1088460164821694e-08, "loss": 0.07558783888816833, "step": 6180 }, { "epoch": 0.928044678079817, "grad_norm": 0.26953125, "learning_rate": 2.984129556538417e-08, "loss": 0.10496606826782226, "step": 6190 }, { "epoch": 0.9295439425032094, "grad_norm": 0.1826171875, "learning_rate": 2.8619282892932472e-08, "loss": 0.08706371784210205, "step": 6200 }, { "epoch": 0.9310432069266016, "grad_norm": 0.24609375, "learning_rate": 2.742245382885422e-08, "loss": 0.07445533275604248, "step": 6210 }, { "epoch": 0.9325424713499939, "grad_norm": 0.2392578125, "learning_rate": 2.6250839401636636e-08, "loss": 0.08374568819999695, "step": 6220 }, { "epoch": 0.9340417357733862, "grad_norm": 0.220703125, "learning_rate": 2.510446998606297e-08, "loss": 0.08437891006469726, "step": 6230 }, { "epoch": 0.9355410001967784, "grad_norm": 0.232421875, "learning_rate": 2.3983375302425445e-08, "loss": 0.06599584221839905, "step": 6240 }, { "epoch": 0.9370402646201708, "grad_norm": 0.390625, "learning_rate": 2.2887584415753558e-08, "loss": 0.08677806854248046, "step": 6250 }, { "epoch": 0.938539529043563, "grad_norm": 0.173828125, "learning_rate": 2.1817125735061448e-08, "loss": 0.057446730136871335, "step": 6260 }, { "epoch": 0.9400387934669553, "grad_norm": 0.279296875, "learning_rate": 2.0772027012611382e-08, "loss": 0.07344555258750915, "step": 6270 }, { "epoch": 0.9415380578903475, "grad_norm": 0.2333984375, "learning_rate": 1.975231534319366e-08, "loss": 0.061513519287109374, "step": 6280 }, { "epoch": 0.9430373223137398, "grad_norm": 0.234375, "learning_rate": 1.875801716342462e-08, "loss": 0.08662024140357971, "step": 6290 }, { "epoch": 0.9445365867371321, "grad_norm": 0.2890625, "learning_rate": 1.7789158251061087e-08, "loss": 0.08880329728126526, "step": 6300 }, { "epoch": 0.9460358511605244, "grad_norm": 0.23046875, "learning_rate": 1.684576372433222e-08, "loss": 0.08403295874595643, "step": 6310 }, { "epoch": 0.9475351155839167, "grad_norm": 0.2099609375, "learning_rate": 1.5927858041288154e-08, "loss": 0.07371333837509156, "step": 6320 }, { "epoch": 0.9490343800073089, "grad_norm": 0.255859375, "learning_rate": 1.503546499916608e-08, "loss": 0.0930757999420166, "step": 6330 }, { "epoch": 0.9505336444307012, "grad_norm": 0.2490234375, "learning_rate": 1.4168607733773042e-08, "loss": 0.09260554909706116, "step": 6340 }, { "epoch": 0.9520329088540934, "grad_norm": 0.23046875, "learning_rate": 1.3327308718886322e-08, "loss": 0.06500183939933776, "step": 6350 }, { "epoch": 0.9535321732774857, "grad_norm": 0.259765625, "learning_rate": 1.2511589765670682e-08, "loss": 0.12267719507217408, "step": 6360 }, { "epoch": 0.9550314377008781, "grad_norm": 0.23046875, "learning_rate": 1.1721472022113044e-08, "loss": 0.15489401817321777, "step": 6370 }, { "epoch": 0.9565307021242703, "grad_norm": 0.41015625, "learning_rate": 1.0956975972474136e-08, "loss": 0.08266881704330445, "step": 6380 }, { "epoch": 0.9580299665476626, "grad_norm": 0.21875, "learning_rate": 1.0218121436757266e-08, "loss": 0.062265390157699586, "step": 6390 }, { "epoch": 0.9595292309710548, "grad_norm": 0.3125, "learning_rate": 9.504927570194831e-09, "loss": 0.11146190166473388, "step": 6400 }, { "epoch": 0.9610284953944471, "grad_norm": 0.271484375, "learning_rate": 8.817412862751172e-09, "loss": 0.11401185989379883, "step": 6410 }, { "epoch": 0.9625277598178393, "grad_norm": 0.259765625, "learning_rate": 8.155595138644055e-09, "loss": 0.06964959502220154, "step": 6420 }, { "epoch": 0.9640270242412317, "grad_norm": 0.248046875, "learning_rate": 7.519491555881497e-09, "loss": 0.08737698793411255, "step": 6430 }, { "epoch": 0.9655262886646239, "grad_norm": 0.39453125, "learning_rate": 6.909118605817776e-09, "loss": 0.09992367029190063, "step": 6440 }, { "epoch": 0.9670255530880162, "grad_norm": 0.458984375, "learning_rate": 6.324492112725676e-09, "loss": 0.10620630979537964, "step": 6450 }, { "epoch": 0.9685248175114085, "grad_norm": 0.27734375, "learning_rate": 5.765627233386028e-09, "loss": 0.09715937972068786, "step": 6460 }, { "epoch": 0.9700240819348007, "grad_norm": 0.18359375, "learning_rate": 5.2325384566949126e-09, "loss": 0.07616119980812072, "step": 6470 }, { "epoch": 0.971523346358193, "grad_norm": 0.2041015625, "learning_rate": 4.725239603287856e-09, "loss": 0.08586298823356628, "step": 6480 }, { "epoch": 0.9730226107815853, "grad_norm": 0.2158203125, "learning_rate": 4.243743825181889e-09, "loss": 0.10227413177490234, "step": 6490 }, { "epoch": 0.9745218752049776, "grad_norm": 0.240234375, "learning_rate": 3.788063605434267e-09, "loss": 0.1260104298591614, "step": 6500 }, { "epoch": 0.9760211396283698, "grad_norm": 0.2236328125, "learning_rate": 3.358210757819058e-09, "loss": 0.0673329472541809, "step": 6510 }, { "epoch": 0.9775204040517621, "grad_norm": 0.216796875, "learning_rate": 2.9541964265203945e-09, "loss": 0.08001582026481628, "step": 6520 }, { "epoch": 0.9790196684751544, "grad_norm": 0.1962890625, "learning_rate": 2.5760310858441436e-09, "loss": 0.07936614751815796, "step": 6530 }, { "epoch": 0.9805189328985466, "grad_norm": 0.375, "learning_rate": 2.2237245399460147e-09, "loss": 0.08582746386528015, "step": 6540 }, { "epoch": 0.982018197321939, "grad_norm": 0.3046875, "learning_rate": 1.8972859225776517e-09, "loss": 0.0957387626171112, "step": 6550 }, { "epoch": 0.9835174617453312, "grad_norm": 0.232421875, "learning_rate": 1.596723696849489e-09, "loss": 0.06561749577522277, "step": 6560 }, { "epoch": 0.9850167261687235, "grad_norm": 0.244140625, "learning_rate": 1.3220456550113723e-09, "loss": 0.06993853449821472, "step": 6570 }, { "epoch": 0.9865159905921157, "grad_norm": 0.203125, "learning_rate": 1.073258918250941e-09, "loss": 0.0709548532962799, "step": 6580 }, { "epoch": 0.988015255015508, "grad_norm": 0.349609375, "learning_rate": 8.503699365084438e-10, "loss": 0.08012692332267761, "step": 6590 }, { "epoch": 0.9895145194389003, "grad_norm": 0.251953125, "learning_rate": 6.533844883102046e-10, "loss": 0.06314979791641236, "step": 6600 }, { "epoch": 0.9910137838622926, "grad_norm": 0.1884765625, "learning_rate": 4.823076806180771e-10, "loss": 0.06169562935829163, "step": 6610 }, { "epoch": 0.9925130482856849, "grad_norm": 0.2041015625, "learning_rate": 3.371439486975491e-10, "loss": 0.06976774334907532, "step": 6620 }, { "epoch": 0.9940123127090771, "grad_norm": 0.255859375, "learning_rate": 2.1789705600250287e-10, "loss": 0.06677849292755127, "step": 6630 }, { "epoch": 0.9955115771324694, "grad_norm": 0.2255859375, "learning_rate": 1.2457009407784714e-10, "loss": 0.06477210521697999, "step": 6640 }, { "epoch": 0.9970108415558616, "grad_norm": 0.2236328125, "learning_rate": 5.716548247902686e-11, "loss": 0.08652416467666627, "step": 6650 }, { "epoch": 0.9985101059792539, "grad_norm": 0.29296875, "learning_rate": 1.5684968709850367e-11, "loss": 0.0895546019077301, "step": 6660 }, { "epoch": 1.0, "grad_norm": 0.263671875, "learning_rate": 1.296281766371976e-13, "loss": 0.07611684799194336, "step": 6670 } ], "logging_steps": 10, "max_steps": 6670, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.263796339119374e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }