{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5000058835296471, "eval_steps": 500, "global_step": 10623, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047068237176847136, "grad_norm": 3.2121660709381104, "learning_rate": 6.269592476489028e-07, "loss": 0.3835, "step": 10 }, { "epoch": 0.0009413647435369427, "grad_norm": 2.1459333896636963, "learning_rate": 1.2539184952978056e-06, "loss": 0.3567, "step": 20 }, { "epoch": 0.0014120471153054141, "grad_norm": 1.4950684309005737, "learning_rate": 1.8808777429467086e-06, "loss": 0.2723, "step": 30 }, { "epoch": 0.0018827294870738854, "grad_norm": 1.0901622772216797, "learning_rate": 2.507836990595611e-06, "loss": 0.1799, "step": 40 }, { "epoch": 0.0023534118588423567, "grad_norm": 0.8344390988349915, "learning_rate": 3.1347962382445144e-06, "loss": 0.1276, "step": 50 }, { "epoch": 0.0028240942306108283, "grad_norm": 0.8088454604148865, "learning_rate": 3.7617554858934172e-06, "loss": 0.0945, "step": 60 }, { "epoch": 0.0032947766023792993, "grad_norm": 0.7016109824180603, "learning_rate": 4.3887147335423205e-06, "loss": 0.0785, "step": 70 }, { "epoch": 0.003765458974147771, "grad_norm": 0.6808176040649414, "learning_rate": 5.015673981191222e-06, "loss": 0.0777, "step": 80 }, { "epoch": 0.004236141345916242, "grad_norm": 0.637493908405304, "learning_rate": 5.642633228840125e-06, "loss": 0.0614, "step": 90 }, { "epoch": 0.0047068237176847135, "grad_norm": 0.6663233637809753, "learning_rate": 6.269592476489029e-06, "loss": 0.0647, "step": 100 }, { "epoch": 0.0051775060894531846, "grad_norm": 0.5964338183403015, "learning_rate": 6.896551724137932e-06, "loss": 0.0552, "step": 110 }, { "epoch": 0.0056481884612216565, "grad_norm": 0.5919690132141113, "learning_rate": 7.5235109717868345e-06, "loss": 0.0662, "step": 120 }, { "epoch": 0.006118870832990128, "grad_norm": 0.6082098484039307, "learning_rate": 8.150470219435737e-06, "loss": 0.0578, "step": 130 }, { "epoch": 0.006589553204758599, "grad_norm": 0.5305130481719971, "learning_rate": 8.777429467084641e-06, "loss": 0.0576, "step": 140 }, { "epoch": 0.00706023557652707, "grad_norm": 0.5822517275810242, "learning_rate": 9.404388714733543e-06, "loss": 0.0644, "step": 150 }, { "epoch": 0.007530917948295542, "grad_norm": 0.4910309314727783, "learning_rate": 1.0031347962382445e-05, "loss": 0.0507, "step": 160 }, { "epoch": 0.008001600320064013, "grad_norm": 0.47091054916381836, "learning_rate": 1.0658307210031348e-05, "loss": 0.0499, "step": 170 }, { "epoch": 0.008472282691832485, "grad_norm": 0.4046013057231903, "learning_rate": 1.128526645768025e-05, "loss": 0.0451, "step": 180 }, { "epoch": 0.008942965063600955, "grad_norm": 0.4918869137763977, "learning_rate": 1.1912225705329154e-05, "loss": 0.0493, "step": 190 }, { "epoch": 0.009413647435369427, "grad_norm": 0.5643543004989624, "learning_rate": 1.2539184952978058e-05, "loss": 0.0432, "step": 200 }, { "epoch": 0.009884329807137899, "grad_norm": 0.445388525724411, "learning_rate": 1.316614420062696e-05, "loss": 0.0508, "step": 210 }, { "epoch": 0.010355012178906369, "grad_norm": 0.4503282606601715, "learning_rate": 1.3793103448275863e-05, "loss": 0.052, "step": 220 }, { "epoch": 0.010825694550674841, "grad_norm": 0.3859306275844574, "learning_rate": 1.4420062695924765e-05, "loss": 0.0443, "step": 230 }, { "epoch": 0.011296376922443313, "grad_norm": 0.4950064420700073, "learning_rate": 1.5047021943573669e-05, "loss": 0.0414, "step": 240 }, { "epoch": 0.011767059294211783, "grad_norm": 0.43678030371665955, "learning_rate": 1.567398119122257e-05, "loss": 0.0416, "step": 250 }, { "epoch": 0.012237741665980255, "grad_norm": 0.3631042540073395, "learning_rate": 1.6300940438871475e-05, "loss": 0.0422, "step": 260 }, { "epoch": 0.012708424037748725, "grad_norm": 0.4427097737789154, "learning_rate": 1.6927899686520378e-05, "loss": 0.0346, "step": 270 }, { "epoch": 0.013179106409517197, "grad_norm": 0.44671741127967834, "learning_rate": 1.7554858934169282e-05, "loss": 0.0493, "step": 280 }, { "epoch": 0.01364978878128567, "grad_norm": 0.3654371201992035, "learning_rate": 1.8181818181818182e-05, "loss": 0.0462, "step": 290 }, { "epoch": 0.01412047115305414, "grad_norm": 0.4591113030910492, "learning_rate": 1.8808777429467086e-05, "loss": 0.0396, "step": 300 }, { "epoch": 0.014591153524822612, "grad_norm": 0.4140564799308777, "learning_rate": 1.943573667711599e-05, "loss": 0.0395, "step": 310 }, { "epoch": 0.015061835896591083, "grad_norm": 0.3526355028152466, "learning_rate": 1.999999953520864e-05, "loss": 0.0353, "step": 320 }, { "epoch": 0.015532518268359554, "grad_norm": 0.37384408712387085, "learning_rate": 1.9999943760297725e-05, "loss": 0.0357, "step": 330 }, { "epoch": 0.016003200640128026, "grad_norm": 0.39968496561050415, "learning_rate": 1.99997950277089e-05, "loss": 0.0384, "step": 340 }, { "epoch": 0.016473883011896496, "grad_norm": 0.33978167176246643, "learning_rate": 1.999955333882476e-05, "loss": 0.0409, "step": 350 }, { "epoch": 0.01694456538366497, "grad_norm": 0.3861013352870941, "learning_rate": 1.9999218695892e-05, "loss": 0.0345, "step": 360 }, { "epoch": 0.01741524775543344, "grad_norm": 0.26358628273010254, "learning_rate": 1.9998791102021396e-05, "loss": 0.0373, "step": 370 }, { "epoch": 0.01788593012720191, "grad_norm": 0.3237617611885071, "learning_rate": 1.999827056118779e-05, "loss": 0.0313, "step": 380 }, { "epoch": 0.018356612498970384, "grad_norm": 0.28069791197776794, "learning_rate": 1.9997657078230034e-05, "loss": 0.0314, "step": 390 }, { "epoch": 0.018827294870738854, "grad_norm": 0.28065061569213867, "learning_rate": 1.9996950658850956e-05, "loss": 0.0369, "step": 400 }, { "epoch": 0.019297977242507324, "grad_norm": 0.33597075939178467, "learning_rate": 1.99961513096173e-05, "loss": 0.0354, "step": 410 }, { "epoch": 0.019768659614275798, "grad_norm": 0.32852622866630554, "learning_rate": 1.999525903795968e-05, "loss": 0.0326, "step": 420 }, { "epoch": 0.020239341986044268, "grad_norm": 0.330789715051651, "learning_rate": 1.9994273852172484e-05, "loss": 0.0294, "step": 430 }, { "epoch": 0.020710024357812738, "grad_norm": 0.3159064054489136, "learning_rate": 1.9993195761413823e-05, "loss": 0.0325, "step": 440 }, { "epoch": 0.021180706729581212, "grad_norm": 0.26775479316711426, "learning_rate": 1.999202477570544e-05, "loss": 0.0307, "step": 450 }, { "epoch": 0.021651389101349682, "grad_norm": 0.2963818609714508, "learning_rate": 1.9990760905932605e-05, "loss": 0.0319, "step": 460 }, { "epoch": 0.022122071473118152, "grad_norm": 0.28052231669425964, "learning_rate": 1.9989404163844026e-05, "loss": 0.0338, "step": 470 }, { "epoch": 0.022592753844886626, "grad_norm": 0.27980944514274597, "learning_rate": 1.9987954562051724e-05, "loss": 0.0262, "step": 480 }, { "epoch": 0.023063436216655096, "grad_norm": 0.31243380904197693, "learning_rate": 1.998641211403095e-05, "loss": 0.0285, "step": 490 }, { "epoch": 0.023534118588423567, "grad_norm": 0.2522485554218292, "learning_rate": 1.9984776834120015e-05, "loss": 0.0357, "step": 500 }, { "epoch": 0.024004800960192037, "grad_norm": 0.4013010263442993, "learning_rate": 1.9983048737520186e-05, "loss": 0.0262, "step": 510 }, { "epoch": 0.02447548333196051, "grad_norm": 0.30129656195640564, "learning_rate": 1.9981227840295544e-05, "loss": 0.0316, "step": 520 }, { "epoch": 0.02494616570372898, "grad_norm": 0.33049213886260986, "learning_rate": 1.9979314159372815e-05, "loss": 0.0339, "step": 530 }, { "epoch": 0.02541684807549745, "grad_norm": 0.3306364417076111, "learning_rate": 1.997730771254124e-05, "loss": 0.0344, "step": 540 }, { "epoch": 0.025887530447265925, "grad_norm": 0.25138357281684875, "learning_rate": 1.9975208518452384e-05, "loss": 0.0315, "step": 550 }, { "epoch": 0.026358212819034395, "grad_norm": 0.2609182596206665, "learning_rate": 1.9973016596619973e-05, "loss": 0.0314, "step": 560 }, { "epoch": 0.026828895190802865, "grad_norm": 0.24546240270137787, "learning_rate": 1.9970731967419724e-05, "loss": 0.029, "step": 570 }, { "epoch": 0.02729957756257134, "grad_norm": 0.37703031301498413, "learning_rate": 1.9968354652089142e-05, "loss": 0.0266, "step": 580 }, { "epoch": 0.02777025993433981, "grad_norm": 0.2936958968639374, "learning_rate": 1.9965884672727313e-05, "loss": 0.0285, "step": 590 }, { "epoch": 0.02824094230610828, "grad_norm": 0.31345826387405396, "learning_rate": 1.996332205229473e-05, "loss": 0.0288, "step": 600 }, { "epoch": 0.028711624677876753, "grad_norm": 0.19102059304714203, "learning_rate": 1.9960666814613043e-05, "loss": 0.0227, "step": 610 }, { "epoch": 0.029182307049645223, "grad_norm": 0.408699631690979, "learning_rate": 1.995791898436487e-05, "loss": 0.0308, "step": 620 }, { "epoch": 0.029652989421413693, "grad_norm": 0.3756527900695801, "learning_rate": 1.995507858709354e-05, "loss": 0.0336, "step": 630 }, { "epoch": 0.030123671793182167, "grad_norm": 0.2407246232032776, "learning_rate": 1.9952145649202885e-05, "loss": 0.0342, "step": 640 }, { "epoch": 0.030594354164950637, "grad_norm": 0.29445934295654297, "learning_rate": 1.9949120197956956e-05, "loss": 0.0289, "step": 650 }, { "epoch": 0.031065036536719107, "grad_norm": 0.3455241024494171, "learning_rate": 1.9946002261479817e-05, "loss": 0.03, "step": 660 }, { "epoch": 0.03153571890848758, "grad_norm": 0.2687731087207794, "learning_rate": 1.9942791868755233e-05, "loss": 0.0349, "step": 670 }, { "epoch": 0.03200640128025605, "grad_norm": 0.3837105333805084, "learning_rate": 1.993948904962644e-05, "loss": 0.0277, "step": 680 }, { "epoch": 0.03247708365202452, "grad_norm": 0.26051077246665955, "learning_rate": 1.9936093834795853e-05, "loss": 0.0256, "step": 690 }, { "epoch": 0.03294776602379299, "grad_norm": 0.2722662687301636, "learning_rate": 1.993260625582478e-05, "loss": 0.0313, "step": 700 }, { "epoch": 0.03341844839556146, "grad_norm": 0.2689184844493866, "learning_rate": 1.992902634513312e-05, "loss": 0.0284, "step": 710 }, { "epoch": 0.03388913076732994, "grad_norm": 0.20353814959526062, "learning_rate": 1.9925354135999083e-05, "loss": 0.0222, "step": 720 }, { "epoch": 0.03435981313909841, "grad_norm": 0.3134755492210388, "learning_rate": 1.992158966255887e-05, "loss": 0.0258, "step": 730 }, { "epoch": 0.03483049551086688, "grad_norm": 0.302522748708725, "learning_rate": 1.9917732959806336e-05, "loss": 0.0309, "step": 740 }, { "epoch": 0.03530117788263535, "grad_norm": 0.1800171136856079, "learning_rate": 1.9913784063592708e-05, "loss": 0.022, "step": 750 }, { "epoch": 0.03577186025440382, "grad_norm": 0.2651924788951874, "learning_rate": 1.990974301062621e-05, "loss": 0.0283, "step": 760 }, { "epoch": 0.03624254262617229, "grad_norm": 0.23250947892665863, "learning_rate": 1.990560983847174e-05, "loss": 0.024, "step": 770 }, { "epoch": 0.03671322499794077, "grad_norm": 0.2604784071445465, "learning_rate": 1.9901384585550527e-05, "loss": 0.0263, "step": 780 }, { "epoch": 0.03718390736970924, "grad_norm": 0.22437463700771332, "learning_rate": 1.989706729113976e-05, "loss": 0.026, "step": 790 }, { "epoch": 0.03765458974147771, "grad_norm": 0.3088092803955078, "learning_rate": 1.9892657995372227e-05, "loss": 0.0255, "step": 800 }, { "epoch": 0.03812527211324618, "grad_norm": 0.2234327793121338, "learning_rate": 1.988815673923596e-05, "loss": 0.0264, "step": 810 }, { "epoch": 0.03859595448501465, "grad_norm": 0.3257303535938263, "learning_rate": 1.9883563564573815e-05, "loss": 0.0282, "step": 820 }, { "epoch": 0.03906663685678312, "grad_norm": 0.1974906325340271, "learning_rate": 1.9878878514083124e-05, "loss": 0.0229, "step": 830 }, { "epoch": 0.039537319228551596, "grad_norm": 0.2772673964500427, "learning_rate": 1.9874101631315268e-05, "loss": 0.023, "step": 840 }, { "epoch": 0.040008001600320066, "grad_norm": 0.2952466607093811, "learning_rate": 1.9869232960675292e-05, "loss": 0.0274, "step": 850 }, { "epoch": 0.040478683972088536, "grad_norm": 0.23016895353794098, "learning_rate": 1.9864272547421482e-05, "loss": 0.0243, "step": 860 }, { "epoch": 0.040949366343857006, "grad_norm": 0.20470339059829712, "learning_rate": 1.985922043766495e-05, "loss": 0.0285, "step": 870 }, { "epoch": 0.041420048715625477, "grad_norm": 0.2374616414308548, "learning_rate": 1.9854076678369197e-05, "loss": 0.0223, "step": 880 }, { "epoch": 0.04189073108739395, "grad_norm": 0.21280400454998016, "learning_rate": 1.984884131734968e-05, "loss": 0.0238, "step": 890 }, { "epoch": 0.042361413459162424, "grad_norm": 0.21137377619743347, "learning_rate": 1.9843514403273378e-05, "loss": 0.0239, "step": 900 }, { "epoch": 0.042832095830930894, "grad_norm": 0.20922043919563293, "learning_rate": 1.9838095985658324e-05, "loss": 0.0261, "step": 910 }, { "epoch": 0.043302778202699364, "grad_norm": 0.32684192061424255, "learning_rate": 1.9832586114873154e-05, "loss": 0.0316, "step": 920 }, { "epoch": 0.043773460574467835, "grad_norm": 0.13897359371185303, "learning_rate": 1.9826984842136637e-05, "loss": 0.0226, "step": 930 }, { "epoch": 0.044244142946236305, "grad_norm": 0.23431269824504852, "learning_rate": 1.982129221951719e-05, "loss": 0.0238, "step": 940 }, { "epoch": 0.044714825318004775, "grad_norm": 0.2124047577381134, "learning_rate": 1.9815508299932417e-05, "loss": 0.0223, "step": 950 }, { "epoch": 0.04518550768977325, "grad_norm": 0.19508586823940277, "learning_rate": 1.980963313714859e-05, "loss": 0.027, "step": 960 }, { "epoch": 0.04565619006154172, "grad_norm": 0.2443627119064331, "learning_rate": 1.9803666785780165e-05, "loss": 0.0261, "step": 970 }, { "epoch": 0.04612687243331019, "grad_norm": 0.28225573897361755, "learning_rate": 1.979760930128927e-05, "loss": 0.0245, "step": 980 }, { "epoch": 0.04659755480507866, "grad_norm": 0.24464906752109528, "learning_rate": 1.979146073998519e-05, "loss": 0.0235, "step": 990 }, { "epoch": 0.04706823717684713, "grad_norm": 0.2401905208826065, "learning_rate": 1.9785221159023852e-05, "loss": 0.0269, "step": 1000 }, { "epoch": 0.0475389195486156, "grad_norm": 0.22430147230625153, "learning_rate": 1.9778890616407266e-05, "loss": 0.0275, "step": 1010 }, { "epoch": 0.048009601920384073, "grad_norm": 0.2961570620536804, "learning_rate": 1.9772469170983026e-05, "loss": 0.0287, "step": 1020 }, { "epoch": 0.04848028429215255, "grad_norm": 0.2170732170343399, "learning_rate": 1.976595688244373e-05, "loss": 0.0233, "step": 1030 }, { "epoch": 0.04895096666392102, "grad_norm": 0.25931650400161743, "learning_rate": 1.975935381132644e-05, "loss": 0.0198, "step": 1040 }, { "epoch": 0.04942164903568949, "grad_norm": 0.20673753321170807, "learning_rate": 1.975266001901212e-05, "loss": 0.0217, "step": 1050 }, { "epoch": 0.04989233140745796, "grad_norm": 0.217953160405159, "learning_rate": 1.9745875567725058e-05, "loss": 0.0274, "step": 1060 }, { "epoch": 0.05036301377922643, "grad_norm": 0.21465808153152466, "learning_rate": 1.973900052053229e-05, "loss": 0.0266, "step": 1070 }, { "epoch": 0.0508336961509949, "grad_norm": 0.19897782802581787, "learning_rate": 1.973203494134302e-05, "loss": 0.02, "step": 1080 }, { "epoch": 0.05130437852276338, "grad_norm": 0.21559686958789825, "learning_rate": 1.9724978894908017e-05, "loss": 0.0257, "step": 1090 }, { "epoch": 0.05177506089453185, "grad_norm": 0.19682246446609497, "learning_rate": 1.971783244681902e-05, "loss": 0.0238, "step": 1100 }, { "epoch": 0.05224574326630032, "grad_norm": 0.27452391386032104, "learning_rate": 1.9710595663508125e-05, "loss": 0.0234, "step": 1110 }, { "epoch": 0.05271642563806879, "grad_norm": 0.322608083486557, "learning_rate": 1.9703268612247172e-05, "loss": 0.0285, "step": 1120 }, { "epoch": 0.05318710800983726, "grad_norm": 0.24620217084884644, "learning_rate": 1.9695851361147098e-05, "loss": 0.0207, "step": 1130 }, { "epoch": 0.05365779038160573, "grad_norm": 0.17132923007011414, "learning_rate": 1.9688343979157348e-05, "loss": 0.0279, "step": 1140 }, { "epoch": 0.05412847275337421, "grad_norm": 0.31226620078086853, "learning_rate": 1.968074653606519e-05, "loss": 0.0261, "step": 1150 }, { "epoch": 0.05459915512514268, "grad_norm": 0.1799079030752182, "learning_rate": 1.9673059102495084e-05, "loss": 0.0228, "step": 1160 }, { "epoch": 0.05506983749691115, "grad_norm": 0.2895565927028656, "learning_rate": 1.9665281749908034e-05, "loss": 0.0245, "step": 1170 }, { "epoch": 0.05554051986867962, "grad_norm": 0.25858330726623535, "learning_rate": 1.9657414550600907e-05, "loss": 0.021, "step": 1180 }, { "epoch": 0.05601120224044809, "grad_norm": 0.2365933060646057, "learning_rate": 1.964945757770578e-05, "loss": 0.0224, "step": 1190 }, { "epoch": 0.05648188461221656, "grad_norm": 0.1789606511592865, "learning_rate": 1.964141090518923e-05, "loss": 0.022, "step": 1200 }, { "epoch": 0.056952566983985035, "grad_norm": 0.20084995031356812, "learning_rate": 1.963327460785168e-05, "loss": 0.0232, "step": 1210 }, { "epoch": 0.057423249355753506, "grad_norm": 0.24878057837486267, "learning_rate": 1.962504876132669e-05, "loss": 0.0263, "step": 1220 }, { "epoch": 0.057893931727521976, "grad_norm": 0.22912390530109406, "learning_rate": 1.9616733442080253e-05, "loss": 0.0223, "step": 1230 }, { "epoch": 0.058364614099290446, "grad_norm": 0.1336241066455841, "learning_rate": 1.960832872741007e-05, "loss": 0.0178, "step": 1240 }, { "epoch": 0.058835296471058916, "grad_norm": 0.18239843845367432, "learning_rate": 1.9599834695444863e-05, "loss": 0.0233, "step": 1250 }, { "epoch": 0.059305978842827387, "grad_norm": 0.19737780094146729, "learning_rate": 1.959125142514362e-05, "loss": 0.0216, "step": 1260 }, { "epoch": 0.059776661214595864, "grad_norm": 0.1718551516532898, "learning_rate": 1.9582578996294882e-05, "loss": 0.023, "step": 1270 }, { "epoch": 0.060247343586364334, "grad_norm": 0.1518109142780304, "learning_rate": 1.957381748951599e-05, "loss": 0.0209, "step": 1280 }, { "epoch": 0.060718025958132804, "grad_norm": 0.26714184880256653, "learning_rate": 1.9564966986252326e-05, "loss": 0.022, "step": 1290 }, { "epoch": 0.061188708329901274, "grad_norm": 0.18353547155857086, "learning_rate": 1.9556027568776577e-05, "loss": 0.0228, "step": 1300 }, { "epoch": 0.061659390701669745, "grad_norm": 0.24423734843730927, "learning_rate": 1.9546999320187966e-05, "loss": 0.0215, "step": 1310 }, { "epoch": 0.062130073073438215, "grad_norm": 0.24629823863506317, "learning_rate": 1.953788232441147e-05, "loss": 0.0206, "step": 1320 }, { "epoch": 0.06260075544520669, "grad_norm": 0.20937883853912354, "learning_rate": 1.9528676666197026e-05, "loss": 0.0254, "step": 1330 }, { "epoch": 0.06307143781697516, "grad_norm": 0.3114871382713318, "learning_rate": 1.951938243111879e-05, "loss": 0.0188, "step": 1340 }, { "epoch": 0.06354212018874363, "grad_norm": 0.22290900349617004, "learning_rate": 1.9509999705574293e-05, "loss": 0.0192, "step": 1350 }, { "epoch": 0.0640128025605121, "grad_norm": 0.17567254602909088, "learning_rate": 1.9500528576783667e-05, "loss": 0.0244, "step": 1360 }, { "epoch": 0.06448348493228058, "grad_norm": 0.2604529857635498, "learning_rate": 1.9490969132788815e-05, "loss": 0.0261, "step": 1370 }, { "epoch": 0.06495416730404904, "grad_norm": 0.29492446780204773, "learning_rate": 1.9481321462452617e-05, "loss": 0.0212, "step": 1380 }, { "epoch": 0.06542484967581752, "grad_norm": 0.3971514403820038, "learning_rate": 1.9471585655458073e-05, "loss": 0.0229, "step": 1390 }, { "epoch": 0.06589553204758598, "grad_norm": 0.269244521856308, "learning_rate": 1.9461761802307494e-05, "loss": 0.0262, "step": 1400 }, { "epoch": 0.06636621441935446, "grad_norm": 0.31963616609573364, "learning_rate": 1.945184999432166e-05, "loss": 0.0197, "step": 1410 }, { "epoch": 0.06683689679112292, "grad_norm": 0.20610783994197845, "learning_rate": 1.9441850323638944e-05, "loss": 0.0207, "step": 1420 }, { "epoch": 0.0673075791628914, "grad_norm": 0.2013525664806366, "learning_rate": 1.943176288321449e-05, "loss": 0.0213, "step": 1430 }, { "epoch": 0.06777826153465988, "grad_norm": 0.18529535830020905, "learning_rate": 1.942158776681933e-05, "loss": 0.0253, "step": 1440 }, { "epoch": 0.06824894390642834, "grad_norm": 0.18055978417396545, "learning_rate": 1.941132506903951e-05, "loss": 0.0203, "step": 1450 }, { "epoch": 0.06871962627819682, "grad_norm": 0.20997396111488342, "learning_rate": 1.9400974885275226e-05, "loss": 0.022, "step": 1460 }, { "epoch": 0.06919030864996528, "grad_norm": 0.21085289120674133, "learning_rate": 1.9390537311739927e-05, "loss": 0.0253, "step": 1470 }, { "epoch": 0.06966099102173376, "grad_norm": 0.16178180277347565, "learning_rate": 1.938001244545941e-05, "loss": 0.0228, "step": 1480 }, { "epoch": 0.07013167339350224, "grad_norm": 0.2259250283241272, "learning_rate": 1.9369400384270948e-05, "loss": 0.0219, "step": 1490 }, { "epoch": 0.0706023557652707, "grad_norm": 0.1730378270149231, "learning_rate": 1.935870122682235e-05, "loss": 0.0247, "step": 1500 }, { "epoch": 0.07107303813703918, "grad_norm": 0.19767075777053833, "learning_rate": 1.934791507257105e-05, "loss": 0.0191, "step": 1510 }, { "epoch": 0.07154372050880764, "grad_norm": 0.2398705631494522, "learning_rate": 1.933704202178321e-05, "loss": 0.0244, "step": 1520 }, { "epoch": 0.07201440288057612, "grad_norm": 0.17935000360012054, "learning_rate": 1.9326082175532744e-05, "loss": 0.0205, "step": 1530 }, { "epoch": 0.07248508525234458, "grad_norm": 0.21696268022060394, "learning_rate": 1.9315035635700412e-05, "loss": 0.0222, "step": 1540 }, { "epoch": 0.07295576762411306, "grad_norm": 0.18124236166477203, "learning_rate": 1.9303902504972866e-05, "loss": 0.0257, "step": 1550 }, { "epoch": 0.07342644999588153, "grad_norm": 0.1973320096731186, "learning_rate": 1.9292682886841683e-05, "loss": 0.0213, "step": 1560 }, { "epoch": 0.07389713236765, "grad_norm": 0.20220108330249786, "learning_rate": 1.9281376885602412e-05, "loss": 0.0237, "step": 1570 }, { "epoch": 0.07436781473941848, "grad_norm": 0.26648613810539246, "learning_rate": 1.926998460635361e-05, "loss": 0.0231, "step": 1580 }, { "epoch": 0.07483849711118694, "grad_norm": 0.20783020555973053, "learning_rate": 1.9258506154995854e-05, "loss": 0.0195, "step": 1590 }, { "epoch": 0.07530917948295542, "grad_norm": 0.26222914457321167, "learning_rate": 1.924694163823076e-05, "loss": 0.0279, "step": 1600 }, { "epoch": 0.07577986185472389, "grad_norm": 0.2147466093301773, "learning_rate": 1.9235291163559996e-05, "loss": 0.0233, "step": 1610 }, { "epoch": 0.07625054422649236, "grad_norm": 0.19049124419689178, "learning_rate": 1.922355483928428e-05, "loss": 0.0189, "step": 1620 }, { "epoch": 0.07672122659826083, "grad_norm": 0.19948463141918182, "learning_rate": 1.9211732774502372e-05, "loss": 0.0194, "step": 1630 }, { "epoch": 0.0771919089700293, "grad_norm": 0.16653354465961456, "learning_rate": 1.919982507911006e-05, "loss": 0.0187, "step": 1640 }, { "epoch": 0.07766259134179777, "grad_norm": 0.1999356746673584, "learning_rate": 1.918783186379913e-05, "loss": 0.0203, "step": 1650 }, { "epoch": 0.07813327371356624, "grad_norm": 0.1986234039068222, "learning_rate": 1.917575324005636e-05, "loss": 0.0201, "step": 1660 }, { "epoch": 0.07860395608533471, "grad_norm": 0.180724635720253, "learning_rate": 1.916358932016246e-05, "loss": 0.0205, "step": 1670 }, { "epoch": 0.07907463845710319, "grad_norm": 0.19597767293453217, "learning_rate": 1.9151340217191042e-05, "loss": 0.0218, "step": 1680 }, { "epoch": 0.07954532082887165, "grad_norm": 0.1517166644334793, "learning_rate": 1.9139006045007567e-05, "loss": 0.019, "step": 1690 }, { "epoch": 0.08001600320064013, "grad_norm": 0.1383761167526245, "learning_rate": 1.9126586918268275e-05, "loss": 0.0201, "step": 1700 }, { "epoch": 0.0804866855724086, "grad_norm": 0.15916861593723297, "learning_rate": 1.9114082952419134e-05, "loss": 0.0208, "step": 1710 }, { "epoch": 0.08095736794417707, "grad_norm": 0.17054808139801025, "learning_rate": 1.9101494263694764e-05, "loss": 0.0204, "step": 1720 }, { "epoch": 0.08142805031594554, "grad_norm": 0.16759757697582245, "learning_rate": 1.9088820969117348e-05, "loss": 0.0188, "step": 1730 }, { "epoch": 0.08189873268771401, "grad_norm": 0.2104874700307846, "learning_rate": 1.907606318649555e-05, "loss": 0.0205, "step": 1740 }, { "epoch": 0.08236941505948249, "grad_norm": 0.16602571308612823, "learning_rate": 1.906322103442343e-05, "loss": 0.0214, "step": 1750 }, { "epoch": 0.08284009743125095, "grad_norm": 0.2404191493988037, "learning_rate": 1.9050294632279317e-05, "loss": 0.0212, "step": 1760 }, { "epoch": 0.08331077980301943, "grad_norm": 0.176087886095047, "learning_rate": 1.9037284100224714e-05, "loss": 0.027, "step": 1770 }, { "epoch": 0.0837814621747879, "grad_norm": 0.23235034942626953, "learning_rate": 1.9024189559203193e-05, "loss": 0.0199, "step": 1780 }, { "epoch": 0.08425214454655637, "grad_norm": 0.2387867420911789, "learning_rate": 1.9011011130939254e-05, "loss": 0.0181, "step": 1790 }, { "epoch": 0.08472282691832485, "grad_norm": 0.17257186770439148, "learning_rate": 1.8997748937937188e-05, "loss": 0.0216, "step": 1800 }, { "epoch": 0.08519350929009331, "grad_norm": 0.20097611844539642, "learning_rate": 1.8984403103479957e-05, "loss": 0.0221, "step": 1810 }, { "epoch": 0.08566419166186179, "grad_norm": 0.16651542484760284, "learning_rate": 1.897097375162804e-05, "loss": 0.0173, "step": 1820 }, { "epoch": 0.08613487403363025, "grad_norm": 0.18618135154247284, "learning_rate": 1.8957461007218272e-05, "loss": 0.0177, "step": 1830 }, { "epoch": 0.08660555640539873, "grad_norm": 0.14787648618221283, "learning_rate": 1.8943864995862692e-05, "loss": 0.0204, "step": 1840 }, { "epoch": 0.08707623877716719, "grad_norm": 0.20184212923049927, "learning_rate": 1.8930185843947382e-05, "loss": 0.0199, "step": 1850 }, { "epoch": 0.08754692114893567, "grad_norm": 0.17956119775772095, "learning_rate": 1.891642367863127e-05, "loss": 0.0184, "step": 1860 }, { "epoch": 0.08801760352070415, "grad_norm": 0.17752839624881744, "learning_rate": 1.8902578627844975e-05, "loss": 0.0198, "step": 1870 }, { "epoch": 0.08848828589247261, "grad_norm": 0.17751826345920563, "learning_rate": 1.8888650820289594e-05, "loss": 0.0196, "step": 1880 }, { "epoch": 0.08895896826424109, "grad_norm": 0.21696944534778595, "learning_rate": 1.8874640385435515e-05, "loss": 0.0186, "step": 1890 }, { "epoch": 0.08942965063600955, "grad_norm": 0.12640699744224548, "learning_rate": 1.8860547453521232e-05, "loss": 0.0168, "step": 1900 }, { "epoch": 0.08990033300777803, "grad_norm": 0.18913812935352325, "learning_rate": 1.8846372155552095e-05, "loss": 0.0233, "step": 1910 }, { "epoch": 0.0903710153795465, "grad_norm": 0.17923256754875183, "learning_rate": 1.8832114623299125e-05, "loss": 0.02, "step": 1920 }, { "epoch": 0.09084169775131497, "grad_norm": 0.17282487452030182, "learning_rate": 1.8817774989297776e-05, "loss": 0.0215, "step": 1930 }, { "epoch": 0.09131238012308344, "grad_norm": 0.18908515572547913, "learning_rate": 1.8803353386846708e-05, "loss": 0.0177, "step": 1940 }, { "epoch": 0.09178306249485191, "grad_norm": 0.23107506334781647, "learning_rate": 1.878884995000654e-05, "loss": 0.0178, "step": 1950 }, { "epoch": 0.09225374486662039, "grad_norm": 0.16468971967697144, "learning_rate": 1.8774264813598614e-05, "loss": 0.0196, "step": 1960 }, { "epoch": 0.09272442723838885, "grad_norm": 0.16342712938785553, "learning_rate": 1.875959811320373e-05, "loss": 0.0189, "step": 1970 }, { "epoch": 0.09319510961015733, "grad_norm": 0.21671873331069946, "learning_rate": 1.87448499851609e-05, "loss": 0.023, "step": 1980 }, { "epoch": 0.0936657919819258, "grad_norm": 0.2030368447303772, "learning_rate": 1.8730020566566068e-05, "loss": 0.0197, "step": 1990 }, { "epoch": 0.09413647435369427, "grad_norm": 0.1771450787782669, "learning_rate": 1.8715109995270836e-05, "loss": 0.0193, "step": 2000 }, { "epoch": 0.09460715672546274, "grad_norm": 0.16633151471614838, "learning_rate": 1.8700118409881198e-05, "loss": 0.0207, "step": 2010 }, { "epoch": 0.0950778390972312, "grad_norm": 0.27168479561805725, "learning_rate": 1.8685045949756232e-05, "loss": 0.0224, "step": 2020 }, { "epoch": 0.09554852146899968, "grad_norm": 0.1796126663684845, "learning_rate": 1.8669892755006816e-05, "loss": 0.0166, "step": 2030 }, { "epoch": 0.09601920384076815, "grad_norm": 0.18188242614269257, "learning_rate": 1.865465896649432e-05, "loss": 0.0153, "step": 2040 }, { "epoch": 0.09648988621253662, "grad_norm": 0.2484055459499359, "learning_rate": 1.8639344725829302e-05, "loss": 0.0208, "step": 2050 }, { "epoch": 0.0969605685843051, "grad_norm": 0.2012706845998764, "learning_rate": 1.862395017537019e-05, "loss": 0.0179, "step": 2060 }, { "epoch": 0.09743125095607356, "grad_norm": 0.188084676861763, "learning_rate": 1.860847545822195e-05, "loss": 0.0156, "step": 2070 }, { "epoch": 0.09790193332784204, "grad_norm": 0.17676419019699097, "learning_rate": 1.8592920718234775e-05, "loss": 0.0203, "step": 2080 }, { "epoch": 0.0983726156996105, "grad_norm": 0.15742889046669006, "learning_rate": 1.8577286100002723e-05, "loss": 0.0182, "step": 2090 }, { "epoch": 0.09884329807137898, "grad_norm": 0.2136330008506775, "learning_rate": 1.8561571748862394e-05, "loss": 0.0222, "step": 2100 }, { "epoch": 0.09931398044314746, "grad_norm": 0.1901516616344452, "learning_rate": 1.854577781089157e-05, "loss": 0.0174, "step": 2110 }, { "epoch": 0.09978466281491592, "grad_norm": 0.18743844330310822, "learning_rate": 1.8529904432907858e-05, "loss": 0.0176, "step": 2120 }, { "epoch": 0.1002553451866844, "grad_norm": 0.17568612098693848, "learning_rate": 1.8513951762467318e-05, "loss": 0.022, "step": 2130 }, { "epoch": 0.10072602755845286, "grad_norm": 0.12301900237798691, "learning_rate": 1.849791994786311e-05, "loss": 0.0237, "step": 2140 }, { "epoch": 0.10119670993022134, "grad_norm": 0.21497417986392975, "learning_rate": 1.8481809138124092e-05, "loss": 0.0171, "step": 2150 }, { "epoch": 0.1016673923019898, "grad_norm": 0.20563672482967377, "learning_rate": 1.846561948301346e-05, "loss": 0.0199, "step": 2160 }, { "epoch": 0.10213807467375828, "grad_norm": 0.14239314198493958, "learning_rate": 1.8449351133027327e-05, "loss": 0.0161, "step": 2170 }, { "epoch": 0.10260875704552676, "grad_norm": 0.21211259067058563, "learning_rate": 1.8433004239393353e-05, "loss": 0.0219, "step": 2180 }, { "epoch": 0.10307943941729522, "grad_norm": 0.1391133815050125, "learning_rate": 1.8416578954069318e-05, "loss": 0.0205, "step": 2190 }, { "epoch": 0.1035501217890637, "grad_norm": 0.21394485235214233, "learning_rate": 1.840007542974172e-05, "loss": 0.0175, "step": 2200 }, { "epoch": 0.10402080416083216, "grad_norm": 0.12785886228084564, "learning_rate": 1.838349381982435e-05, "loss": 0.0192, "step": 2210 }, { "epoch": 0.10449148653260064, "grad_norm": 0.1993100494146347, "learning_rate": 1.8366834278456872e-05, "loss": 0.0204, "step": 2220 }, { "epoch": 0.10496216890436912, "grad_norm": 0.14523109793663025, "learning_rate": 1.8350096960503383e-05, "loss": 0.0154, "step": 2230 }, { "epoch": 0.10543285127613758, "grad_norm": 0.16498471796512604, "learning_rate": 1.833328202155098e-05, "loss": 0.0165, "step": 2240 }, { "epoch": 0.10590353364790606, "grad_norm": 0.19226130843162537, "learning_rate": 1.831638961790831e-05, "loss": 0.0176, "step": 2250 }, { "epoch": 0.10637421601967452, "grad_norm": 0.15666921436786652, "learning_rate": 1.8299419906604115e-05, "loss": 0.0184, "step": 2260 }, { "epoch": 0.106844898391443, "grad_norm": 0.21028366684913635, "learning_rate": 1.8282373045385786e-05, "loss": 0.0203, "step": 2270 }, { "epoch": 0.10731558076321146, "grad_norm": 0.16798938810825348, "learning_rate": 1.8265249192717868e-05, "loss": 0.0168, "step": 2280 }, { "epoch": 0.10778626313497994, "grad_norm": 0.16505767405033112, "learning_rate": 1.8248048507780626e-05, "loss": 0.0161, "step": 2290 }, { "epoch": 0.10825694550674841, "grad_norm": 0.3912341892719269, "learning_rate": 1.8230771150468517e-05, "loss": 0.0212, "step": 2300 }, { "epoch": 0.10872762787851688, "grad_norm": 0.1633525937795639, "learning_rate": 1.821341728138876e-05, "loss": 0.0166, "step": 2310 }, { "epoch": 0.10919831025028535, "grad_norm": 0.19325989484786987, "learning_rate": 1.819598706185979e-05, "loss": 0.015, "step": 2320 }, { "epoch": 0.10966899262205382, "grad_norm": 0.19846157729625702, "learning_rate": 1.8178480653909795e-05, "loss": 0.0214, "step": 2330 }, { "epoch": 0.1101396749938223, "grad_norm": 0.12296116352081299, "learning_rate": 1.8160898220275196e-05, "loss": 0.0161, "step": 2340 }, { "epoch": 0.11061035736559076, "grad_norm": 0.13783031702041626, "learning_rate": 1.8143239924399127e-05, "loss": 0.0181, "step": 2350 }, { "epoch": 0.11108103973735924, "grad_norm": 0.1732431948184967, "learning_rate": 1.8125505930429936e-05, "loss": 0.0187, "step": 2360 }, { "epoch": 0.11155172210912771, "grad_norm": 0.1288699209690094, "learning_rate": 1.810769640321963e-05, "loss": 0.0157, "step": 2370 }, { "epoch": 0.11202240448089618, "grad_norm": 0.20089437067508698, "learning_rate": 1.8089811508322382e-05, "loss": 0.0175, "step": 2380 }, { "epoch": 0.11249308685266465, "grad_norm": 0.1331891417503357, "learning_rate": 1.8071851411992948e-05, "loss": 0.0144, "step": 2390 }, { "epoch": 0.11296376922443312, "grad_norm": 0.18561014533042908, "learning_rate": 1.8053816281185154e-05, "loss": 0.0173, "step": 2400 }, { "epoch": 0.1134344515962016, "grad_norm": 0.2035892754793167, "learning_rate": 1.803570628355033e-05, "loss": 0.0169, "step": 2410 }, { "epoch": 0.11390513396797007, "grad_norm": 0.15459296107292175, "learning_rate": 1.801752158743576e-05, "loss": 0.015, "step": 2420 }, { "epoch": 0.11437581633973853, "grad_norm": 0.1719622015953064, "learning_rate": 1.7999262361883102e-05, "loss": 0.0186, "step": 2430 }, { "epoch": 0.11484649871150701, "grad_norm": 0.14672547578811646, "learning_rate": 1.7980928776626833e-05, "loss": 0.0241, "step": 2440 }, { "epoch": 0.11531718108327547, "grad_norm": 0.16649580001831055, "learning_rate": 1.796252100209266e-05, "loss": 0.018, "step": 2450 }, { "epoch": 0.11578786345504395, "grad_norm": 0.1590607613325119, "learning_rate": 1.794403920939595e-05, "loss": 0.0242, "step": 2460 }, { "epoch": 0.11625854582681242, "grad_norm": 0.16787108778953552, "learning_rate": 1.7925483570340118e-05, "loss": 0.0163, "step": 2470 }, { "epoch": 0.11672922819858089, "grad_norm": 0.1703803986310959, "learning_rate": 1.7906854257415048e-05, "loss": 0.0226, "step": 2480 }, { "epoch": 0.11719991057034937, "grad_norm": 0.1310303807258606, "learning_rate": 1.7888151443795478e-05, "loss": 0.0168, "step": 2490 }, { "epoch": 0.11767059294211783, "grad_norm": 0.19779440760612488, "learning_rate": 1.78693753033394e-05, "loss": 0.0181, "step": 2500 }, { "epoch": 0.11814127531388631, "grad_norm": 0.1330758035182953, "learning_rate": 1.7850526010586437e-05, "loss": 0.0173, "step": 2510 }, { "epoch": 0.11861195768565477, "grad_norm": 0.16539008915424347, "learning_rate": 1.7831603740756223e-05, "loss": 0.0205, "step": 2520 }, { "epoch": 0.11908264005742325, "grad_norm": 0.25288063287734985, "learning_rate": 1.7812608669746774e-05, "loss": 0.0197, "step": 2530 }, { "epoch": 0.11955332242919173, "grad_norm": 0.16131377220153809, "learning_rate": 1.779354097413285e-05, "loss": 0.0179, "step": 2540 }, { "epoch": 0.12002400480096019, "grad_norm": 0.15160414576530457, "learning_rate": 1.777440083116432e-05, "loss": 0.0139, "step": 2550 }, { "epoch": 0.12049468717272867, "grad_norm": 0.20332884788513184, "learning_rate": 1.7755188418764517e-05, "loss": 0.0189, "step": 2560 }, { "epoch": 0.12096536954449713, "grad_norm": 0.16422846913337708, "learning_rate": 1.7735903915528553e-05, "loss": 0.0209, "step": 2570 }, { "epoch": 0.12143605191626561, "grad_norm": 0.150728240609169, "learning_rate": 1.7716547500721715e-05, "loss": 0.0198, "step": 2580 }, { "epoch": 0.12190673428803407, "grad_norm": 0.12881092727184296, "learning_rate": 1.7697119354277746e-05, "loss": 0.019, "step": 2590 }, { "epoch": 0.12237741665980255, "grad_norm": 0.2340802103281021, "learning_rate": 1.76776196567972e-05, "loss": 0.0192, "step": 2600 }, { "epoch": 0.12284809903157103, "grad_norm": 0.14337819814682007, "learning_rate": 1.7658048589545757e-05, "loss": 0.0188, "step": 2610 }, { "epoch": 0.12331878140333949, "grad_norm": 0.11415712535381317, "learning_rate": 1.7638406334452535e-05, "loss": 0.0146, "step": 2620 }, { "epoch": 0.12378946377510797, "grad_norm": 0.15873154997825623, "learning_rate": 1.7618693074108405e-05, "loss": 0.0195, "step": 2630 }, { "epoch": 0.12426014614687643, "grad_norm": 0.17095762491226196, "learning_rate": 1.7598908991764288e-05, "loss": 0.02, "step": 2640 }, { "epoch": 0.1247308285186449, "grad_norm": 0.20288196206092834, "learning_rate": 1.7579054271329457e-05, "loss": 0.0167, "step": 2650 }, { "epoch": 0.12520151089041337, "grad_norm": 0.15298855304718018, "learning_rate": 1.755912909736981e-05, "loss": 0.0204, "step": 2660 }, { "epoch": 0.12567219326218185, "grad_norm": 0.20021982491016388, "learning_rate": 1.753913365510619e-05, "loss": 0.0158, "step": 2670 }, { "epoch": 0.12614287563395032, "grad_norm": 0.17904764413833618, "learning_rate": 1.751906813041263e-05, "loss": 0.0158, "step": 2680 }, { "epoch": 0.1266135580057188, "grad_norm": 0.20741869509220123, "learning_rate": 1.749893270981463e-05, "loss": 0.0175, "step": 2690 }, { "epoch": 0.12708424037748725, "grad_norm": 0.1554831862449646, "learning_rate": 1.747872758048744e-05, "loss": 0.0175, "step": 2700 }, { "epoch": 0.12755492274925573, "grad_norm": 0.19903723895549774, "learning_rate": 1.745845293025431e-05, "loss": 0.0156, "step": 2710 }, { "epoch": 0.1280256051210242, "grad_norm": 0.16253547370433807, "learning_rate": 1.7438108947584737e-05, "loss": 0.0189, "step": 2720 }, { "epoch": 0.12849628749279268, "grad_norm": 0.14778700470924377, "learning_rate": 1.7417695821592727e-05, "loss": 0.0167, "step": 2730 }, { "epoch": 0.12896696986456116, "grad_norm": 0.14217066764831543, "learning_rate": 1.739721374203502e-05, "loss": 0.0178, "step": 2740 }, { "epoch": 0.1294376522363296, "grad_norm": 0.1682458072900772, "learning_rate": 1.7376662899309346e-05, "loss": 0.0161, "step": 2750 }, { "epoch": 0.12990833460809809, "grad_norm": 0.16743063926696777, "learning_rate": 1.7356043484452643e-05, "loss": 0.0163, "step": 2760 }, { "epoch": 0.13037901697986656, "grad_norm": 0.18026761710643768, "learning_rate": 1.733535568913928e-05, "loss": 0.0158, "step": 2770 }, { "epoch": 0.13084969935163504, "grad_norm": 0.14549121260643005, "learning_rate": 1.731459970567928e-05, "loss": 0.016, "step": 2780 }, { "epoch": 0.13132038172340352, "grad_norm": 0.19981275498867035, "learning_rate": 1.729377572701653e-05, "loss": 0.0161, "step": 2790 }, { "epoch": 0.13179106409517197, "grad_norm": 0.20063026249408722, "learning_rate": 1.7272883946726986e-05, "loss": 0.0192, "step": 2800 }, { "epoch": 0.13226174646694044, "grad_norm": 0.1602613627910614, "learning_rate": 1.7251924559016885e-05, "loss": 0.0194, "step": 2810 }, { "epoch": 0.13273242883870892, "grad_norm": 0.1732335239648819, "learning_rate": 1.7230897758720916e-05, "loss": 0.0192, "step": 2820 }, { "epoch": 0.1332031112104774, "grad_norm": 0.20478792488574982, "learning_rate": 1.720980374130044e-05, "loss": 0.0234, "step": 2830 }, { "epoch": 0.13367379358224585, "grad_norm": 0.1489650160074234, "learning_rate": 1.7188642702841643e-05, "loss": 0.0149, "step": 2840 }, { "epoch": 0.13414447595401433, "grad_norm": 0.14599387347698212, "learning_rate": 1.716741484005373e-05, "loss": 0.0166, "step": 2850 }, { "epoch": 0.1346151583257828, "grad_norm": 0.46743980050086975, "learning_rate": 1.7146120350267094e-05, "loss": 0.0191, "step": 2860 }, { "epoch": 0.13508584069755128, "grad_norm": 0.1759539246559143, "learning_rate": 1.7124759431431485e-05, "loss": 0.0164, "step": 2870 }, { "epoch": 0.13555652306931976, "grad_norm": 0.15650507807731628, "learning_rate": 1.7103332282114156e-05, "loss": 0.0206, "step": 2880 }, { "epoch": 0.1360272054410882, "grad_norm": 0.11699323356151581, "learning_rate": 1.7081839101498033e-05, "loss": 0.0167, "step": 2890 }, { "epoch": 0.13649788781285668, "grad_norm": 0.14939193427562714, "learning_rate": 1.7060280089379854e-05, "loss": 0.0156, "step": 2900 }, { "epoch": 0.13696857018462516, "grad_norm": 0.2103535681962967, "learning_rate": 1.703865544616832e-05, "loss": 0.0181, "step": 2910 }, { "epoch": 0.13743925255639364, "grad_norm": 0.21595115959644318, "learning_rate": 1.7016965372882227e-05, "loss": 0.0147, "step": 2920 }, { "epoch": 0.13790993492816211, "grad_norm": 0.22824503481388092, "learning_rate": 1.6995210071148582e-05, "loss": 0.0154, "step": 2930 }, { "epoch": 0.13838061729993056, "grad_norm": 0.16841448843479156, "learning_rate": 1.6973389743200764e-05, "loss": 0.0181, "step": 2940 }, { "epoch": 0.13885129967169904, "grad_norm": 0.11514698714017868, "learning_rate": 1.6951504591876614e-05, "loss": 0.0205, "step": 2950 }, { "epoch": 0.13932198204346752, "grad_norm": 0.15565893054008484, "learning_rate": 1.692955482061656e-05, "loss": 0.0148, "step": 2960 }, { "epoch": 0.139792664415236, "grad_norm": 0.12231289595365524, "learning_rate": 1.6907540633461728e-05, "loss": 0.018, "step": 2970 }, { "epoch": 0.14026334678700447, "grad_norm": 0.2031562328338623, "learning_rate": 1.6885462235052038e-05, "loss": 0.0172, "step": 2980 }, { "epoch": 0.14073402915877292, "grad_norm": 0.17363503575325012, "learning_rate": 1.6863319830624313e-05, "loss": 0.0207, "step": 2990 }, { "epoch": 0.1412047115305414, "grad_norm": 0.16768331825733185, "learning_rate": 1.6841113626010358e-05, "loss": 0.0193, "step": 3000 }, { "epoch": 0.14167539390230988, "grad_norm": 0.17275945842266083, "learning_rate": 1.6818843827635052e-05, "loss": 0.0145, "step": 3010 }, { "epoch": 0.14214607627407835, "grad_norm": 0.1718882918357849, "learning_rate": 1.679651064251444e-05, "loss": 0.0163, "step": 3020 }, { "epoch": 0.1426167586458468, "grad_norm": 0.17124678194522858, "learning_rate": 1.677411427825379e-05, "loss": 0.016, "step": 3030 }, { "epoch": 0.14308744101761528, "grad_norm": 0.13187319040298462, "learning_rate": 1.6751654943045672e-05, "loss": 0.0161, "step": 3040 }, { "epoch": 0.14355812338938376, "grad_norm": 0.17828577756881714, "learning_rate": 1.672913284566803e-05, "loss": 0.0149, "step": 3050 }, { "epoch": 0.14402880576115223, "grad_norm": 0.13027949631214142, "learning_rate": 1.6706548195482222e-05, "loss": 0.0144, "step": 3060 }, { "epoch": 0.1444994881329207, "grad_norm": 0.16512738168239594, "learning_rate": 1.66839012024311e-05, "loss": 0.0162, "step": 3070 }, { "epoch": 0.14497017050468916, "grad_norm": 0.15044449269771576, "learning_rate": 1.666119207703703e-05, "loss": 0.0153, "step": 3080 }, { "epoch": 0.14544085287645764, "grad_norm": 0.17028893530368805, "learning_rate": 1.6638421030399962e-05, "loss": 0.0165, "step": 3090 }, { "epoch": 0.14591153524822612, "grad_norm": 0.1824485957622528, "learning_rate": 1.6615588274195445e-05, "loss": 0.0165, "step": 3100 }, { "epoch": 0.1463822176199946, "grad_norm": 0.13072068989276886, "learning_rate": 1.6592694020672667e-05, "loss": 0.018, "step": 3110 }, { "epoch": 0.14685289999176307, "grad_norm": 0.1269024759531021, "learning_rate": 1.65697384826525e-05, "loss": 0.0141, "step": 3120 }, { "epoch": 0.14732358236353152, "grad_norm": 0.16082976758480072, "learning_rate": 1.6546721873525488e-05, "loss": 0.0156, "step": 3130 }, { "epoch": 0.1477942647353, "grad_norm": 0.19593431055545807, "learning_rate": 1.6523644407249893e-05, "loss": 0.0138, "step": 3140 }, { "epoch": 0.14826494710706847, "grad_norm": 0.1753019541501999, "learning_rate": 1.6500506298349682e-05, "loss": 0.0173, "step": 3150 }, { "epoch": 0.14873562947883695, "grad_norm": 0.16250555217266083, "learning_rate": 1.6477307761912555e-05, "loss": 0.0164, "step": 3160 }, { "epoch": 0.14920631185060543, "grad_norm": 0.1323000192642212, "learning_rate": 1.645404901358794e-05, "loss": 0.0122, "step": 3170 }, { "epoch": 0.14967699422237388, "grad_norm": 0.1453583538532257, "learning_rate": 1.6430730269584963e-05, "loss": 0.0156, "step": 3180 }, { "epoch": 0.15014767659414235, "grad_norm": 0.1393812596797943, "learning_rate": 1.6407351746670484e-05, "loss": 0.0144, "step": 3190 }, { "epoch": 0.15061835896591083, "grad_norm": 0.18590575456619263, "learning_rate": 1.638391366216704e-05, "loss": 0.0147, "step": 3200 }, { "epoch": 0.1510890413376793, "grad_norm": 0.12945562601089478, "learning_rate": 1.636041623395085e-05, "loss": 0.0122, "step": 3210 }, { "epoch": 0.15155972370944779, "grad_norm": 0.1565374732017517, "learning_rate": 1.6336859680449773e-05, "loss": 0.017, "step": 3220 }, { "epoch": 0.15203040608121624, "grad_norm": 0.13287587463855743, "learning_rate": 1.6313244220641304e-05, "loss": 0.0155, "step": 3230 }, { "epoch": 0.1525010884529847, "grad_norm": 0.17368289828300476, "learning_rate": 1.6289570074050492e-05, "loss": 0.0148, "step": 3240 }, { "epoch": 0.1529717708247532, "grad_norm": 0.11012851446866989, "learning_rate": 1.626583746074796e-05, "loss": 0.0161, "step": 3250 }, { "epoch": 0.15344245319652167, "grad_norm": 0.10993830114603043, "learning_rate": 1.6242046601347796e-05, "loss": 0.0134, "step": 3260 }, { "epoch": 0.15391313556829012, "grad_norm": 0.13184867799282074, "learning_rate": 1.6218197717005562e-05, "loss": 0.0153, "step": 3270 }, { "epoch": 0.1543838179400586, "grad_norm": 0.1233256384730339, "learning_rate": 1.6194291029416188e-05, "loss": 0.0166, "step": 3280 }, { "epoch": 0.15485450031182707, "grad_norm": 0.15933425724506378, "learning_rate": 1.617032676081194e-05, "loss": 0.0171, "step": 3290 }, { "epoch": 0.15532518268359555, "grad_norm": 0.19141517579555511, "learning_rate": 1.614630513396035e-05, "loss": 0.0161, "step": 3300 }, { "epoch": 0.15579586505536402, "grad_norm": 0.145005464553833, "learning_rate": 1.6122226372162137e-05, "loss": 0.0166, "step": 3310 }, { "epoch": 0.15626654742713247, "grad_norm": 0.11310192197561264, "learning_rate": 1.6098090699249144e-05, "loss": 0.0198, "step": 3320 }, { "epoch": 0.15673722979890095, "grad_norm": 0.11065463721752167, "learning_rate": 1.607389833958223e-05, "loss": 0.0183, "step": 3330 }, { "epoch": 0.15720791217066943, "grad_norm": 0.12816224992275238, "learning_rate": 1.6049649518049234e-05, "loss": 0.0154, "step": 3340 }, { "epoch": 0.1576785945424379, "grad_norm": 0.10557534545660019, "learning_rate": 1.6025344460062826e-05, "loss": 0.0152, "step": 3350 }, { "epoch": 0.15814927691420638, "grad_norm": 0.1360410898923874, "learning_rate": 1.6000983391558457e-05, "loss": 0.0164, "step": 3360 }, { "epoch": 0.15861995928597483, "grad_norm": 0.11115196347236633, "learning_rate": 1.5976566538992237e-05, "loss": 0.014, "step": 3370 }, { "epoch": 0.1590906416577433, "grad_norm": 0.12190092355012894, "learning_rate": 1.5952094129338834e-05, "loss": 0.0143, "step": 3380 }, { "epoch": 0.1595613240295118, "grad_norm": 0.1640314906835556, "learning_rate": 1.5927566390089362e-05, "loss": 0.0138, "step": 3390 }, { "epoch": 0.16003200640128026, "grad_norm": 0.13104459643363953, "learning_rate": 1.5902983549249272e-05, "loss": 0.0184, "step": 3400 }, { "epoch": 0.16050268877304874, "grad_norm": 0.18852287530899048, "learning_rate": 1.5878345835336232e-05, "loss": 0.0126, "step": 3410 }, { "epoch": 0.1609733711448172, "grad_norm": 0.12984438240528107, "learning_rate": 1.5853653477377996e-05, "loss": 0.0152, "step": 3420 }, { "epoch": 0.16144405351658567, "grad_norm": 0.11026235669851303, "learning_rate": 1.582890670491028e-05, "loss": 0.0159, "step": 3430 }, { "epoch": 0.16191473588835414, "grad_norm": 0.22633954882621765, "learning_rate": 1.5804105747974626e-05, "loss": 0.0163, "step": 3440 }, { "epoch": 0.16238541826012262, "grad_norm": 0.13691726326942444, "learning_rate": 1.5779250837116275e-05, "loss": 0.014, "step": 3450 }, { "epoch": 0.16285610063189107, "grad_norm": 0.12894703447818756, "learning_rate": 1.5754342203382003e-05, "loss": 0.0171, "step": 3460 }, { "epoch": 0.16332678300365955, "grad_norm": 0.10772807151079178, "learning_rate": 1.5729380078317982e-05, "loss": 0.0145, "step": 3470 }, { "epoch": 0.16379746537542803, "grad_norm": 0.1087990403175354, "learning_rate": 1.570436469396764e-05, "loss": 0.0154, "step": 3480 }, { "epoch": 0.1642681477471965, "grad_norm": 0.19474056363105774, "learning_rate": 1.567929628286949e-05, "loss": 0.0177, "step": 3490 }, { "epoch": 0.16473883011896498, "grad_norm": 0.20649202167987823, "learning_rate": 1.5654175078054965e-05, "loss": 0.0149, "step": 3500 }, { "epoch": 0.16520951249073343, "grad_norm": 0.16578665375709534, "learning_rate": 1.562900131304627e-05, "loss": 0.0165, "step": 3510 }, { "epoch": 0.1656801948625019, "grad_norm": 0.13669690489768982, "learning_rate": 1.5603775221854195e-05, "loss": 0.0163, "step": 3520 }, { "epoch": 0.16615087723427038, "grad_norm": 0.12036110460758209, "learning_rate": 1.557849703897594e-05, "loss": 0.0161, "step": 3530 }, { "epoch": 0.16662155960603886, "grad_norm": 0.10850819945335388, "learning_rate": 1.5553166999392954e-05, "loss": 0.0179, "step": 3540 }, { "epoch": 0.16709224197780734, "grad_norm": 0.13059228658676147, "learning_rate": 1.5527785338568718e-05, "loss": 0.0137, "step": 3550 }, { "epoch": 0.1675629243495758, "grad_norm": 0.13332828879356384, "learning_rate": 1.550235229244659e-05, "loss": 0.0159, "step": 3560 }, { "epoch": 0.16803360672134426, "grad_norm": 0.1266319304704666, "learning_rate": 1.5476868097447586e-05, "loss": 0.0164, "step": 3570 }, { "epoch": 0.16850428909311274, "grad_norm": 0.13164667785167694, "learning_rate": 1.5451332990468202e-05, "loss": 0.0145, "step": 3580 }, { "epoch": 0.16897497146488122, "grad_norm": 0.14292672276496887, "learning_rate": 1.5425747208878195e-05, "loss": 0.0174, "step": 3590 }, { "epoch": 0.1694456538366497, "grad_norm": 1.1672334671020508, "learning_rate": 1.5400110990518386e-05, "loss": 0.0175, "step": 3600 }, { "epoch": 0.16991633620841815, "grad_norm": 0.18112429976463318, "learning_rate": 1.5374424573698453e-05, "loss": 0.0161, "step": 3610 }, { "epoch": 0.17038701858018662, "grad_norm": 0.15434442460536957, "learning_rate": 1.5348688197194696e-05, "loss": 0.0152, "step": 3620 }, { "epoch": 0.1708577009519551, "grad_norm": 0.22394922375679016, "learning_rate": 1.532290210024785e-05, "loss": 0.0146, "step": 3630 }, { "epoch": 0.17132838332372358, "grad_norm": 0.13832026720046997, "learning_rate": 1.529706652256083e-05, "loss": 0.0169, "step": 3640 }, { "epoch": 0.17179906569549203, "grad_norm": 0.16990678012371063, "learning_rate": 1.5271181704296513e-05, "loss": 0.0155, "step": 3650 }, { "epoch": 0.1722697480672605, "grad_norm": 0.10266239941120148, "learning_rate": 1.5245247886075518e-05, "loss": 0.0149, "step": 3660 }, { "epoch": 0.17274043043902898, "grad_norm": 0.12167726457118988, "learning_rate": 1.5219265308973952e-05, "loss": 0.0111, "step": 3670 }, { "epoch": 0.17321111281079746, "grad_norm": 0.23519952595233917, "learning_rate": 1.519323421452117e-05, "loss": 0.0206, "step": 3680 }, { "epoch": 0.17368179518256593, "grad_norm": 0.19322697818279266, "learning_rate": 1.5167154844697549e-05, "loss": 0.0137, "step": 3690 }, { "epoch": 0.17415247755433438, "grad_norm": 0.17974957823753357, "learning_rate": 1.5141027441932217e-05, "loss": 0.0161, "step": 3700 }, { "epoch": 0.17462315992610286, "grad_norm": 0.14519457519054413, "learning_rate": 1.5114852249100811e-05, "loss": 0.0174, "step": 3710 }, { "epoch": 0.17509384229787134, "grad_norm": 0.1755804866552353, "learning_rate": 1.5088629509523207e-05, "loss": 0.0163, "step": 3720 }, { "epoch": 0.17556452466963982, "grad_norm": 0.12559199333190918, "learning_rate": 1.5062359466961283e-05, "loss": 0.017, "step": 3730 }, { "epoch": 0.1760352070414083, "grad_norm": 0.09101469814777374, "learning_rate": 1.5036042365616621e-05, "loss": 0.0137, "step": 3740 }, { "epoch": 0.17650588941317674, "grad_norm": 0.2089969366788864, "learning_rate": 1.5009678450128263e-05, "loss": 0.0142, "step": 3750 }, { "epoch": 0.17697657178494522, "grad_norm": 0.21098655462265015, "learning_rate": 1.498326796557042e-05, "loss": 0.0145, "step": 3760 }, { "epoch": 0.1774472541567137, "grad_norm": 0.20048439502716064, "learning_rate": 1.495681115745021e-05, "loss": 0.0163, "step": 3770 }, { "epoch": 0.17791793652848217, "grad_norm": 0.12956009805202484, "learning_rate": 1.4930308271705357e-05, "loss": 0.0152, "step": 3780 }, { "epoch": 0.17838861890025065, "grad_norm": 0.16518688201904297, "learning_rate": 1.4903759554701922e-05, "loss": 0.0138, "step": 3790 }, { "epoch": 0.1788593012720191, "grad_norm": 0.1568118780851364, "learning_rate": 1.4877165253231995e-05, "loss": 0.0142, "step": 3800 }, { "epoch": 0.17932998364378758, "grad_norm": 0.23500652611255646, "learning_rate": 1.4850525614511427e-05, "loss": 0.0171, "step": 3810 }, { "epoch": 0.17980066601555605, "grad_norm": 0.12185561656951904, "learning_rate": 1.4823840886177494e-05, "loss": 0.0163, "step": 3820 }, { "epoch": 0.18027134838732453, "grad_norm": 0.15749108791351318, "learning_rate": 1.4797111316286639e-05, "loss": 0.0174, "step": 3830 }, { "epoch": 0.180742030759093, "grad_norm": 0.1392006129026413, "learning_rate": 1.4770337153312131e-05, "loss": 0.0125, "step": 3840 }, { "epoch": 0.18121271313086146, "grad_norm": 0.16977669298648834, "learning_rate": 1.474351864614177e-05, "loss": 0.015, "step": 3850 }, { "epoch": 0.18168339550262994, "grad_norm": 0.2235575020313263, "learning_rate": 1.4716656044075577e-05, "loss": 0.0162, "step": 3860 }, { "epoch": 0.1821540778743984, "grad_norm": 0.12928543984889984, "learning_rate": 1.468974959682346e-05, "loss": 0.0174, "step": 3870 }, { "epoch": 0.1826247602461669, "grad_norm": 0.22299332916736603, "learning_rate": 1.466279955450292e-05, "loss": 0.0182, "step": 3880 }, { "epoch": 0.18309544261793534, "grad_norm": 0.15691237151622772, "learning_rate": 1.4635806167636698e-05, "loss": 0.0149, "step": 3890 }, { "epoch": 0.18356612498970382, "grad_norm": 0.14020195603370667, "learning_rate": 1.4608769687150459e-05, "loss": 0.0161, "step": 3900 }, { "epoch": 0.1840368073614723, "grad_norm": 0.11680819094181061, "learning_rate": 1.4581690364370466e-05, "loss": 0.0179, "step": 3910 }, { "epoch": 0.18450748973324077, "grad_norm": 0.13952572643756866, "learning_rate": 1.455456845102123e-05, "loss": 0.017, "step": 3920 }, { "epoch": 0.18497817210500925, "grad_norm": 0.20927928388118744, "learning_rate": 1.4527404199223173e-05, "loss": 0.0152, "step": 3930 }, { "epoch": 0.1854488544767777, "grad_norm": 0.16642796993255615, "learning_rate": 1.4500197861490293e-05, "loss": 0.0173, "step": 3940 }, { "epoch": 0.18591953684854617, "grad_norm": 0.14634211361408234, "learning_rate": 1.4472949690727813e-05, "loss": 0.0134, "step": 3950 }, { "epoch": 0.18639021922031465, "grad_norm": 0.1133720800280571, "learning_rate": 1.4445659940229827e-05, "loss": 0.0188, "step": 3960 }, { "epoch": 0.18686090159208313, "grad_norm": 0.18339353799819946, "learning_rate": 1.441832886367694e-05, "loss": 0.0139, "step": 3970 }, { "epoch": 0.1873315839638516, "grad_norm": 0.15965554118156433, "learning_rate": 1.4390956715133928e-05, "loss": 0.0142, "step": 3980 }, { "epoch": 0.18780226633562006, "grad_norm": 0.14258499443531036, "learning_rate": 1.4363543749047354e-05, "loss": 0.0153, "step": 3990 }, { "epoch": 0.18827294870738853, "grad_norm": 0.20728687942028046, "learning_rate": 1.4336090220243222e-05, "loss": 0.0108, "step": 4000 }, { "epoch": 0.188743631079157, "grad_norm": 0.12849122285842896, "learning_rate": 1.4308596383924593e-05, "loss": 0.0163, "step": 4010 }, { "epoch": 0.1892143134509255, "grad_norm": 0.2140861451625824, "learning_rate": 1.4281062495669224e-05, "loss": 0.0151, "step": 4020 }, { "epoch": 0.18968499582269396, "grad_norm": 0.1454402506351471, "learning_rate": 1.4253488811427188e-05, "loss": 0.0146, "step": 4030 }, { "epoch": 0.1901556781944624, "grad_norm": 0.09839578717947006, "learning_rate": 1.4225875587518485e-05, "loss": 0.016, "step": 4040 }, { "epoch": 0.1906263605662309, "grad_norm": 0.1886250078678131, "learning_rate": 1.4198223080630686e-05, "loss": 0.0147, "step": 4050 }, { "epoch": 0.19109704293799937, "grad_norm": 0.1242780089378357, "learning_rate": 1.4170531547816513e-05, "loss": 0.0124, "step": 4060 }, { "epoch": 0.19156772530976784, "grad_norm": 0.14154621958732605, "learning_rate": 1.4142801246491476e-05, "loss": 0.0135, "step": 4070 }, { "epoch": 0.1920384076815363, "grad_norm": 0.14485566318035126, "learning_rate": 1.4115032434431461e-05, "loss": 0.0125, "step": 4080 }, { "epoch": 0.19250909005330477, "grad_norm": 0.10651139169931412, "learning_rate": 1.4087225369770356e-05, "loss": 0.0136, "step": 4090 }, { "epoch": 0.19297977242507325, "grad_norm": 0.1922823041677475, "learning_rate": 1.4059380310997626e-05, "loss": 0.0151, "step": 4100 }, { "epoch": 0.19345045479684173, "grad_norm": 0.1582033485174179, "learning_rate": 1.403149751695593e-05, "loss": 0.0177, "step": 4110 }, { "epoch": 0.1939211371686102, "grad_norm": 0.12224459648132324, "learning_rate": 1.40035772468387e-05, "loss": 0.0164, "step": 4120 }, { "epoch": 0.19439181954037865, "grad_norm": 0.11291183531284332, "learning_rate": 1.3975619760187746e-05, "loss": 0.0127, "step": 4130 }, { "epoch": 0.19486250191214713, "grad_norm": 0.1642821729183197, "learning_rate": 1.3947625316890836e-05, "loss": 0.0154, "step": 4140 }, { "epoch": 0.1953331842839156, "grad_norm": 0.11538895964622498, "learning_rate": 1.3919594177179272e-05, "loss": 0.0142, "step": 4150 }, { "epoch": 0.19580386665568408, "grad_norm": 0.13825342059135437, "learning_rate": 1.3891526601625492e-05, "loss": 0.0136, "step": 4160 }, { "epoch": 0.19627454902745256, "grad_norm": 0.15297387540340424, "learning_rate": 1.3863422851140624e-05, "loss": 0.0148, "step": 4170 }, { "epoch": 0.196745231399221, "grad_norm": 0.19323775172233582, "learning_rate": 1.3835283186972077e-05, "loss": 0.017, "step": 4180 }, { "epoch": 0.1972159137709895, "grad_norm": 0.0956265777349472, "learning_rate": 1.3807107870701102e-05, "loss": 0.0122, "step": 4190 }, { "epoch": 0.19768659614275796, "grad_norm": 0.14077512919902802, "learning_rate": 1.3778897164240378e-05, "loss": 0.0137, "step": 4200 }, { "epoch": 0.19815727851452644, "grad_norm": 0.14754603803157806, "learning_rate": 1.3750651329831548e-05, "loss": 0.0114, "step": 4210 }, { "epoch": 0.19862796088629492, "grad_norm": 0.12012674659490585, "learning_rate": 1.3722370630042809e-05, "loss": 0.0127, "step": 4220 }, { "epoch": 0.19909864325806337, "grad_norm": 0.11501695215702057, "learning_rate": 1.369405532776646e-05, "loss": 0.0127, "step": 4230 }, { "epoch": 0.19956932562983185, "grad_norm": 0.11318056285381317, "learning_rate": 1.3665705686216457e-05, "loss": 0.014, "step": 4240 }, { "epoch": 0.20004000800160032, "grad_norm": 0.1488237977027893, "learning_rate": 1.3637321968925964e-05, "loss": 0.0135, "step": 4250 }, { "epoch": 0.2005106903733688, "grad_norm": 0.17666472494602203, "learning_rate": 1.3608904439744905e-05, "loss": 0.0171, "step": 4260 }, { "epoch": 0.20098137274513725, "grad_norm": 0.20211124420166016, "learning_rate": 1.3580453362837527e-05, "loss": 0.0141, "step": 4270 }, { "epoch": 0.20145205511690573, "grad_norm": 0.12669050693511963, "learning_rate": 1.355196900267992e-05, "loss": 0.0133, "step": 4280 }, { "epoch": 0.2019227374886742, "grad_norm": 0.13243338465690613, "learning_rate": 1.3523451624057566e-05, "loss": 0.0124, "step": 4290 }, { "epoch": 0.20239341986044268, "grad_norm": 0.16948437690734863, "learning_rate": 1.3494901492062889e-05, "loss": 0.0136, "step": 4300 }, { "epoch": 0.20286410223221116, "grad_norm": 0.13935163617134094, "learning_rate": 1.346631887209278e-05, "loss": 0.0147, "step": 4310 }, { "epoch": 0.2033347846039796, "grad_norm": 0.20013917982578278, "learning_rate": 1.343770402984613e-05, "loss": 0.0145, "step": 4320 }, { "epoch": 0.20380546697574808, "grad_norm": 0.1919165402650833, "learning_rate": 1.3409057231321363e-05, "loss": 0.016, "step": 4330 }, { "epoch": 0.20427614934751656, "grad_norm": 0.1194598376750946, "learning_rate": 1.3380378742813964e-05, "loss": 0.0151, "step": 4340 }, { "epoch": 0.20474683171928504, "grad_norm": 0.14729000627994537, "learning_rate": 1.3351668830914004e-05, "loss": 0.0118, "step": 4350 }, { "epoch": 0.20521751409105352, "grad_norm": 0.13926298916339874, "learning_rate": 1.3322927762503656e-05, "loss": 0.0121, "step": 4360 }, { "epoch": 0.20568819646282197, "grad_norm": 0.13776032626628876, "learning_rate": 1.329415580475472e-05, "loss": 0.0118, "step": 4370 }, { "epoch": 0.20615887883459044, "grad_norm": 0.09277930855751038, "learning_rate": 1.3265353225126143e-05, "loss": 0.0141, "step": 4380 }, { "epoch": 0.20662956120635892, "grad_norm": 0.13311536610126495, "learning_rate": 1.3236520291361516e-05, "loss": 0.0144, "step": 4390 }, { "epoch": 0.2071002435781274, "grad_norm": 0.14378204941749573, "learning_rate": 1.3207657271486607e-05, "loss": 0.0153, "step": 4400 }, { "epoch": 0.20757092594989587, "grad_norm": 0.12196729332208633, "learning_rate": 1.3178764433806858e-05, "loss": 0.0137, "step": 4410 }, { "epoch": 0.20804160832166432, "grad_norm": 0.1723850667476654, "learning_rate": 1.3149842046904885e-05, "loss": 0.0148, "step": 4420 }, { "epoch": 0.2085122906934328, "grad_norm": 0.11916273087263107, "learning_rate": 1.3120890379637996e-05, "loss": 0.0144, "step": 4430 }, { "epoch": 0.20898297306520128, "grad_norm": 0.09409532696008682, "learning_rate": 1.3091909701135676e-05, "loss": 0.0172, "step": 4440 }, { "epoch": 0.20945365543696975, "grad_norm": 0.17635875940322876, "learning_rate": 1.3062900280797104e-05, "loss": 0.0167, "step": 4450 }, { "epoch": 0.20992433780873823, "grad_norm": 0.16211175918579102, "learning_rate": 1.3033862388288628e-05, "loss": 0.0133, "step": 4460 }, { "epoch": 0.21039502018050668, "grad_norm": 0.16622816026210785, "learning_rate": 1.3004796293541269e-05, "loss": 0.0152, "step": 4470 }, { "epoch": 0.21086570255227516, "grad_norm": 0.1458081603050232, "learning_rate": 1.297570226674822e-05, "loss": 0.011, "step": 4480 }, { "epoch": 0.21133638492404364, "grad_norm": 0.1774962693452835, "learning_rate": 1.294658057836232e-05, "loss": 0.0169, "step": 4490 }, { "epoch": 0.2118070672958121, "grad_norm": 0.1504325121641159, "learning_rate": 1.2917431499093538e-05, "loss": 0.0163, "step": 4500 }, { "epoch": 0.21227774966758056, "grad_norm": 0.15470466017723083, "learning_rate": 1.288825529990647e-05, "loss": 0.0149, "step": 4510 }, { "epoch": 0.21274843203934904, "grad_norm": 0.14937037229537964, "learning_rate": 1.2859052252017824e-05, "loss": 0.0136, "step": 4520 }, { "epoch": 0.21321911441111752, "grad_norm": 0.1489335000514984, "learning_rate": 1.2829822626893867e-05, "loss": 0.0147, "step": 4530 }, { "epoch": 0.213689796782886, "grad_norm": 0.1527571827173233, "learning_rate": 1.2800566696247943e-05, "loss": 0.013, "step": 4540 }, { "epoch": 0.21416047915465447, "grad_norm": 0.159316748380661, "learning_rate": 1.2771284732037912e-05, "loss": 0.0164, "step": 4550 }, { "epoch": 0.21463116152642292, "grad_norm": 0.16514410078525543, "learning_rate": 1.274197700646365e-05, "loss": 0.014, "step": 4560 }, { "epoch": 0.2151018438981914, "grad_norm": 0.17233633995056152, "learning_rate": 1.2712643791964501e-05, "loss": 0.0125, "step": 4570 }, { "epoch": 0.21557252626995987, "grad_norm": 0.13130588829517365, "learning_rate": 1.2683285361216745e-05, "loss": 0.0113, "step": 4580 }, { "epoch": 0.21604320864172835, "grad_norm": 0.12724335491657257, "learning_rate": 1.2653901987131074e-05, "loss": 0.0158, "step": 4590 }, { "epoch": 0.21651389101349683, "grad_norm": 0.17174789309501648, "learning_rate": 1.262449394285005e-05, "loss": 0.0131, "step": 4600 }, { "epoch": 0.21698457338526528, "grad_norm": 0.13818171620368958, "learning_rate": 1.2595061501745556e-05, "loss": 0.0187, "step": 4610 }, { "epoch": 0.21745525575703376, "grad_norm": 0.18787473440170288, "learning_rate": 1.2565604937416267e-05, "loss": 0.012, "step": 4620 }, { "epoch": 0.21792593812880223, "grad_norm": 0.1109204962849617, "learning_rate": 1.2536124523685114e-05, "loss": 0.0127, "step": 4630 }, { "epoch": 0.2183966205005707, "grad_norm": 0.1355179399251938, "learning_rate": 1.2506620534596711e-05, "loss": 0.015, "step": 4640 }, { "epoch": 0.2188673028723392, "grad_norm": 0.1300441324710846, "learning_rate": 1.247709324441483e-05, "loss": 0.0108, "step": 4650 }, { "epoch": 0.21933798524410764, "grad_norm": 0.16225352883338928, "learning_rate": 1.2447542927619857e-05, "loss": 0.0134, "step": 4660 }, { "epoch": 0.2198086676158761, "grad_norm": 0.15157760679721832, "learning_rate": 1.2417969858906214e-05, "loss": 0.0128, "step": 4670 }, { "epoch": 0.2202793499876446, "grad_norm": 0.12344833463430405, "learning_rate": 1.2388374313179828e-05, "loss": 0.0132, "step": 4680 }, { "epoch": 0.22075003235941307, "grad_norm": 0.12354397028684616, "learning_rate": 1.2358756565555563e-05, "loss": 0.0124, "step": 4690 }, { "epoch": 0.22122071473118152, "grad_norm": 0.12995843589305878, "learning_rate": 1.2329116891354677e-05, "loss": 0.0124, "step": 4700 }, { "epoch": 0.22169139710295, "grad_norm": 0.17049497365951538, "learning_rate": 1.2299455566102248e-05, "loss": 0.0128, "step": 4710 }, { "epoch": 0.22216207947471847, "grad_norm": 0.14437657594680786, "learning_rate": 1.2269772865524612e-05, "loss": 0.0107, "step": 4720 }, { "epoch": 0.22263276184648695, "grad_norm": 0.139217808842659, "learning_rate": 1.2240069065546823e-05, "loss": 0.0162, "step": 4730 }, { "epoch": 0.22310344421825543, "grad_norm": 0.106772281229496, "learning_rate": 1.2210344442290054e-05, "loss": 0.0125, "step": 4740 }, { "epoch": 0.22357412659002388, "grad_norm": 0.20194077491760254, "learning_rate": 1.2180599272069058e-05, "loss": 0.0136, "step": 4750 }, { "epoch": 0.22404480896179235, "grad_norm": 0.15950024127960205, "learning_rate": 1.215083383138958e-05, "loss": 0.0128, "step": 4760 }, { "epoch": 0.22451549133356083, "grad_norm": 0.13688400387763977, "learning_rate": 1.2121048396945807e-05, "loss": 0.0157, "step": 4770 }, { "epoch": 0.2249861737053293, "grad_norm": 0.12554797530174255, "learning_rate": 1.2091243245617774e-05, "loss": 0.0164, "step": 4780 }, { "epoch": 0.22545685607709778, "grad_norm": 0.1297275424003601, "learning_rate": 1.2061418654468808e-05, "loss": 0.0135, "step": 4790 }, { "epoch": 0.22592753844886623, "grad_norm": 0.13926266133785248, "learning_rate": 1.203157490074294e-05, "loss": 0.0129, "step": 4800 }, { "epoch": 0.2263982208206347, "grad_norm": 0.12042047083377838, "learning_rate": 1.2001712261862335e-05, "loss": 0.0175, "step": 4810 }, { "epoch": 0.2268689031924032, "grad_norm": 0.16293025016784668, "learning_rate": 1.1971831015424713e-05, "loss": 0.0154, "step": 4820 }, { "epoch": 0.22733958556417166, "grad_norm": 0.11358948051929474, "learning_rate": 1.194193143920076e-05, "loss": 0.0129, "step": 4830 }, { "epoch": 0.22781026793594014, "grad_norm": 0.1202591136097908, "learning_rate": 1.1912013811131562e-05, "loss": 0.0136, "step": 4840 }, { "epoch": 0.2282809503077086, "grad_norm": 0.12600010633468628, "learning_rate": 1.1882078409326003e-05, "loss": 0.0114, "step": 4850 }, { "epoch": 0.22875163267947707, "grad_norm": 0.14152638614177704, "learning_rate": 1.1852125512058194e-05, "loss": 0.0161, "step": 4860 }, { "epoch": 0.22922231505124555, "grad_norm": 0.11345827579498291, "learning_rate": 1.1822155397764873e-05, "loss": 0.0152, "step": 4870 }, { "epoch": 0.22969299742301402, "grad_norm": 0.14379754662513733, "learning_rate": 1.179216834504284e-05, "loss": 0.0143, "step": 4880 }, { "epoch": 0.2301636797947825, "grad_norm": 0.12794838845729828, "learning_rate": 1.1762164632646334e-05, "loss": 0.0127, "step": 4890 }, { "epoch": 0.23063436216655095, "grad_norm": 0.10142932832241058, "learning_rate": 1.1732144539484467e-05, "loss": 0.011, "step": 4900 }, { "epoch": 0.23110504453831943, "grad_norm": 0.1283886730670929, "learning_rate": 1.1702108344618627e-05, "loss": 0.0112, "step": 4910 }, { "epoch": 0.2315757269100879, "grad_norm": 0.1828288584947586, "learning_rate": 1.1672056327259876e-05, "loss": 0.0139, "step": 4920 }, { "epoch": 0.23204640928185638, "grad_norm": 0.10254240781068802, "learning_rate": 1.1641988766766359e-05, "loss": 0.0088, "step": 4930 }, { "epoch": 0.23251709165362483, "grad_norm": 0.12948806583881378, "learning_rate": 1.1611905942640707e-05, "loss": 0.0136, "step": 4940 }, { "epoch": 0.2329877740253933, "grad_norm": 0.11084115505218506, "learning_rate": 1.1581808134527443e-05, "loss": 0.0125, "step": 4950 }, { "epoch": 0.23345845639716178, "grad_norm": 0.14607097208499908, "learning_rate": 1.1551695622210377e-05, "loss": 0.0152, "step": 4960 }, { "epoch": 0.23392913876893026, "grad_norm": 0.1159360483288765, "learning_rate": 1.1521568685610003e-05, "loss": 0.0126, "step": 4970 }, { "epoch": 0.23439982114069874, "grad_norm": 0.1760568469762802, "learning_rate": 1.1491427604780898e-05, "loss": 0.0119, "step": 4980 }, { "epoch": 0.2348705035124672, "grad_norm": 0.12127764523029327, "learning_rate": 1.1461272659909137e-05, "loss": 0.0148, "step": 4990 }, { "epoch": 0.23534118588423567, "grad_norm": 0.25472477078437805, "learning_rate": 1.1431104131309654e-05, "loss": 0.0168, "step": 5000 }, { "epoch": 0.23581186825600414, "grad_norm": 0.16803275048732758, "learning_rate": 1.1400922299423663e-05, "loss": 0.0156, "step": 5010 }, { "epoch": 0.23628255062777262, "grad_norm": 0.15267884731292725, "learning_rate": 1.1370727444816045e-05, "loss": 0.013, "step": 5020 }, { "epoch": 0.2367532329995411, "grad_norm": 0.16479924321174622, "learning_rate": 1.1340519848172735e-05, "loss": 0.0177, "step": 5030 }, { "epoch": 0.23722391537130955, "grad_norm": 0.12479148060083389, "learning_rate": 1.1310299790298118e-05, "loss": 0.0134, "step": 5040 }, { "epoch": 0.23769459774307802, "grad_norm": 0.12104110419750214, "learning_rate": 1.1280067552112408e-05, "loss": 0.0125, "step": 5050 }, { "epoch": 0.2381652801148465, "grad_norm": 0.13121286034584045, "learning_rate": 1.124982341464906e-05, "loss": 0.0176, "step": 5060 }, { "epoch": 0.23863596248661498, "grad_norm": 0.16334620118141174, "learning_rate": 1.1219567659052126e-05, "loss": 0.0119, "step": 5070 }, { "epoch": 0.23910664485838345, "grad_norm": 0.1453070193529129, "learning_rate": 1.118930056657367e-05, "loss": 0.0106, "step": 5080 }, { "epoch": 0.2395773272301519, "grad_norm": 0.11464358121156693, "learning_rate": 1.115902241857114e-05, "loss": 0.0147, "step": 5090 }, { "epoch": 0.24004800960192038, "grad_norm": 0.09648099541664124, "learning_rate": 1.1128733496504751e-05, "loss": 0.0144, "step": 5100 }, { "epoch": 0.24051869197368886, "grad_norm": 0.11938716471195221, "learning_rate": 1.1098434081934871e-05, "loss": 0.0156, "step": 5110 }, { "epoch": 0.24098937434545734, "grad_norm": 0.21396268904209137, "learning_rate": 1.1068124456519402e-05, "loss": 0.0139, "step": 5120 }, { "epoch": 0.24146005671722579, "grad_norm": 0.1103433296084404, "learning_rate": 1.1037804902011175e-05, "loss": 0.0122, "step": 5130 }, { "epoch": 0.24193073908899426, "grad_norm": 0.1658787578344345, "learning_rate": 1.1007475700255313e-05, "loss": 0.016, "step": 5140 }, { "epoch": 0.24240142146076274, "grad_norm": 0.1595233976840973, "learning_rate": 1.0977137133186613e-05, "loss": 0.0114, "step": 5150 }, { "epoch": 0.24287210383253122, "grad_norm": 0.18476787209510803, "learning_rate": 1.094678948282694e-05, "loss": 0.0148, "step": 5160 }, { "epoch": 0.2433427862042997, "grad_norm": 0.16430480778217316, "learning_rate": 1.0916433031282592e-05, "loss": 0.0125, "step": 5170 }, { "epoch": 0.24381346857606814, "grad_norm": 0.13292700052261353, "learning_rate": 1.0886068060741676e-05, "loss": 0.0119, "step": 5180 }, { "epoch": 0.24428415094783662, "grad_norm": 0.11464076489210129, "learning_rate": 1.0855694853471499e-05, "loss": 0.0133, "step": 5190 }, { "epoch": 0.2447548333196051, "grad_norm": 0.12834596633911133, "learning_rate": 1.0825313691815928e-05, "loss": 0.0124, "step": 5200 }, { "epoch": 0.24522551569137357, "grad_norm": 0.15689679980278015, "learning_rate": 1.0794924858192779e-05, "loss": 0.0109, "step": 5210 }, { "epoch": 0.24569619806314205, "grad_norm": 0.14201077818870544, "learning_rate": 1.0764528635091179e-05, "loss": 0.0126, "step": 5220 }, { "epoch": 0.2461668804349105, "grad_norm": 0.1737937331199646, "learning_rate": 1.0734125305068943e-05, "loss": 0.0127, "step": 5230 }, { "epoch": 0.24663756280667898, "grad_norm": 0.1588907390832901, "learning_rate": 1.0703715150749967e-05, "loss": 0.0129, "step": 5240 }, { "epoch": 0.24710824517844746, "grad_norm": 0.14610442519187927, "learning_rate": 1.0673298454821567e-05, "loss": 0.0109, "step": 5250 }, { "epoch": 0.24757892755021593, "grad_norm": 0.15229788422584534, "learning_rate": 1.0642875500031878e-05, "loss": 0.014, "step": 5260 }, { "epoch": 0.2480496099219844, "grad_norm": 0.0892505794763565, "learning_rate": 1.0612446569187214e-05, "loss": 0.012, "step": 5270 }, { "epoch": 0.24852029229375286, "grad_norm": 0.14967739582061768, "learning_rate": 1.058201194514944e-05, "loss": 0.0155, "step": 5280 }, { "epoch": 0.24899097466552134, "grad_norm": 0.09590354561805725, "learning_rate": 1.0551571910833344e-05, "loss": 0.0145, "step": 5290 }, { "epoch": 0.2494616570372898, "grad_norm": 0.16319698095321655, "learning_rate": 1.0521126749204009e-05, "loss": 0.0145, "step": 5300 }, { "epoch": 0.2499323394090583, "grad_norm": 0.14433561265468597, "learning_rate": 1.0490676743274181e-05, "loss": 0.0117, "step": 5310 }, { "epoch": 0.25040302178082674, "grad_norm": 0.17442801594734192, "learning_rate": 1.0460222176101635e-05, "loss": 0.0138, "step": 5320 }, { "epoch": 0.25087370415259524, "grad_norm": 0.13118790090084076, "learning_rate": 1.0429763330786546e-05, "loss": 0.0105, "step": 5330 }, { "epoch": 0.2513443865243637, "grad_norm": 0.1032710149884224, "learning_rate": 1.0399300490468862e-05, "loss": 0.0121, "step": 5340 }, { "epoch": 0.25181506889613214, "grad_norm": 0.1624784916639328, "learning_rate": 1.0368833938325667e-05, "loss": 0.0119, "step": 5350 }, { "epoch": 0.25228575126790065, "grad_norm": 0.1460733860731125, "learning_rate": 1.0338363957568544e-05, "loss": 0.0135, "step": 5360 }, { "epoch": 0.2527564336396691, "grad_norm": 0.13395872712135315, "learning_rate": 1.030789083144095e-05, "loss": 0.0115, "step": 5370 }, { "epoch": 0.2532271160114376, "grad_norm": 0.10274216532707214, "learning_rate": 1.027741484321559e-05, "loss": 0.0128, "step": 5380 }, { "epoch": 0.25369779838320605, "grad_norm": 0.1075226366519928, "learning_rate": 1.024693627619176e-05, "loss": 0.0127, "step": 5390 }, { "epoch": 0.2541684807549745, "grad_norm": 0.1437896490097046, "learning_rate": 1.0216455413692738e-05, "loss": 0.0129, "step": 5400 }, { "epoch": 0.254639163126743, "grad_norm": 0.1418798863887787, "learning_rate": 1.0185972539063139e-05, "loss": 0.0119, "step": 5410 }, { "epoch": 0.25510984549851146, "grad_norm": 0.09212790429592133, "learning_rate": 1.0155487935666277e-05, "loss": 0.0123, "step": 5420 }, { "epoch": 0.25558052787027996, "grad_norm": 0.11548860371112823, "learning_rate": 1.0125001886881543e-05, "loss": 0.012, "step": 5430 }, { "epoch": 0.2560512102420484, "grad_norm": 0.18004415929317474, "learning_rate": 1.0094514676101759e-05, "loss": 0.0132, "step": 5440 }, { "epoch": 0.25652189261381686, "grad_norm": 0.11087123304605484, "learning_rate": 1.0064026586730553e-05, "loss": 0.0126, "step": 5450 }, { "epoch": 0.25699257498558536, "grad_norm": 0.1403873860836029, "learning_rate": 1.0033537902179716e-05, "loss": 0.013, "step": 5460 }, { "epoch": 0.2574632573573538, "grad_norm": 0.112074114382267, "learning_rate": 1.0003048905866577e-05, "loss": 0.0139, "step": 5470 }, { "epoch": 0.2579339397291223, "grad_norm": 0.16918501257896423, "learning_rate": 9.972559881211353e-06, "loss": 0.0126, "step": 5480 }, { "epoch": 0.25840462210089077, "grad_norm": 0.17139802873134613, "learning_rate": 9.942071111634538e-06, "loss": 0.0163, "step": 5490 }, { "epoch": 0.2588753044726592, "grad_norm": 0.13993045687675476, "learning_rate": 9.91158288055425e-06, "loss": 0.0122, "step": 5500 }, { "epoch": 0.2593459868444277, "grad_norm": 0.1661345213651657, "learning_rate": 9.88109547138359e-06, "loss": 0.0132, "step": 5510 }, { "epoch": 0.25981666921619617, "grad_norm": 0.12496617436408997, "learning_rate": 9.850609167528038e-06, "loss": 0.0124, "step": 5520 }, { "epoch": 0.2602873515879647, "grad_norm": 0.11937415599822998, "learning_rate": 9.820124252382784e-06, "loss": 0.0124, "step": 5530 }, { "epoch": 0.2607580339597331, "grad_norm": 0.16182279586791992, "learning_rate": 9.789641009330113e-06, "loss": 0.0133, "step": 5540 }, { "epoch": 0.2612287163315016, "grad_norm": 0.18292196094989777, "learning_rate": 9.759159721736772e-06, "loss": 0.0118, "step": 5550 }, { "epoch": 0.2616993987032701, "grad_norm": 0.05322203412652016, "learning_rate": 9.72868067295132e-06, "loss": 0.0131, "step": 5560 }, { "epoch": 0.26217008107503853, "grad_norm": 0.18301557004451752, "learning_rate": 9.698204146301513e-06, "loss": 0.0143, "step": 5570 }, { "epoch": 0.26264076344680704, "grad_norm": 0.1346985101699829, "learning_rate": 9.667730425091666e-06, "loss": 0.0117, "step": 5580 }, { "epoch": 0.2631114458185755, "grad_norm": 0.15309423208236694, "learning_rate": 9.637259792599997e-06, "loss": 0.0148, "step": 5590 }, { "epoch": 0.26358212819034393, "grad_norm": 0.13511566817760468, "learning_rate": 9.606792532076028e-06, "loss": 0.0127, "step": 5600 }, { "epoch": 0.26405281056211244, "grad_norm": 0.14392079412937164, "learning_rate": 9.576328926737936e-06, "loss": 0.0152, "step": 5610 }, { "epoch": 0.2645234929338809, "grad_norm": 0.138591930270195, "learning_rate": 9.545869259769904e-06, "loss": 0.0123, "step": 5620 }, { "epoch": 0.26499417530564934, "grad_norm": 0.14579860866069794, "learning_rate": 9.515413814319524e-06, "loss": 0.0153, "step": 5630 }, { "epoch": 0.26546485767741784, "grad_norm": 0.12173053622245789, "learning_rate": 9.484962873495137e-06, "loss": 0.0138, "step": 5640 }, { "epoch": 0.2659355400491863, "grad_norm": 0.1084262803196907, "learning_rate": 9.454516720363203e-06, "loss": 0.0119, "step": 5650 }, { "epoch": 0.2664062224209548, "grad_norm": 0.14348000288009644, "learning_rate": 9.424075637945692e-06, "loss": 0.0132, "step": 5660 }, { "epoch": 0.26687690479272325, "grad_norm": 0.20071879029273987, "learning_rate": 9.393639909217423e-06, "loss": 0.0132, "step": 5670 }, { "epoch": 0.2673475871644917, "grad_norm": 0.06778442114591599, "learning_rate": 9.363209817103455e-06, "loss": 0.0117, "step": 5680 }, { "epoch": 0.2678182695362602, "grad_norm": 0.1701139509677887, "learning_rate": 9.332785644476452e-06, "loss": 0.0123, "step": 5690 }, { "epoch": 0.26828895190802865, "grad_norm": 0.10796225070953369, "learning_rate": 9.302367674154043e-06, "loss": 0.0099, "step": 5700 }, { "epoch": 0.26875963427979715, "grad_norm": 0.15651920437812805, "learning_rate": 9.271956188896211e-06, "loss": 0.0125, "step": 5710 }, { "epoch": 0.2692303166515656, "grad_norm": 0.09530292451381683, "learning_rate": 9.241551471402654e-06, "loss": 0.0122, "step": 5720 }, { "epoch": 0.26970099902333405, "grad_norm": 0.11104518175125122, "learning_rate": 9.211153804310146e-06, "loss": 0.0102, "step": 5730 }, { "epoch": 0.27017168139510256, "grad_norm": 0.12510010600090027, "learning_rate": 9.180763470189938e-06, "loss": 0.0145, "step": 5740 }, { "epoch": 0.270642363766871, "grad_norm": 0.2923864424228668, "learning_rate": 9.15038075154511e-06, "loss": 0.0118, "step": 5750 }, { "epoch": 0.2711130461386395, "grad_norm": 0.18113361299037933, "learning_rate": 9.120005930807939e-06, "loss": 0.0126, "step": 5760 }, { "epoch": 0.27158372851040796, "grad_norm": 0.11243024468421936, "learning_rate": 9.0896392903373e-06, "loss": 0.0107, "step": 5770 }, { "epoch": 0.2720544108821764, "grad_norm": 0.1881917119026184, "learning_rate": 9.059281112416017e-06, "loss": 0.0126, "step": 5780 }, { "epoch": 0.2725250932539449, "grad_norm": 0.12101048231124878, "learning_rate": 9.028931679248249e-06, "loss": 0.0109, "step": 5790 }, { "epoch": 0.27299577562571337, "grad_norm": 0.10657214373350143, "learning_rate": 8.998591272956866e-06, "loss": 0.0117, "step": 5800 }, { "epoch": 0.27346645799748187, "grad_norm": 0.1264004409313202, "learning_rate": 8.96826017558083e-06, "loss": 0.013, "step": 5810 }, { "epoch": 0.2739371403692503, "grad_norm": 0.10373389720916748, "learning_rate": 8.937938669072557e-06, "loss": 0.0094, "step": 5820 }, { "epoch": 0.27440782274101877, "grad_norm": 0.11538185179233551, "learning_rate": 8.90762703529532e-06, "loss": 0.013, "step": 5830 }, { "epoch": 0.2748785051127873, "grad_norm": 0.07337260991334915, "learning_rate": 8.877325556020615e-06, "loss": 0.0122, "step": 5840 }, { "epoch": 0.2753491874845557, "grad_norm": 0.15605731308460236, "learning_rate": 8.847034512925536e-06, "loss": 0.016, "step": 5850 }, { "epoch": 0.27581986985632423, "grad_norm": 0.11484365910291672, "learning_rate": 8.816754187590175e-06, "loss": 0.0133, "step": 5860 }, { "epoch": 0.2762905522280927, "grad_norm": 0.12648023664951324, "learning_rate": 8.786484861494984e-06, "loss": 0.0111, "step": 5870 }, { "epoch": 0.27676123459986113, "grad_norm": 0.10003248602151871, "learning_rate": 8.756226816018172e-06, "loss": 0.0093, "step": 5880 }, { "epoch": 0.27723191697162963, "grad_norm": 0.09011956304311752, "learning_rate": 8.725980332433089e-06, "loss": 0.011, "step": 5890 }, { "epoch": 0.2777025993433981, "grad_norm": 0.13490058481693268, "learning_rate": 8.695745691905599e-06, "loss": 0.015, "step": 5900 }, { "epoch": 0.2781732817151666, "grad_norm": 0.12892021238803864, "learning_rate": 8.665523175491484e-06, "loss": 0.0117, "step": 5910 }, { "epoch": 0.27864396408693504, "grad_norm": 0.13579197227954865, "learning_rate": 8.635313064133817e-06, "loss": 0.014, "step": 5920 }, { "epoch": 0.2791146464587035, "grad_norm": 0.13082632422447205, "learning_rate": 8.605115638660356e-06, "loss": 0.0123, "step": 5930 }, { "epoch": 0.279585328830472, "grad_norm": 0.1175336167216301, "learning_rate": 8.57493117978094e-06, "loss": 0.0121, "step": 5940 }, { "epoch": 0.28005601120224044, "grad_norm": 0.15471363067626953, "learning_rate": 8.544759968084863e-06, "loss": 0.0124, "step": 5950 }, { "epoch": 0.28052669357400895, "grad_norm": 0.13702160120010376, "learning_rate": 8.51460228403828e-06, "loss": 0.011, "step": 5960 }, { "epoch": 0.2809973759457774, "grad_norm": 0.20038306713104248, "learning_rate": 8.484458407981601e-06, "loss": 0.014, "step": 5970 }, { "epoch": 0.28146805831754584, "grad_norm": 0.1740778535604477, "learning_rate": 8.454328620126871e-06, "loss": 0.0126, "step": 5980 }, { "epoch": 0.28193874068931435, "grad_norm": 0.11139131337404251, "learning_rate": 8.424213200555171e-06, "loss": 0.0128, "step": 5990 }, { "epoch": 0.2824094230610828, "grad_norm": 0.0976518988609314, "learning_rate": 8.394112429214032e-06, "loss": 0.0111, "step": 6000 }, { "epoch": 0.2828801054328513, "grad_norm": 0.06873276084661484, "learning_rate": 8.364026585914802e-06, "loss": 0.0119, "step": 6010 }, { "epoch": 0.28335078780461975, "grad_norm": 0.09966694563627243, "learning_rate": 8.33395595033007e-06, "loss": 0.0114, "step": 6020 }, { "epoch": 0.2838214701763882, "grad_norm": 0.1292872279882431, "learning_rate": 8.303900801991052e-06, "loss": 0.0133, "step": 6030 }, { "epoch": 0.2842921525481567, "grad_norm": 0.12036121636629105, "learning_rate": 8.273861420285e-06, "loss": 0.0118, "step": 6040 }, { "epoch": 0.28476283491992516, "grad_norm": 0.11319997161626816, "learning_rate": 8.243838084452603e-06, "loss": 0.0097, "step": 6050 }, { "epoch": 0.2852335172916936, "grad_norm": 0.10165838897228241, "learning_rate": 8.213831073585385e-06, "loss": 0.012, "step": 6060 }, { "epoch": 0.2857041996634621, "grad_norm": 0.11226401478052139, "learning_rate": 8.183840666623123e-06, "loss": 0.0139, "step": 6070 }, { "epoch": 0.28617488203523056, "grad_norm": 0.1088690459728241, "learning_rate": 8.153867142351242e-06, "loss": 0.012, "step": 6080 }, { "epoch": 0.28664556440699906, "grad_norm": 0.17496810853481293, "learning_rate": 8.123910779398233e-06, "loss": 0.012, "step": 6090 }, { "epoch": 0.2871162467787675, "grad_norm": 0.14205272495746613, "learning_rate": 8.093971856233051e-06, "loss": 0.0116, "step": 6100 }, { "epoch": 0.28758692915053596, "grad_norm": 0.12823930382728577, "learning_rate": 8.064050651162546e-06, "loss": 0.0134, "step": 6110 }, { "epoch": 0.28805761152230447, "grad_norm": 0.12222576886415482, "learning_rate": 8.034147442328852e-06, "loss": 0.0092, "step": 6120 }, { "epoch": 0.2885282938940729, "grad_norm": 0.1779102385044098, "learning_rate": 8.004262507706819e-06, "loss": 0.0131, "step": 6130 }, { "epoch": 0.2889989762658414, "grad_norm": 0.09277702867984772, "learning_rate": 7.97439612510142e-06, "loss": 0.0121, "step": 6140 }, { "epoch": 0.2894696586376099, "grad_norm": 0.11202935874462128, "learning_rate": 7.944548572145178e-06, "loss": 0.0132, "step": 6150 }, { "epoch": 0.2899403410093783, "grad_norm": 0.16479094326496124, "learning_rate": 7.914720126295572e-06, "loss": 0.013, "step": 6160 }, { "epoch": 0.2904110233811468, "grad_norm": 0.10826072841882706, "learning_rate": 7.884911064832466e-06, "loss": 0.0112, "step": 6170 }, { "epoch": 0.2908817057529153, "grad_norm": 0.11586777865886688, "learning_rate": 7.855121664855535e-06, "loss": 0.0126, "step": 6180 }, { "epoch": 0.2913523881246838, "grad_norm": 0.10926918685436249, "learning_rate": 7.825352203281682e-06, "loss": 0.0112, "step": 6190 }, { "epoch": 0.29182307049645223, "grad_norm": 0.13751401007175446, "learning_rate": 7.79560295684246e-06, "loss": 0.0108, "step": 6200 }, { "epoch": 0.2922937528682207, "grad_norm": 0.0948687419295311, "learning_rate": 7.765874202081516e-06, "loss": 0.0117, "step": 6210 }, { "epoch": 0.2927644352399892, "grad_norm": 0.1486055701971054, "learning_rate": 7.736166215352004e-06, "loss": 0.0115, "step": 6220 }, { "epoch": 0.29323511761175763, "grad_norm": 0.1545831561088562, "learning_rate": 7.706479272814024e-06, "loss": 0.0133, "step": 6230 }, { "epoch": 0.29370579998352614, "grad_norm": 0.12388255447149277, "learning_rate": 7.67681365043205e-06, "loss": 0.0122, "step": 6240 }, { "epoch": 0.2941764823552946, "grad_norm": 0.05917057394981384, "learning_rate": 7.64716962397237e-06, "loss": 0.0099, "step": 6250 }, { "epoch": 0.29464716472706304, "grad_norm": 0.13088931143283844, "learning_rate": 7.617547469000524e-06, "loss": 0.0115, "step": 6260 }, { "epoch": 0.29511784709883154, "grad_norm": 0.08930119127035141, "learning_rate": 7.587947460878731e-06, "loss": 0.0104, "step": 6270 }, { "epoch": 0.2955885294706, "grad_norm": 0.1314438134431839, "learning_rate": 7.5583698747633394e-06, "loss": 0.0119, "step": 6280 }, { "epoch": 0.2960592118423685, "grad_norm": 0.11299990862607956, "learning_rate": 7.528814985602273e-06, "loss": 0.01, "step": 6290 }, { "epoch": 0.29652989421413695, "grad_norm": 0.1425415426492691, "learning_rate": 7.49928306813246e-06, "loss": 0.0139, "step": 6300 }, { "epoch": 0.2970005765859054, "grad_norm": 0.13529139757156372, "learning_rate": 7.4697743968772906e-06, "loss": 0.0122, "step": 6310 }, { "epoch": 0.2974712589576739, "grad_norm": 0.2132551074028015, "learning_rate": 7.440289246144067e-06, "loss": 0.0124, "step": 6320 }, { "epoch": 0.29794194132944235, "grad_norm": 0.10853094607591629, "learning_rate": 7.410827890021444e-06, "loss": 0.0107, "step": 6330 }, { "epoch": 0.29841262370121086, "grad_norm": 0.137883722782135, "learning_rate": 7.381390602376882e-06, "loss": 0.0139, "step": 6340 }, { "epoch": 0.2988833060729793, "grad_norm": 0.1995558887720108, "learning_rate": 7.351977656854118e-06, "loss": 0.0144, "step": 6350 }, { "epoch": 0.29935398844474775, "grad_norm": 0.1346898376941681, "learning_rate": 7.322589326870597e-06, "loss": 0.0106, "step": 6360 }, { "epoch": 0.29982467081651626, "grad_norm": 0.13844065368175507, "learning_rate": 7.293225885614948e-06, "loss": 0.0119, "step": 6370 }, { "epoch": 0.3002953531882847, "grad_norm": 0.16830752789974213, "learning_rate": 7.263887606044437e-06, "loss": 0.0098, "step": 6380 }, { "epoch": 0.3007660355600532, "grad_norm": 0.2335188239812851, "learning_rate": 7.234574760882431e-06, "loss": 0.0119, "step": 6390 }, { "epoch": 0.30123671793182166, "grad_norm": 0.08024924993515015, "learning_rate": 7.205287622615866e-06, "loss": 0.0103, "step": 6400 }, { "epoch": 0.3017074003035901, "grad_norm": 0.14689774811267853, "learning_rate": 7.176026463492711e-06, "loss": 0.0091, "step": 6410 }, { "epoch": 0.3021780826753586, "grad_norm": 0.10744727402925491, "learning_rate": 7.146791555519431e-06, "loss": 0.0123, "step": 6420 }, { "epoch": 0.30264876504712707, "grad_norm": 0.09913976490497589, "learning_rate": 7.117583170458478e-06, "loss": 0.0091, "step": 6430 }, { "epoch": 0.30311944741889557, "grad_norm": 0.15804488956928253, "learning_rate": 7.0884015798257365e-06, "loss": 0.0142, "step": 6440 }, { "epoch": 0.303590129790664, "grad_norm": 0.1074845939874649, "learning_rate": 7.059247054888025e-06, "loss": 0.0118, "step": 6450 }, { "epoch": 0.30406081216243247, "grad_norm": 0.15226592123508453, "learning_rate": 7.030119866660565e-06, "loss": 0.0122, "step": 6460 }, { "epoch": 0.304531494534201, "grad_norm": 0.08823225647211075, "learning_rate": 7.001020285904454e-06, "loss": 0.0104, "step": 6470 }, { "epoch": 0.3050021769059694, "grad_norm": 0.10311225056648254, "learning_rate": 6.971948583124159e-06, "loss": 0.0111, "step": 6480 }, { "epoch": 0.3054728592777379, "grad_norm": 0.1055111214518547, "learning_rate": 6.9429050285650015e-06, "loss": 0.0102, "step": 6490 }, { "epoch": 0.3059435416495064, "grad_norm": 0.11811132729053497, "learning_rate": 6.913889892210631e-06, "loss": 0.0098, "step": 6500 }, { "epoch": 0.30641422402127483, "grad_norm": 0.12295475602149963, "learning_rate": 6.884903443780541e-06, "loss": 0.0104, "step": 6510 }, { "epoch": 0.30688490639304333, "grad_norm": 0.1621255725622177, "learning_rate": 6.8559459527275426e-06, "loss": 0.0129, "step": 6520 }, { "epoch": 0.3073555887648118, "grad_norm": 0.08003263920545578, "learning_rate": 6.827017688235255e-06, "loss": 0.0114, "step": 6530 }, { "epoch": 0.30782627113658023, "grad_norm": 0.09690742194652557, "learning_rate": 6.798118919215625e-06, "loss": 0.0126, "step": 6540 }, { "epoch": 0.30829695350834874, "grad_norm": 0.13386720418930054, "learning_rate": 6.769249914306408e-06, "loss": 0.0134, "step": 6550 }, { "epoch": 0.3087676358801172, "grad_norm": 0.11136613041162491, "learning_rate": 6.740410941868678e-06, "loss": 0.0109, "step": 6560 }, { "epoch": 0.3092383182518857, "grad_norm": 0.13539622724056244, "learning_rate": 6.711602269984339e-06, "loss": 0.0127, "step": 6570 }, { "epoch": 0.30970900062365414, "grad_norm": 0.10886342823505402, "learning_rate": 6.6828241664536145e-06, "loss": 0.0097, "step": 6580 }, { "epoch": 0.3101796829954226, "grad_norm": 0.11577015370130539, "learning_rate": 6.65407689879258e-06, "loss": 0.0103, "step": 6590 }, { "epoch": 0.3106503653671911, "grad_norm": 0.14132943749427795, "learning_rate": 6.625360734230663e-06, "loss": 0.0108, "step": 6600 }, { "epoch": 0.31112104773895954, "grad_norm": 0.10323132574558258, "learning_rate": 6.596675939708166e-06, "loss": 0.0129, "step": 6610 }, { "epoch": 0.31159173011072805, "grad_norm": 0.23846012353897095, "learning_rate": 6.5680227818737695e-06, "loss": 0.0102, "step": 6620 }, { "epoch": 0.3120624124824965, "grad_norm": 0.1398949772119522, "learning_rate": 6.539401527082083e-06, "loss": 0.0121, "step": 6630 }, { "epoch": 0.31253309485426495, "grad_norm": 0.18361139297485352, "learning_rate": 6.510812441391131e-06, "loss": 0.0128, "step": 6640 }, { "epoch": 0.31300377722603345, "grad_norm": 0.09834540635347366, "learning_rate": 6.4822557905599156e-06, "loss": 0.0085, "step": 6650 }, { "epoch": 0.3134744595978019, "grad_norm": 0.1404964029788971, "learning_rate": 6.4537318400459295e-06, "loss": 0.0108, "step": 6660 }, { "epoch": 0.3139451419695704, "grad_norm": 0.10830911993980408, "learning_rate": 6.425240855002674e-06, "loss": 0.0098, "step": 6670 }, { "epoch": 0.31441582434133886, "grad_norm": 0.11447969824075699, "learning_rate": 6.396783100277224e-06, "loss": 0.0126, "step": 6680 }, { "epoch": 0.3148865067131073, "grad_norm": 0.1484939306974411, "learning_rate": 6.368358840407754e-06, "loss": 0.0087, "step": 6690 }, { "epoch": 0.3153571890848758, "grad_norm": 0.11320305615663528, "learning_rate": 6.339968339621056e-06, "loss": 0.0104, "step": 6700 }, { "epoch": 0.31582787145664426, "grad_norm": 0.15289172530174255, "learning_rate": 6.311611861830129e-06, "loss": 0.0146, "step": 6710 }, { "epoch": 0.31629855382841277, "grad_norm": 0.10496920347213745, "learning_rate": 6.283289670631684e-06, "loss": 0.0081, "step": 6720 }, { "epoch": 0.3167692362001812, "grad_norm": 0.09335048496723175, "learning_rate": 6.2550020293037095e-06, "loss": 0.0111, "step": 6730 }, { "epoch": 0.31723991857194966, "grad_norm": 0.09383797645568848, "learning_rate": 6.2267492008030395e-06, "loss": 0.0099, "step": 6740 }, { "epoch": 0.31771060094371817, "grad_norm": 0.11041012406349182, "learning_rate": 6.198531447762875e-06, "loss": 0.009, "step": 6750 }, { "epoch": 0.3181812833154866, "grad_norm": 0.10897303372621536, "learning_rate": 6.1703490324903745e-06, "loss": 0.0102, "step": 6760 }, { "epoch": 0.3186519656872551, "grad_norm": 0.10038303583860397, "learning_rate": 6.142202216964204e-06, "loss": 0.0089, "step": 6770 }, { "epoch": 0.3191226480590236, "grad_norm": 0.13420462608337402, "learning_rate": 6.114091262832087e-06, "loss": 0.0124, "step": 6780 }, { "epoch": 0.319593330430792, "grad_norm": 0.12231487035751343, "learning_rate": 6.0860164314084e-06, "loss": 0.0111, "step": 6790 }, { "epoch": 0.3200640128025605, "grad_norm": 0.16560333967208862, "learning_rate": 6.05797798367173e-06, "loss": 0.0115, "step": 6800 }, { "epoch": 0.320534695174329, "grad_norm": 0.12143757194280624, "learning_rate": 6.029976180262431e-06, "loss": 0.0093, "step": 6810 }, { "epoch": 0.3210053775460975, "grad_norm": 0.08762738108634949, "learning_rate": 6.0020112814802355e-06, "loss": 0.0098, "step": 6820 }, { "epoch": 0.32147605991786593, "grad_norm": 0.1156923919916153, "learning_rate": 5.9740835472818145e-06, "loss": 0.0156, "step": 6830 }, { "epoch": 0.3219467422896344, "grad_norm": 0.1309594362974167, "learning_rate": 5.946193237278352e-06, "loss": 0.0115, "step": 6840 }, { "epoch": 0.3224174246614029, "grad_norm": 0.11876383423805237, "learning_rate": 5.918340610733154e-06, "loss": 0.012, "step": 6850 }, { "epoch": 0.32288810703317133, "grad_norm": 0.18021339178085327, "learning_rate": 5.8905259265592315e-06, "loss": 0.0097, "step": 6860 }, { "epoch": 0.32335878940493984, "grad_norm": 0.14171931147575378, "learning_rate": 5.8627494433168756e-06, "loss": 0.0142, "step": 6870 }, { "epoch": 0.3238294717767083, "grad_norm": 0.10388112813234329, "learning_rate": 5.835011419211285e-06, "loss": 0.0116, "step": 6880 }, { "epoch": 0.32430015414847674, "grad_norm": 0.13450077176094055, "learning_rate": 5.807312112090129e-06, "loss": 0.0112, "step": 6890 }, { "epoch": 0.32477083652024524, "grad_norm": 0.1280916929244995, "learning_rate": 5.779651779441192e-06, "loss": 0.0146, "step": 6900 }, { "epoch": 0.3252415188920137, "grad_norm": 0.12483247369527817, "learning_rate": 5.752030678389948e-06, "loss": 0.011, "step": 6910 }, { "epoch": 0.32571220126378214, "grad_norm": 0.09277547895908356, "learning_rate": 5.724449065697182e-06, "loss": 0.0095, "step": 6920 }, { "epoch": 0.32618288363555065, "grad_norm": 0.1311820149421692, "learning_rate": 5.696907197756598e-06, "loss": 0.012, "step": 6930 }, { "epoch": 0.3266535660073191, "grad_norm": 0.10254520922899246, "learning_rate": 5.669405330592457e-06, "loss": 0.0092, "step": 6940 }, { "epoch": 0.3271242483790876, "grad_norm": 0.11941272020339966, "learning_rate": 5.6419437198571525e-06, "loss": 0.0099, "step": 6950 }, { "epoch": 0.32759493075085605, "grad_norm": 0.09633767604827881, "learning_rate": 5.6145226208288875e-06, "loss": 0.0092, "step": 6960 }, { "epoch": 0.3280656131226245, "grad_norm": 0.11132384091615677, "learning_rate": 5.587142288409262e-06, "loss": 0.0083, "step": 6970 }, { "epoch": 0.328536295494393, "grad_norm": 0.15696297585964203, "learning_rate": 5.559802977120918e-06, "loss": 0.0095, "step": 6980 }, { "epoch": 0.32900697786616145, "grad_norm": 0.14272210001945496, "learning_rate": 5.532504941105176e-06, "loss": 0.0116, "step": 6990 }, { "epoch": 0.32947766023792996, "grad_norm": 0.17424431443214417, "learning_rate": 5.505248434119666e-06, "loss": 0.0129, "step": 7000 }, { "epoch": 0.3299483426096984, "grad_norm": 0.11784672737121582, "learning_rate": 5.478033709535968e-06, "loss": 0.0115, "step": 7010 }, { "epoch": 0.33041902498146686, "grad_norm": 0.08876394480466843, "learning_rate": 5.4508610203372794e-06, "loss": 0.0097, "step": 7020 }, { "epoch": 0.33088970735323536, "grad_norm": 1.080922245979309, "learning_rate": 5.42373061911601e-06, "loss": 0.0114, "step": 7030 }, { "epoch": 0.3313603897250038, "grad_norm": 0.10890117287635803, "learning_rate": 5.3966427580715044e-06, "loss": 0.0122, "step": 7040 }, { "epoch": 0.3318310720967723, "grad_norm": 0.09948209673166275, "learning_rate": 5.3695976890076375e-06, "loss": 0.0113, "step": 7050 }, { "epoch": 0.33230175446854077, "grad_norm": 0.11084888875484467, "learning_rate": 5.3425956633305075e-06, "loss": 0.0103, "step": 7060 }, { "epoch": 0.3327724368403092, "grad_norm": 0.13808102905750275, "learning_rate": 5.3156369320460796e-06, "loss": 0.0103, "step": 7070 }, { "epoch": 0.3332431192120777, "grad_norm": 0.10355255007743835, "learning_rate": 5.2887217457578856e-06, "loss": 0.0096, "step": 7080 }, { "epoch": 0.33371380158384617, "grad_norm": 0.10778376460075378, "learning_rate": 5.261850354664633e-06, "loss": 0.0114, "step": 7090 }, { "epoch": 0.3341844839556147, "grad_norm": 0.13922104239463806, "learning_rate": 5.235023008557955e-06, "loss": 0.0134, "step": 7100 }, { "epoch": 0.3346551663273831, "grad_norm": 0.1498662233352661, "learning_rate": 5.20823995682003e-06, "loss": 0.0099, "step": 7110 }, { "epoch": 0.3351258486991516, "grad_norm": 0.11078202724456787, "learning_rate": 5.1815014484212825e-06, "loss": 0.0103, "step": 7120 }, { "epoch": 0.3355965310709201, "grad_norm": 0.1963895857334137, "learning_rate": 5.154807731918081e-06, "loss": 0.0127, "step": 7130 }, { "epoch": 0.33606721344268853, "grad_norm": 0.13531900942325592, "learning_rate": 5.1281590554504095e-06, "loss": 0.0106, "step": 7140 }, { "epoch": 0.33653789581445703, "grad_norm": 0.12432403117418289, "learning_rate": 5.101555666739563e-06, "loss": 0.0115, "step": 7150 }, { "epoch": 0.3370085781862255, "grad_norm": 0.08359402418136597, "learning_rate": 5.074997813085873e-06, "loss": 0.0097, "step": 7160 }, { "epoch": 0.33747926055799393, "grad_norm": 0.1503474861383438, "learning_rate": 5.048485741366351e-06, "loss": 0.0113, "step": 7170 }, { "epoch": 0.33794994292976244, "grad_norm": 0.12229029089212418, "learning_rate": 5.0220196980324545e-06, "loss": 0.0108, "step": 7180 }, { "epoch": 0.3384206253015309, "grad_norm": 0.09739870578050613, "learning_rate": 4.995599929107758e-06, "loss": 0.0133, "step": 7190 }, { "epoch": 0.3388913076732994, "grad_norm": 0.11581404507160187, "learning_rate": 4.9692266801856815e-06, "loss": 0.0116, "step": 7200 }, { "epoch": 0.33936199004506784, "grad_norm": 0.11786969751119614, "learning_rate": 4.942900196427195e-06, "loss": 0.0107, "step": 7210 }, { "epoch": 0.3398326724168363, "grad_norm": 0.13166283071041107, "learning_rate": 4.916620722558568e-06, "loss": 0.011, "step": 7220 }, { "epoch": 0.3403033547886048, "grad_norm": 0.10356545448303223, "learning_rate": 4.8903885028690454e-06, "loss": 0.0122, "step": 7230 }, { "epoch": 0.34077403716037324, "grad_norm": 0.10164371132850647, "learning_rate": 4.864203781208632e-06, "loss": 0.0108, "step": 7240 }, { "epoch": 0.34124471953214175, "grad_norm": 0.14791904389858246, "learning_rate": 4.838066800985786e-06, "loss": 0.0108, "step": 7250 }, { "epoch": 0.3417154019039102, "grad_norm": 0.17711718380451202, "learning_rate": 4.811977805165174e-06, "loss": 0.0114, "step": 7260 }, { "epoch": 0.34218608427567865, "grad_norm": 0.12444761395454407, "learning_rate": 4.7859370362654045e-06, "loss": 0.0125, "step": 7270 }, { "epoch": 0.34265676664744715, "grad_norm": 0.11174033582210541, "learning_rate": 4.75994473635678e-06, "loss": 0.011, "step": 7280 }, { "epoch": 0.3431274490192156, "grad_norm": 0.10445983707904816, "learning_rate": 4.7340011470590415e-06, "loss": 0.0126, "step": 7290 }, { "epoch": 0.34359813139098405, "grad_norm": 0.09001205861568451, "learning_rate": 4.708106509539134e-06, "loss": 0.0107, "step": 7300 }, { "epoch": 0.34406881376275256, "grad_norm": 0.09991347044706345, "learning_rate": 4.682261064508944e-06, "loss": 0.0097, "step": 7310 }, { "epoch": 0.344539496134521, "grad_norm": 0.16420741379261017, "learning_rate": 4.656465052223079e-06, "loss": 0.0135, "step": 7320 }, { "epoch": 0.3450101785062895, "grad_norm": 0.1705462783575058, "learning_rate": 4.630718712476628e-06, "loss": 0.0124, "step": 7330 }, { "epoch": 0.34548086087805796, "grad_norm": 0.17490284144878387, "learning_rate": 4.6050222846029315e-06, "loss": 0.0104, "step": 7340 }, { "epoch": 0.3459515432498264, "grad_norm": 0.08906587213277817, "learning_rate": 4.5793760074713565e-06, "loss": 0.0111, "step": 7350 }, { "epoch": 0.3464222256215949, "grad_norm": 0.10752348601818085, "learning_rate": 4.553780119485093e-06, "loss": 0.0115, "step": 7360 }, { "epoch": 0.34689290799336336, "grad_norm": 0.16819818317890167, "learning_rate": 4.528234858578894e-06, "loss": 0.0106, "step": 7370 }, { "epoch": 0.34736359036513187, "grad_norm": 0.122901052236557, "learning_rate": 4.502740462216919e-06, "loss": 0.0098, "step": 7380 }, { "epoch": 0.3478342727369003, "grad_norm": 0.11220543831586838, "learning_rate": 4.477297167390487e-06, "loss": 0.0111, "step": 7390 }, { "epoch": 0.34830495510866877, "grad_norm": 0.09675250947475433, "learning_rate": 4.451905210615889e-06, "loss": 0.0105, "step": 7400 }, { "epoch": 0.3487756374804373, "grad_norm": 0.12329498678445816, "learning_rate": 4.426564827932185e-06, "loss": 0.011, "step": 7410 }, { "epoch": 0.3492463198522057, "grad_norm": 0.14219404757022858, "learning_rate": 4.401276254899014e-06, "loss": 0.0106, "step": 7420 }, { "epoch": 0.3497170022239742, "grad_norm": 0.14380919933319092, "learning_rate": 4.3760397265943965e-06, "loss": 0.0114, "step": 7430 }, { "epoch": 0.3501876845957427, "grad_norm": 0.09598112851381302, "learning_rate": 4.350855477612565e-06, "loss": 0.0103, "step": 7440 }, { "epoch": 0.3506583669675111, "grad_norm": 0.14553220570087433, "learning_rate": 4.325723742061767e-06, "loss": 0.0104, "step": 7450 }, { "epoch": 0.35112904933927963, "grad_norm": 0.12169907987117767, "learning_rate": 4.30064475356209e-06, "loss": 0.0099, "step": 7460 }, { "epoch": 0.3515997317110481, "grad_norm": 0.15040737390518188, "learning_rate": 4.275618745243301e-06, "loss": 0.0115, "step": 7470 }, { "epoch": 0.3520704140828166, "grad_norm": 0.15188813209533691, "learning_rate": 4.2506459497426685e-06, "loss": 0.0114, "step": 7480 }, { "epoch": 0.35254109645458503, "grad_norm": 0.12288067489862442, "learning_rate": 4.225726599202808e-06, "loss": 0.0105, "step": 7490 }, { "epoch": 0.3530117788263535, "grad_norm": 0.13514454662799835, "learning_rate": 4.200860925269519e-06, "loss": 0.0096, "step": 7500 }, { "epoch": 0.353482461198122, "grad_norm": 0.07675524801015854, "learning_rate": 4.176049159089626e-06, "loss": 0.0108, "step": 7510 }, { "epoch": 0.35395314356989044, "grad_norm": 0.13065500557422638, "learning_rate": 4.1512915313088505e-06, "loss": 0.0113, "step": 7520 }, { "epoch": 0.35442382594165894, "grad_norm": 0.1256616860628128, "learning_rate": 4.126588272069645e-06, "loss": 0.014, "step": 7530 }, { "epoch": 0.3548945083134274, "grad_norm": 0.09628811478614807, "learning_rate": 4.101939611009059e-06, "loss": 0.0075, "step": 7540 }, { "epoch": 0.35536519068519584, "grad_norm": 0.17647704482078552, "learning_rate": 4.077345777256614e-06, "loss": 0.0127, "step": 7550 }, { "epoch": 0.35583587305696435, "grad_norm": 0.09051442891359329, "learning_rate": 4.052806999432161e-06, "loss": 0.0129, "step": 7560 }, { "epoch": 0.3563065554287328, "grad_norm": 0.08692386746406555, "learning_rate": 4.028323505643762e-06, "loss": 0.0074, "step": 7570 }, { "epoch": 0.3567772378005013, "grad_norm": 0.12106984108686447, "learning_rate": 4.003895523485575e-06, "loss": 0.009, "step": 7580 }, { "epoch": 0.35724792017226975, "grad_norm": 0.08121156692504883, "learning_rate": 3.979523280035723e-06, "loss": 0.0129, "step": 7590 }, { "epoch": 0.3577186025440382, "grad_norm": 0.10249274224042892, "learning_rate": 3.955207001854197e-06, "loss": 0.0116, "step": 7600 }, { "epoch": 0.3581892849158067, "grad_norm": 0.11653947830200195, "learning_rate": 3.930946914980744e-06, "loss": 0.0114, "step": 7610 }, { "epoch": 0.35865996728757515, "grad_norm": 0.11900132894515991, "learning_rate": 3.906743244932767e-06, "loss": 0.0097, "step": 7620 }, { "epoch": 0.35913064965934366, "grad_norm": 0.11373395472764969, "learning_rate": 3.882596216703226e-06, "loss": 0.0135, "step": 7630 }, { "epoch": 0.3596013320311121, "grad_norm": 0.12335237860679626, "learning_rate": 3.858506054758547e-06, "loss": 0.0085, "step": 7640 }, { "epoch": 0.36007201440288056, "grad_norm": 0.15550148487091064, "learning_rate": 3.834472983036551e-06, "loss": 0.0101, "step": 7650 }, { "epoch": 0.36054269677464906, "grad_norm": 0.16644270718097687, "learning_rate": 3.8104972249443417e-06, "loss": 0.0096, "step": 7660 }, { "epoch": 0.3610133791464175, "grad_norm": 0.11090733855962753, "learning_rate": 3.7865790033562532e-06, "loss": 0.0104, "step": 7670 }, { "epoch": 0.361484061518186, "grad_norm": 0.15120545029640198, "learning_rate": 3.7627185406117707e-06, "loss": 0.0118, "step": 7680 }, { "epoch": 0.36195474388995447, "grad_norm": 0.12989024817943573, "learning_rate": 3.738916058513462e-06, "loss": 0.0096, "step": 7690 }, { "epoch": 0.3624254262617229, "grad_norm": 0.10671823471784592, "learning_rate": 3.7151717783249175e-06, "loss": 0.0114, "step": 7700 }, { "epoch": 0.3628961086334914, "grad_norm": 0.15751130878925323, "learning_rate": 3.6914859207686916e-06, "loss": 0.0101, "step": 7710 }, { "epoch": 0.36336679100525987, "grad_norm": 0.17635993659496307, "learning_rate": 3.6678587060242586e-06, "loss": 0.0123, "step": 7720 }, { "epoch": 0.3638374733770283, "grad_norm": 0.125935360789299, "learning_rate": 3.6442903537259556e-06, "loss": 0.0091, "step": 7730 }, { "epoch": 0.3643081557487968, "grad_norm": 0.10976330190896988, "learning_rate": 3.6207810829609414e-06, "loss": 0.0118, "step": 7740 }, { "epoch": 0.3647788381205653, "grad_norm": 0.11178203672170639, "learning_rate": 3.5973311122671695e-06, "loss": 0.0093, "step": 7750 }, { "epoch": 0.3652495204923338, "grad_norm": 0.10673002153635025, "learning_rate": 3.5739406596313474e-06, "loss": 0.0091, "step": 7760 }, { "epoch": 0.36572020286410223, "grad_norm": 0.14696449041366577, "learning_rate": 3.5506099424869133e-06, "loss": 0.0096, "step": 7770 }, { "epoch": 0.3661908852358707, "grad_norm": 0.12052320688962936, "learning_rate": 3.5273391777120136e-06, "loss": 0.0122, "step": 7780 }, { "epoch": 0.3666615676076392, "grad_norm": 0.11616943776607513, "learning_rate": 3.504128581627497e-06, "loss": 0.0108, "step": 7790 }, { "epoch": 0.36713224997940763, "grad_norm": 0.1317727267742157, "learning_rate": 3.480978369994885e-06, "loss": 0.0107, "step": 7800 }, { "epoch": 0.36760293235117614, "grad_norm": 0.10772315412759781, "learning_rate": 3.4578887580143793e-06, "loss": 0.0115, "step": 7810 }, { "epoch": 0.3680736147229446, "grad_norm": 0.11397389322519302, "learning_rate": 3.4348599603228584e-06, "loss": 0.0111, "step": 7820 }, { "epoch": 0.36854429709471304, "grad_norm": 0.1395464688539505, "learning_rate": 3.411892190991882e-06, "loss": 0.0101, "step": 7830 }, { "epoch": 0.36901497946648154, "grad_norm": 0.08767911046743393, "learning_rate": 3.3889856635257024e-06, "loss": 0.0101, "step": 7840 }, { "epoch": 0.36948566183825, "grad_norm": 0.151472270488739, "learning_rate": 3.366140590859276e-06, "loss": 0.0105, "step": 7850 }, { "epoch": 0.3699563442100185, "grad_norm": 0.09505433589220047, "learning_rate": 3.343357185356284e-06, "loss": 0.0105, "step": 7860 }, { "epoch": 0.37042702658178694, "grad_norm": 0.10559482127428055, "learning_rate": 3.3206356588071733e-06, "loss": 0.0088, "step": 7870 }, { "epoch": 0.3708977089535554, "grad_norm": 0.2156754434108734, "learning_rate": 3.2979762224271616e-06, "loss": 0.0116, "step": 7880 }, { "epoch": 0.3713683913253239, "grad_norm": 0.22779065370559692, "learning_rate": 3.275379086854292e-06, "loss": 0.0118, "step": 7890 }, { "epoch": 0.37183907369709235, "grad_norm": 0.11923494189977646, "learning_rate": 3.252844462147472e-06, "loss": 0.0097, "step": 7900 }, { "epoch": 0.37230975606886085, "grad_norm": 0.1430639922618866, "learning_rate": 3.230372557784518e-06, "loss": 0.0094, "step": 7910 }, { "epoch": 0.3727804384406293, "grad_norm": 0.13378025591373444, "learning_rate": 3.2079635826602053e-06, "loss": 0.0112, "step": 7920 }, { "epoch": 0.37325112081239775, "grad_norm": 0.07031971216201782, "learning_rate": 3.185617745084343e-06, "loss": 0.0079, "step": 7930 }, { "epoch": 0.37372180318416626, "grad_norm": 0.15737880766391754, "learning_rate": 3.163335252779811e-06, "loss": 0.0112, "step": 7940 }, { "epoch": 0.3741924855559347, "grad_norm": 0.1345309615135193, "learning_rate": 3.1411163128806497e-06, "loss": 0.0093, "step": 7950 }, { "epoch": 0.3746631679277032, "grad_norm": 0.12266458570957184, "learning_rate": 3.118961131930127e-06, "loss": 0.0109, "step": 7960 }, { "epoch": 0.37513385029947166, "grad_norm": 0.16126354038715363, "learning_rate": 3.0968699158788185e-06, "loss": 0.0096, "step": 7970 }, { "epoch": 0.3756045326712401, "grad_norm": 0.16549748182296753, "learning_rate": 3.0748428700826938e-06, "loss": 0.0095, "step": 7980 }, { "epoch": 0.3760752150430086, "grad_norm": 0.11139413714408875, "learning_rate": 3.0528801993012056e-06, "loss": 0.01, "step": 7990 }, { "epoch": 0.37654589741477706, "grad_norm": 0.08979491144418716, "learning_rate": 3.0309821076953893e-06, "loss": 0.0106, "step": 8000 }, { "epoch": 0.37701657978654557, "grad_norm": 0.14120084047317505, "learning_rate": 3.0091487988259684e-06, "loss": 0.0098, "step": 8010 }, { "epoch": 0.377487262158314, "grad_norm": 0.13351234793663025, "learning_rate": 2.9873804756514513e-06, "loss": 0.0098, "step": 8020 }, { "epoch": 0.37795794453008247, "grad_norm": 0.17762669920921326, "learning_rate": 2.965677340526254e-06, "loss": 0.0099, "step": 8030 }, { "epoch": 0.378428626901851, "grad_norm": 0.12142956256866455, "learning_rate": 2.944039595198814e-06, "loss": 0.0118, "step": 8040 }, { "epoch": 0.3788993092736194, "grad_norm": 0.1348985880613327, "learning_rate": 2.9224674408097207e-06, "loss": 0.0101, "step": 8050 }, { "epoch": 0.3793699916453879, "grad_norm": 0.1335914433002472, "learning_rate": 2.900961077889837e-06, "loss": 0.0113, "step": 8060 }, { "epoch": 0.3798406740171564, "grad_norm": 0.08723060041666031, "learning_rate": 2.879520706358446e-06, "loss": 0.011, "step": 8070 }, { "epoch": 0.3803113563889248, "grad_norm": 0.10504450649023056, "learning_rate": 2.8581465255213834e-06, "loss": 0.0102, "step": 8080 }, { "epoch": 0.38078203876069333, "grad_norm": 0.15166273713111877, "learning_rate": 2.836838734069187e-06, "loss": 0.0099, "step": 8090 }, { "epoch": 0.3812527211324618, "grad_norm": 0.16700349748134613, "learning_rate": 2.8155975300752524e-06, "loss": 0.0111, "step": 8100 }, { "epoch": 0.3817234035042303, "grad_norm": 0.09663381427526474, "learning_rate": 2.794423110993991e-06, "loss": 0.0134, "step": 8110 }, { "epoch": 0.38219408587599873, "grad_norm": 0.12700210511684418, "learning_rate": 2.7733156736589893e-06, "loss": 0.0119, "step": 8120 }, { "epoch": 0.3826647682477672, "grad_norm": 0.1246623620390892, "learning_rate": 2.7522754142811957e-06, "loss": 0.011, "step": 8130 }, { "epoch": 0.3831354506195357, "grad_norm": 0.13532695174217224, "learning_rate": 2.731302528447063e-06, "loss": 0.0108, "step": 8140 }, { "epoch": 0.38360613299130414, "grad_norm": 0.11454427242279053, "learning_rate": 2.710397211116774e-06, "loss": 0.0081, "step": 8150 }, { "epoch": 0.3840768153630726, "grad_norm": 0.12632547318935394, "learning_rate": 2.6895596566223937e-06, "loss": 0.0123, "step": 8160 }, { "epoch": 0.3845474977348411, "grad_norm": 0.10470175743103027, "learning_rate": 2.66879005866608e-06, "loss": 0.0096, "step": 8170 }, { "epoch": 0.38501818010660954, "grad_norm": 0.12307362258434296, "learning_rate": 2.648088610318278e-06, "loss": 0.0083, "step": 8180 }, { "epoch": 0.38548886247837805, "grad_norm": 0.09618969261646271, "learning_rate": 2.6274555040159265e-06, "loss": 0.0107, "step": 8190 }, { "epoch": 0.3859595448501465, "grad_norm": 0.11208220571279526, "learning_rate": 2.606890931560667e-06, "loss": 0.0121, "step": 8200 }, { "epoch": 0.38643022722191495, "grad_norm": 0.09631264209747314, "learning_rate": 2.5863950841170704e-06, "loss": 0.0112, "step": 8210 }, { "epoch": 0.38690090959368345, "grad_norm": 0.10042539983987808, "learning_rate": 2.5659681522108428e-06, "loss": 0.0115, "step": 8220 }, { "epoch": 0.3873715919654519, "grad_norm": 0.13740061223506927, "learning_rate": 2.5456103257270693e-06, "loss": 0.0115, "step": 8230 }, { "epoch": 0.3878422743372204, "grad_norm": 0.115552619099617, "learning_rate": 2.5253217939084407e-06, "loss": 0.0104, "step": 8240 }, { "epoch": 0.38831295670898885, "grad_norm": 0.1519811451435089, "learning_rate": 2.505102745353499e-06, "loss": 0.0104, "step": 8250 }, { "epoch": 0.3887836390807573, "grad_norm": 0.13424964249134064, "learning_rate": 2.4849533680148787e-06, "loss": 0.0144, "step": 8260 }, { "epoch": 0.3892543214525258, "grad_norm": 0.12041078507900238, "learning_rate": 2.4648738491975745e-06, "loss": 0.0102, "step": 8270 }, { "epoch": 0.38972500382429426, "grad_norm": 0.17060953378677368, "learning_rate": 2.4448643755571687e-06, "loss": 0.0115, "step": 8280 }, { "epoch": 0.39019568619606276, "grad_norm": 0.08639443665742874, "learning_rate": 2.424925133098137e-06, "loss": 0.0097, "step": 8290 }, { "epoch": 0.3906663685678312, "grad_norm": 0.15009009838104248, "learning_rate": 2.4050563071720867e-06, "loss": 0.0112, "step": 8300 }, { "epoch": 0.39113705093959966, "grad_norm": 0.10077055543661118, "learning_rate": 2.3852580824760487e-06, "loss": 0.012, "step": 8310 }, { "epoch": 0.39160773331136817, "grad_norm": 0.14659449458122253, "learning_rate": 2.3655306430507563e-06, "loss": 0.0098, "step": 8320 }, { "epoch": 0.3920784156831366, "grad_norm": 0.1448894590139389, "learning_rate": 2.345874172278939e-06, "loss": 0.0097, "step": 8330 }, { "epoch": 0.3925490980549051, "grad_norm": 0.1371270716190338, "learning_rate": 2.326288852883607e-06, "loss": 0.0081, "step": 8340 }, { "epoch": 0.39301978042667357, "grad_norm": 0.1429033875465393, "learning_rate": 2.306774866926377e-06, "loss": 0.01, "step": 8350 }, { "epoch": 0.393490462798442, "grad_norm": 0.11611126363277435, "learning_rate": 2.287332395805737e-06, "loss": 0.011, "step": 8360 }, { "epoch": 0.3939611451702105, "grad_norm": 0.1048848032951355, "learning_rate": 2.26796162025541e-06, "loss": 0.0085, "step": 8370 }, { "epoch": 0.394431827541979, "grad_norm": 0.09943952411413193, "learning_rate": 2.248662720342637e-06, "loss": 0.0101, "step": 8380 }, { "epoch": 0.3949025099137475, "grad_norm": 0.13675834238529205, "learning_rate": 2.229435875466519e-06, "loss": 0.01, "step": 8390 }, { "epoch": 0.39537319228551593, "grad_norm": 0.2128056287765503, "learning_rate": 2.2102812643563455e-06, "loss": 0.0101, "step": 8400 }, { "epoch": 0.3958438746572844, "grad_norm": 0.11201132833957672, "learning_rate": 2.191199065069941e-06, "loss": 0.0116, "step": 8410 }, { "epoch": 0.3963145570290529, "grad_norm": 0.12792275846004486, "learning_rate": 2.1721894549919863e-06, "loss": 0.0103, "step": 8420 }, { "epoch": 0.39678523940082133, "grad_norm": 0.11139087378978729, "learning_rate": 2.1532526108324047e-06, "loss": 0.0102, "step": 8430 }, { "epoch": 0.39725592177258984, "grad_norm": 0.16245591640472412, "learning_rate": 2.1343887086246893e-06, "loss": 0.01, "step": 8440 }, { "epoch": 0.3977266041443583, "grad_norm": 0.138621985912323, "learning_rate": 2.1155979237242817e-06, "loss": 0.0095, "step": 8450 }, { "epoch": 0.39819728651612674, "grad_norm": 0.15205764770507812, "learning_rate": 2.0968804308069324e-06, "loss": 0.0137, "step": 8460 }, { "epoch": 0.39866796888789524, "grad_norm": 0.16271470487117767, "learning_rate": 2.0782364038670986e-06, "loss": 0.0097, "step": 8470 }, { "epoch": 0.3991386512596637, "grad_norm": 0.12749820947647095, "learning_rate": 2.0596660162162872e-06, "loss": 0.0088, "step": 8480 }, { "epoch": 0.3996093336314322, "grad_norm": 0.07396082580089569, "learning_rate": 2.041169440481493e-06, "loss": 0.0124, "step": 8490 }, { "epoch": 0.40008001600320064, "grad_norm": 0.08212500065565109, "learning_rate": 2.022746848603543e-06, "loss": 0.0087, "step": 8500 }, { "epoch": 0.4005506983749691, "grad_norm": 0.09258861839771271, "learning_rate": 2.0043984118355464e-06, "loss": 0.0115, "step": 8510 }, { "epoch": 0.4010213807467376, "grad_norm": 0.08113981038331985, "learning_rate": 1.986124300741267e-06, "loss": 0.0105, "step": 8520 }, { "epoch": 0.40149206311850605, "grad_norm": 0.12780699133872986, "learning_rate": 1.967924685193552e-06, "loss": 0.0099, "step": 8530 }, { "epoch": 0.4019627454902745, "grad_norm": 0.08436062932014465, "learning_rate": 1.9497997343727513e-06, "loss": 0.0118, "step": 8540 }, { "epoch": 0.402433427862043, "grad_norm": 0.14994527399539948, "learning_rate": 1.9317496167651563e-06, "loss": 0.011, "step": 8550 }, { "epoch": 0.40290411023381145, "grad_norm": 0.14769554138183594, "learning_rate": 1.9137745001613984e-06, "loss": 0.011, "step": 8560 }, { "epoch": 0.40337479260557996, "grad_norm": 0.12459457665681839, "learning_rate": 1.8958745516549382e-06, "loss": 0.009, "step": 8570 }, { "epoch": 0.4038454749773484, "grad_norm": 0.13600511848926544, "learning_rate": 1.8780499376404715e-06, "loss": 0.0126, "step": 8580 }, { "epoch": 0.40431615734911686, "grad_norm": 0.13582831621170044, "learning_rate": 1.8603008238124043e-06, "loss": 0.0069, "step": 8590 }, { "epoch": 0.40478683972088536, "grad_norm": 0.1361217349767685, "learning_rate": 1.842627375163305e-06, "loss": 0.0117, "step": 8600 }, { "epoch": 0.4052575220926538, "grad_norm": 0.1543978601694107, "learning_rate": 1.8250297559823716e-06, "loss": 0.0108, "step": 8610 }, { "epoch": 0.4057282044644223, "grad_norm": 0.0865384042263031, "learning_rate": 1.8075081298539032e-06, "loss": 0.0088, "step": 8620 }, { "epoch": 0.40619888683619076, "grad_norm": 0.10540938377380371, "learning_rate": 1.7900626596557924e-06, "loss": 0.0081, "step": 8630 }, { "epoch": 0.4066695692079592, "grad_norm": 0.13248394429683685, "learning_rate": 1.7726935075579798e-06, "loss": 0.0087, "step": 8640 }, { "epoch": 0.4071402515797277, "grad_norm": 0.08709853887557983, "learning_rate": 1.7554008350209862e-06, "loss": 0.0093, "step": 8650 }, { "epoch": 0.40761093395149617, "grad_norm": 0.10586871206760406, "learning_rate": 1.7381848027943815e-06, "loss": 0.0105, "step": 8660 }, { "epoch": 0.4080816163232647, "grad_norm": 0.1367088407278061, "learning_rate": 1.721045570915304e-06, "loss": 0.0113, "step": 8670 }, { "epoch": 0.4085522986950331, "grad_norm": 0.09104905277490616, "learning_rate": 1.703983298706966e-06, "loss": 0.0097, "step": 8680 }, { "epoch": 0.40902298106680157, "grad_norm": 0.12627150118350983, "learning_rate": 1.6869981447771876e-06, "loss": 0.01, "step": 8690 }, { "epoch": 0.4094936634385701, "grad_norm": 0.13719642162322998, "learning_rate": 1.670090267016895e-06, "loss": 0.0097, "step": 8700 }, { "epoch": 0.4099643458103385, "grad_norm": 0.1349107325077057, "learning_rate": 1.653259822598683e-06, "loss": 0.0101, "step": 8710 }, { "epoch": 0.41043502818210703, "grad_norm": 0.10144147276878357, "learning_rate": 1.6365069679753331e-06, "loss": 0.0079, "step": 8720 }, { "epoch": 0.4109057105538755, "grad_norm": 0.11531395465135574, "learning_rate": 1.619831858878368e-06, "loss": 0.0084, "step": 8730 }, { "epoch": 0.41137639292564393, "grad_norm": 0.1156122088432312, "learning_rate": 1.6032346503166007e-06, "loss": 0.0105, "step": 8740 }, { "epoch": 0.41184707529741243, "grad_norm": 0.10936363786458969, "learning_rate": 1.5867154965746956e-06, "loss": 0.01, "step": 8750 }, { "epoch": 0.4123177576691809, "grad_norm": 0.10973110049962997, "learning_rate": 1.5702745512117323e-06, "loss": 0.0089, "step": 8760 }, { "epoch": 0.4127884400409494, "grad_norm": 0.12633436918258667, "learning_rate": 1.553911967059788e-06, "loss": 0.0099, "step": 8770 }, { "epoch": 0.41325912241271784, "grad_norm": 0.12931127846240997, "learning_rate": 1.537627896222489e-06, "loss": 0.0105, "step": 8780 }, { "epoch": 0.4137298047844863, "grad_norm": 0.10223909467458725, "learning_rate": 1.5214224900736375e-06, "loss": 0.0111, "step": 8790 }, { "epoch": 0.4142004871562548, "grad_norm": 0.07914174348115921, "learning_rate": 1.5052958992557687e-06, "loss": 0.0117, "step": 8800 }, { "epoch": 0.41467116952802324, "grad_norm": 0.13701507449150085, "learning_rate": 1.4892482736787717e-06, "loss": 0.0097, "step": 8810 }, { "epoch": 0.41514185189979175, "grad_norm": 0.11646740883588791, "learning_rate": 1.4732797625184814e-06, "loss": 0.0091, "step": 8820 }, { "epoch": 0.4156125342715602, "grad_norm": 0.06624521315097809, "learning_rate": 1.4573905142153134e-06, "loss": 0.012, "step": 8830 }, { "epoch": 0.41608321664332865, "grad_norm": 0.13785380125045776, "learning_rate": 1.44158067647285e-06, "loss": 0.0118, "step": 8840 }, { "epoch": 0.41655389901509715, "grad_norm": 0.20359711349010468, "learning_rate": 1.4258503962565096e-06, "loss": 0.0123, "step": 8850 }, { "epoch": 0.4170245813868656, "grad_norm": 0.1141083762049675, "learning_rate": 1.4101998197921352e-06, "loss": 0.013, "step": 8860 }, { "epoch": 0.4174952637586341, "grad_norm": 0.12032856047153473, "learning_rate": 1.3946290925646788e-06, "loss": 0.0096, "step": 8870 }, { "epoch": 0.41796594613040255, "grad_norm": 0.14139364659786224, "learning_rate": 1.379138359316814e-06, "loss": 0.0109, "step": 8880 }, { "epoch": 0.418436628502171, "grad_norm": 0.10791698098182678, "learning_rate": 1.363727764047612e-06, "loss": 0.01, "step": 8890 }, { "epoch": 0.4189073108739395, "grad_norm": 0.13486641645431519, "learning_rate": 1.3483974500111907e-06, "loss": 0.009, "step": 8900 }, { "epoch": 0.41937799324570796, "grad_norm": 0.12761466205120087, "learning_rate": 1.3331475597153988e-06, "loss": 0.0083, "step": 8910 }, { "epoch": 0.41984867561747646, "grad_norm": 0.12095949798822403, "learning_rate": 1.3179782349204618e-06, "loss": 0.0107, "step": 8920 }, { "epoch": 0.4203193579892449, "grad_norm": 0.09582992643117905, "learning_rate": 1.3028896166377003e-06, "loss": 0.0098, "step": 8930 }, { "epoch": 0.42079004036101336, "grad_norm": 0.10233448445796967, "learning_rate": 1.2878818451281939e-06, "loss": 0.0086, "step": 8940 }, { "epoch": 0.42126072273278187, "grad_norm": 0.19810214638710022, "learning_rate": 1.2729550599014862e-06, "loss": 0.0086, "step": 8950 }, { "epoch": 0.4217314051045503, "grad_norm": 0.10168084502220154, "learning_rate": 1.2581093997142846e-06, "loss": 0.0092, "step": 8960 }, { "epoch": 0.42220208747631877, "grad_norm": 0.1017376258969307, "learning_rate": 1.2433450025691807e-06, "loss": 0.0095, "step": 8970 }, { "epoch": 0.42267276984808727, "grad_norm": 0.12851504981517792, "learning_rate": 1.2286620057133459e-06, "loss": 0.01, "step": 8980 }, { "epoch": 0.4231434522198557, "grad_norm": 0.08634050190448761, "learning_rate": 1.2140605456372856e-06, "loss": 0.0087, "step": 8990 }, { "epoch": 0.4236141345916242, "grad_norm": 0.15187886357307434, "learning_rate": 1.1995407580735364e-06, "loss": 0.0096, "step": 9000 }, { "epoch": 0.4240848169633927, "grad_norm": 0.09797359257936478, "learning_rate": 1.1851027779954373e-06, "loss": 0.0111, "step": 9010 }, { "epoch": 0.4245554993351611, "grad_norm": 0.16339021921157837, "learning_rate": 1.1707467396158524e-06, "loss": 0.0111, "step": 9020 }, { "epoch": 0.42502618170692963, "grad_norm": 0.08181533217430115, "learning_rate": 1.1564727763859306e-06, "loss": 0.0077, "step": 9030 }, { "epoch": 0.4254968640786981, "grad_norm": 0.15458889305591583, "learning_rate": 1.1422810209938627e-06, "loss": 0.0119, "step": 9040 }, { "epoch": 0.4259675464504666, "grad_norm": 0.13464538753032684, "learning_rate": 1.1281716053636616e-06, "loss": 0.0108, "step": 9050 }, { "epoch": 0.42643822882223503, "grad_norm": 0.10789409279823303, "learning_rate": 1.1141446606539063e-06, "loss": 0.009, "step": 9060 }, { "epoch": 0.4269089111940035, "grad_norm": 0.09231771528720856, "learning_rate": 1.1002003172565579e-06, "loss": 0.0102, "step": 9070 }, { "epoch": 0.427379593565772, "grad_norm": 0.13611853122711182, "learning_rate": 1.086338704795722e-06, "loss": 0.009, "step": 9080 }, { "epoch": 0.42785027593754044, "grad_norm": 0.12488257139921188, "learning_rate": 1.0725599521264518e-06, "loss": 0.011, "step": 9090 }, { "epoch": 0.42832095830930894, "grad_norm": 0.08606278896331787, "learning_rate": 1.0588641873335558e-06, "loss": 0.0083, "step": 9100 }, { "epoch": 0.4287916406810774, "grad_norm": 0.12615059316158295, "learning_rate": 1.0452515377303974e-06, "loss": 0.0102, "step": 9110 }, { "epoch": 0.42926232305284584, "grad_norm": 0.10175028443336487, "learning_rate": 1.0317221298577163e-06, "loss": 0.0083, "step": 9120 }, { "epoch": 0.42973300542461434, "grad_norm": 0.10084985196590424, "learning_rate": 1.0182760894824607e-06, "loss": 0.0091, "step": 9130 }, { "epoch": 0.4302036877963828, "grad_norm": 0.1031237542629242, "learning_rate": 1.0049135415965926e-06, "loss": 0.0099, "step": 9140 }, { "epoch": 0.4306743701681513, "grad_norm": 0.14219872653484344, "learning_rate": 9.916346104159602e-07, "loss": 0.0096, "step": 9150 }, { "epoch": 0.43114505253991975, "grad_norm": 0.10666804015636444, "learning_rate": 9.784394193791169e-07, "loss": 0.0078, "step": 9160 }, { "epoch": 0.4316157349116882, "grad_norm": 0.14867661893367767, "learning_rate": 9.653280911461837e-07, "loss": 0.0088, "step": 9170 }, { "epoch": 0.4320864172834567, "grad_norm": 0.12209910899400711, "learning_rate": 9.523007475977064e-07, "loss": 0.008, "step": 9180 }, { "epoch": 0.43255709965522515, "grad_norm": 0.12639661133289337, "learning_rate": 9.393575098335339e-07, "loss": 0.0106, "step": 9190 }, { "epoch": 0.43302778202699366, "grad_norm": 0.14436563849449158, "learning_rate": 9.264984981716663e-07, "loss": 0.0114, "step": 9200 }, { "epoch": 0.4334984643987621, "grad_norm": 0.12432365119457245, "learning_rate": 9.137238321471675e-07, "loss": 0.0125, "step": 9210 }, { "epoch": 0.43396914677053056, "grad_norm": 0.0940658375620842, "learning_rate": 9.010336305110345e-07, "loss": 0.0087, "step": 9220 }, { "epoch": 0.43443982914229906, "grad_norm": 0.12311194092035294, "learning_rate": 8.884280112290977e-07, "loss": 0.0097, "step": 9230 }, { "epoch": 0.4349105115140675, "grad_norm": 0.10468921810388565, "learning_rate": 8.759070914809253e-07, "loss": 0.0085, "step": 9240 }, { "epoch": 0.435381193885836, "grad_norm": 0.10736406594514847, "learning_rate": 8.634709876587344e-07, "loss": 0.0091, "step": 9250 }, { "epoch": 0.43585187625760446, "grad_norm": 0.11935314536094666, "learning_rate": 8.511198153663069e-07, "loss": 0.0104, "step": 9260 }, { "epoch": 0.4363225586293729, "grad_norm": 0.09716708958148956, "learning_rate": 8.388536894179234e-07, "loss": 0.0066, "step": 9270 }, { "epoch": 0.4367932410011414, "grad_norm": 0.13693177700042725, "learning_rate": 8.266727238372763e-07, "loss": 0.0093, "step": 9280 }, { "epoch": 0.43726392337290987, "grad_norm": 0.1553613394498825, "learning_rate": 8.145770318564361e-07, "loss": 0.0092, "step": 9290 }, { "epoch": 0.4377346057446784, "grad_norm": 0.14881853759288788, "learning_rate": 8.025667259147773e-07, "loss": 0.01, "step": 9300 }, { "epoch": 0.4382052881164468, "grad_norm": 0.12403322756290436, "learning_rate": 7.906419176579416e-07, "loss": 0.0091, "step": 9310 }, { "epoch": 0.43867597048821527, "grad_norm": 0.13110636174678802, "learning_rate": 7.788027179367997e-07, "loss": 0.0091, "step": 9320 }, { "epoch": 0.4391466528599838, "grad_norm": 0.08518616855144501, "learning_rate": 7.670492368064275e-07, "loss": 0.0101, "step": 9330 }, { "epoch": 0.4396173352317522, "grad_norm": 0.10521264374256134, "learning_rate": 7.553815835250644e-07, "loss": 0.0127, "step": 9340 }, { "epoch": 0.44008801760352073, "grad_norm": 0.10062461346387863, "learning_rate": 7.437998665531221e-07, "loss": 0.0107, "step": 9350 }, { "epoch": 0.4405586999752892, "grad_norm": 0.13433079421520233, "learning_rate": 7.323041935521502e-07, "loss": 0.0105, "step": 9360 }, { "epoch": 0.44102938234705763, "grad_norm": 0.1287975013256073, "learning_rate": 7.208946713838638e-07, "loss": 0.0091, "step": 9370 }, { "epoch": 0.44150006471882614, "grad_norm": 0.12621824443340302, "learning_rate": 7.095714061091241e-07, "loss": 0.0092, "step": 9380 }, { "epoch": 0.4419707470905946, "grad_norm": 0.11793633550405502, "learning_rate": 6.983345029869681e-07, "loss": 0.0108, "step": 9390 }, { "epoch": 0.44244142946236303, "grad_norm": 0.11052042990922928, "learning_rate": 6.871840664736251e-07, "loss": 0.0092, "step": 9400 }, { "epoch": 0.44291211183413154, "grad_norm": 0.11183434724807739, "learning_rate": 6.761202002215506e-07, "loss": 0.0097, "step": 9410 }, { "epoch": 0.4433827942059, "grad_norm": 0.12875571846961975, "learning_rate": 6.65143007078447e-07, "loss": 0.009, "step": 9420 }, { "epoch": 0.4438534765776685, "grad_norm": 0.1803130954504013, "learning_rate": 6.542525890863338e-07, "loss": 0.01, "step": 9430 }, { "epoch": 0.44432415894943694, "grad_norm": 0.11501237750053406, "learning_rate": 6.434490474805743e-07, "loss": 0.009, "step": 9440 }, { "epoch": 0.4447948413212054, "grad_norm": 0.12859542667865753, "learning_rate": 6.327324826889469e-07, "loss": 0.0118, "step": 9450 }, { "epoch": 0.4452655236929739, "grad_norm": 0.1547563374042511, "learning_rate": 6.221029943307099e-07, "loss": 0.0095, "step": 9460 }, { "epoch": 0.44573620606474235, "grad_norm": 0.08650557696819305, "learning_rate": 6.115606812156749e-07, "loss": 0.0102, "step": 9470 }, { "epoch": 0.44620688843651085, "grad_norm": 0.10128959268331528, "learning_rate": 6.01105641343287e-07, "loss": 0.0099, "step": 9480 }, { "epoch": 0.4466775708082793, "grad_norm": 0.12898880243301392, "learning_rate": 5.907379719017181e-07, "loss": 0.0092, "step": 9490 }, { "epoch": 0.44714825318004775, "grad_norm": 0.1226678267121315, "learning_rate": 5.804577692669533e-07, "loss": 0.0118, "step": 9500 }, { "epoch": 0.44761893555181625, "grad_norm": 0.11136288195848465, "learning_rate": 5.702651290019112e-07, "loss": 0.01, "step": 9510 }, { "epoch": 0.4480896179235847, "grad_norm": 0.12227800488471985, "learning_rate": 5.601601458555406e-07, "loss": 0.009, "step": 9520 }, { "epoch": 0.4485603002953532, "grad_norm": 0.09158363193273544, "learning_rate": 5.501429137619452e-07, "loss": 0.0083, "step": 9530 }, { "epoch": 0.44903098266712166, "grad_norm": 0.10648646950721741, "learning_rate": 5.402135258395114e-07, "loss": 0.0075, "step": 9540 }, { "epoch": 0.4495016650388901, "grad_norm": 0.1419914960861206, "learning_rate": 5.303720743900475e-07, "loss": 0.0076, "step": 9550 }, { "epoch": 0.4499723474106586, "grad_norm": 0.09056587517261505, "learning_rate": 5.206186508979083e-07, "loss": 0.0076, "step": 9560 }, { "epoch": 0.45044302978242706, "grad_norm": 0.10235956311225891, "learning_rate": 5.109533460291694e-07, "loss": 0.0118, "step": 9570 }, { "epoch": 0.45091371215419557, "grad_norm": 0.0866006463766098, "learning_rate": 5.01376249630764e-07, "loss": 0.008, "step": 9580 }, { "epoch": 0.451384394525964, "grad_norm": 0.11908163875341415, "learning_rate": 4.918874507296578e-07, "loss": 0.0089, "step": 9590 }, { "epoch": 0.45185507689773247, "grad_norm": 0.13241758942604065, "learning_rate": 4.824870375320156e-07, "loss": 0.0096, "step": 9600 }, { "epoch": 0.45232575926950097, "grad_norm": 0.15662206709384918, "learning_rate": 4.731750974223892e-07, "loss": 0.0103, "step": 9610 }, { "epoch": 0.4527964416412694, "grad_norm": 0.09037859737873077, "learning_rate": 4.639517169628971e-07, "loss": 0.0094, "step": 9620 }, { "epoch": 0.4532671240130379, "grad_norm": 0.1385626494884491, "learning_rate": 4.548169818924275e-07, "loss": 0.0099, "step": 9630 }, { "epoch": 0.4537378063848064, "grad_norm": 0.12310747057199478, "learning_rate": 4.4577097712582897e-07, "loss": 0.0076, "step": 9640 }, { "epoch": 0.4542084887565748, "grad_norm": 0.2014337033033371, "learning_rate": 4.3681378675313747e-07, "loss": 0.0092, "step": 9650 }, { "epoch": 0.45467917112834333, "grad_norm": 0.11354377120733261, "learning_rate": 4.279454940387828e-07, "loss": 0.0077, "step": 9660 }, { "epoch": 0.4551498535001118, "grad_norm": 0.12230443209409714, "learning_rate": 4.191661814208181e-07, "loss": 0.0118, "step": 9670 }, { "epoch": 0.4556205358718803, "grad_norm": 0.0864444151520729, "learning_rate": 4.1047593051015245e-07, "loss": 0.01, "step": 9680 }, { "epoch": 0.45609121824364873, "grad_norm": 0.12819884717464447, "learning_rate": 4.018748220897994e-07, "loss": 0.0088, "step": 9690 }, { "epoch": 0.4565619006154172, "grad_norm": 0.07518994063138962, "learning_rate": 3.933629361141078e-07, "loss": 0.0095, "step": 9700 }, { "epoch": 0.4570325829871857, "grad_norm": 0.13385291397571564, "learning_rate": 3.849403517080452e-07, "loss": 0.0103, "step": 9710 }, { "epoch": 0.45750326535895414, "grad_norm": 0.10270257294178009, "learning_rate": 3.7660714716643563e-07, "loss": 0.0093, "step": 9720 }, { "epoch": 0.45797394773072264, "grad_norm": 0.12753267586231232, "learning_rate": 3.683633999532521e-07, "loss": 0.0109, "step": 9730 }, { "epoch": 0.4584446301024911, "grad_norm": 0.0898975059390068, "learning_rate": 3.60209186700885e-07, "loss": 0.0082, "step": 9740 }, { "epoch": 0.45891531247425954, "grad_norm": 0.08164086192846298, "learning_rate": 3.521445832094328e-07, "loss": 0.0097, "step": 9750 }, { "epoch": 0.45938599484602805, "grad_norm": 0.11694973707199097, "learning_rate": 3.441696644459969e-07, "loss": 0.0096, "step": 9760 }, { "epoch": 0.4598566772177965, "grad_norm": 0.1071474552154541, "learning_rate": 3.362845045439911e-07, "loss": 0.0093, "step": 9770 }, { "epoch": 0.460327359589565, "grad_norm": 0.09316183626651764, "learning_rate": 3.284891768024401e-07, "loss": 0.0112, "step": 9780 }, { "epoch": 0.46079804196133345, "grad_norm": 0.09821862727403641, "learning_rate": 3.207837536853087e-07, "loss": 0.0102, "step": 9790 }, { "epoch": 0.4612687243331019, "grad_norm": 0.10373802483081818, "learning_rate": 3.131683068208247e-07, "loss": 0.008, "step": 9800 }, { "epoch": 0.4617394067048704, "grad_norm": 0.12380393594503403, "learning_rate": 3.0564290700081044e-07, "loss": 0.0091, "step": 9810 }, { "epoch": 0.46221008907663885, "grad_norm": 0.1361027956008911, "learning_rate": 2.9820762418002916e-07, "loss": 0.0098, "step": 9820 }, { "epoch": 0.4626807714484073, "grad_norm": 0.10926921665668488, "learning_rate": 2.908625274755339e-07, "loss": 0.0111, "step": 9830 }, { "epoch": 0.4631514538201758, "grad_norm": 0.07226519286632538, "learning_rate": 2.8360768516601745e-07, "loss": 0.0072, "step": 9840 }, { "epoch": 0.46362213619194426, "grad_norm": 0.1176331490278244, "learning_rate": 2.764431646911947e-07, "loss": 0.0094, "step": 9850 }, { "epoch": 0.46409281856371276, "grad_norm": 0.14391915500164032, "learning_rate": 2.693690326511533e-07, "loss": 0.0099, "step": 9860 }, { "epoch": 0.4645635009354812, "grad_norm": 0.10774114727973938, "learning_rate": 2.6238535480575533e-07, "loss": 0.0088, "step": 9870 }, { "epoch": 0.46503418330724966, "grad_norm": 0.0891522765159607, "learning_rate": 2.55492196074012e-07, "loss": 0.0092, "step": 9880 }, { "epoch": 0.46550486567901816, "grad_norm": 0.15805207192897797, "learning_rate": 2.4868962053348764e-07, "loss": 0.0099, "step": 9890 }, { "epoch": 0.4659755480507866, "grad_norm": 0.15476396679878235, "learning_rate": 2.419776914196981e-07, "loss": 0.0103, "step": 9900 }, { "epoch": 0.4664462304225551, "grad_norm": 0.13449379801750183, "learning_rate": 2.3535647112553295e-07, "loss": 0.0121, "step": 9910 }, { "epoch": 0.46691691279432357, "grad_norm": 0.053645629435777664, "learning_rate": 2.288260212006599e-07, "loss": 0.0094, "step": 9920 }, { "epoch": 0.467387595166092, "grad_norm": 0.12548840045928955, "learning_rate": 2.2238640235097032e-07, "loss": 0.0103, "step": 9930 }, { "epoch": 0.4678582775378605, "grad_norm": 0.10619097948074341, "learning_rate": 2.1603767443799994e-07, "loss": 0.0083, "step": 9940 }, { "epoch": 0.468328959909629, "grad_norm": 0.12959586083889008, "learning_rate": 2.097798964783826e-07, "loss": 0.0109, "step": 9950 }, { "epoch": 0.4687996422813975, "grad_norm": 0.1125236302614212, "learning_rate": 2.0361312664329502e-07, "loss": 0.0086, "step": 9960 }, { "epoch": 0.4692703246531659, "grad_norm": 0.10379857569932938, "learning_rate": 1.975374222579207e-07, "loss": 0.011, "step": 9970 }, { "epoch": 0.4697410070249344, "grad_norm": 0.1815749704837799, "learning_rate": 1.9155283980091366e-07, "loss": 0.0096, "step": 9980 }, { "epoch": 0.4702116893967029, "grad_norm": 0.07143761217594147, "learning_rate": 1.8565943490387761e-07, "loss": 0.0098, "step": 9990 }, { "epoch": 0.47068237176847133, "grad_norm": 0.07605130225419998, "learning_rate": 1.79857262350841e-07, "loss": 0.0081, "step": 10000 }, { "epoch": 0.47115305414023984, "grad_norm": 0.12292288988828659, "learning_rate": 1.741463760777584e-07, "loss": 0.0087, "step": 10010 }, { "epoch": 0.4716237365120083, "grad_norm": 0.1324399709701538, "learning_rate": 1.685268291719999e-07, "loss": 0.0086, "step": 10020 }, { "epoch": 0.47209441888377673, "grad_norm": 0.1211579442024231, "learning_rate": 1.6299867387186363e-07, "loss": 0.0118, "step": 10030 }, { "epoch": 0.47256510125554524, "grad_norm": 0.06875480711460114, "learning_rate": 1.5756196156608393e-07, "loss": 0.0107, "step": 10040 }, { "epoch": 0.4730357836273137, "grad_norm": 0.062625452876091, "learning_rate": 1.5221674279336408e-07, "loss": 0.0089, "step": 10050 }, { "epoch": 0.4735064659990822, "grad_norm": 0.1908957064151764, "learning_rate": 1.4696306724189312e-07, "loss": 0.0128, "step": 10060 }, { "epoch": 0.47397714837085064, "grad_norm": 0.12590709328651428, "learning_rate": 1.4180098374889429e-07, "loss": 0.0109, "step": 10070 }, { "epoch": 0.4744478307426191, "grad_norm": 0.14626096189022064, "learning_rate": 1.367305403001673e-07, "loss": 0.0094, "step": 10080 }, { "epoch": 0.4749185131143876, "grad_norm": 0.23703539371490479, "learning_rate": 1.3175178402964116e-07, "loss": 0.0103, "step": 10090 }, { "epoch": 0.47538919548615605, "grad_norm": 0.1613423228263855, "learning_rate": 1.2686476121894e-07, "loss": 0.0104, "step": 10100 }, { "epoch": 0.47585987785792455, "grad_norm": 0.10613039880990982, "learning_rate": 1.220695172969477e-07, "loss": 0.0117, "step": 10110 }, { "epoch": 0.476330560229693, "grad_norm": 0.10097139328718185, "learning_rate": 1.1736609683938749e-07, "loss": 0.0082, "step": 10120 }, { "epoch": 0.47680124260146145, "grad_norm": 0.10395904630422592, "learning_rate": 1.1275454356841298e-07, "loss": 0.01, "step": 10130 }, { "epoch": 0.47727192497322996, "grad_norm": 0.1282517910003662, "learning_rate": 1.0823490035218986e-07, "loss": 0.01, "step": 10140 }, { "epoch": 0.4777426073449984, "grad_norm": 0.1378805786371231, "learning_rate": 1.038072092045117e-07, "loss": 0.0078, "step": 10150 }, { "epoch": 0.4782132897167669, "grad_norm": 0.10759636759757996, "learning_rate": 9.947151128439692e-08, "loss": 0.0106, "step": 10160 }, { "epoch": 0.47868397208853536, "grad_norm": 0.13750086724758148, "learning_rate": 9.52278468957124e-08, "loss": 0.0117, "step": 10170 }, { "epoch": 0.4791546544603038, "grad_norm": 0.09412442147731781, "learning_rate": 9.107625548679944e-08, "loss": 0.0075, "step": 10180 }, { "epoch": 0.4796253368320723, "grad_norm": 0.12823505699634552, "learning_rate": 8.701677565010725e-08, "loss": 0.0092, "step": 10190 }, { "epoch": 0.48009601920384076, "grad_norm": 0.1552668958902359, "learning_rate": 8.304944512182666e-08, "loss": 0.0079, "step": 10200 }, { "epoch": 0.4805667015756092, "grad_norm": 0.10657220333814621, "learning_rate": 7.91743007815493e-08, "loss": 0.0106, "step": 10210 }, { "epoch": 0.4810373839473777, "grad_norm": 0.12126398086547852, "learning_rate": 7.539137865192003e-08, "loss": 0.009, "step": 10220 }, { "epoch": 0.48150806631914617, "grad_norm": 0.13223843276500702, "learning_rate": 7.17007138983028e-08, "loss": 0.0099, "step": 10230 }, { "epoch": 0.48197874869091467, "grad_norm": 0.11215262115001678, "learning_rate": 6.810234082845313e-08, "loss": 0.0086, "step": 10240 }, { "epoch": 0.4824494310626831, "grad_norm": 0.1248275563120842, "learning_rate": 6.459629289219838e-08, "loss": 0.0102, "step": 10250 }, { "epoch": 0.48292011343445157, "grad_norm": 0.09382683038711548, "learning_rate": 6.118260268112908e-08, "loss": 0.0103, "step": 10260 }, { "epoch": 0.4833907958062201, "grad_norm": 0.11404488980770111, "learning_rate": 5.786130192829809e-08, "loss": 0.0094, "step": 10270 }, { "epoch": 0.4838614781779885, "grad_norm": 0.12813036143779755, "learning_rate": 5.4632421507916366e-08, "loss": 0.0094, "step": 10280 }, { "epoch": 0.48433216054975703, "grad_norm": 0.14295053482055664, "learning_rate": 5.1495991435076555e-08, "loss": 0.0092, "step": 10290 }, { "epoch": 0.4848028429215255, "grad_norm": 0.1135953962802887, "learning_rate": 4.845204086546984e-08, "loss": 0.0075, "step": 10300 }, { "epoch": 0.48527352529329393, "grad_norm": 0.16795606911182404, "learning_rate": 4.5500598095110645e-08, "loss": 0.0107, "step": 10310 }, { "epoch": 0.48574420766506243, "grad_norm": 0.11554685980081558, "learning_rate": 4.264169056008016e-08, "loss": 0.0101, "step": 10320 }, { "epoch": 0.4862148900368309, "grad_norm": 0.09621243178844452, "learning_rate": 3.987534483626987e-08, "loss": 0.0083, "step": 10330 }, { "epoch": 0.4866855724085994, "grad_norm": 0.12303745001554489, "learning_rate": 3.720158663913065e-08, "loss": 0.0101, "step": 10340 }, { "epoch": 0.48715625478036784, "grad_norm": 0.10024432837963104, "learning_rate": 3.4620440823438517e-08, "loss": 0.0104, "step": 10350 }, { "epoch": 0.4876269371521363, "grad_norm": 0.1421239823102951, "learning_rate": 3.2131931383059256e-08, "loss": 0.0098, "step": 10360 }, { "epoch": 0.4880976195239048, "grad_norm": 0.15799741446971893, "learning_rate": 2.9736081450730813e-08, "loss": 0.0108, "step": 10370 }, { "epoch": 0.48856830189567324, "grad_norm": 0.0686628594994545, "learning_rate": 2.7432913297841256e-08, "loss": 0.0091, "step": 10380 }, { "epoch": 0.48903898426744175, "grad_norm": 0.09906376898288727, "learning_rate": 2.5222448334227822e-08, "loss": 0.0135, "step": 10390 }, { "epoch": 0.4895096666392102, "grad_norm": 0.12504538893699646, "learning_rate": 2.3104707107974857e-08, "loss": 0.008, "step": 10400 }, { "epoch": 0.48998034901097864, "grad_norm": 0.16366298496723175, "learning_rate": 2.1079709305226183e-08, "loss": 0.0083, "step": 10410 }, { "epoch": 0.49045103138274715, "grad_norm": 0.10536178946495056, "learning_rate": 1.914747374999304e-08, "loss": 0.0107, "step": 10420 }, { "epoch": 0.4909217137545156, "grad_norm": 0.12254618853330612, "learning_rate": 1.7308018403991988e-08, "loss": 0.0081, "step": 10430 }, { "epoch": 0.4913923961262841, "grad_norm": 0.13702034950256348, "learning_rate": 1.556136036646838e-08, "loss": 0.0077, "step": 10440 }, { "epoch": 0.49186307849805255, "grad_norm": 0.1288021057844162, "learning_rate": 1.3907515874042044e-08, "loss": 0.0075, "step": 10450 }, { "epoch": 0.492333760869821, "grad_norm": 0.10662814974784851, "learning_rate": 1.2346500300551844e-08, "loss": 0.0087, "step": 10460 }, { "epoch": 0.4928044432415895, "grad_norm": 0.12214840948581696, "learning_rate": 1.0878328156919139e-08, "loss": 0.0104, "step": 10470 }, { "epoch": 0.49327512561335796, "grad_norm": 0.15854117274284363, "learning_rate": 9.503013091006763e-09, "loss": 0.0118, "step": 10480 }, { "epoch": 0.49374580798512646, "grad_norm": 0.09191913902759552, "learning_rate": 8.220567887498033e-09, "loss": 0.011, "step": 10490 }, { "epoch": 0.4942164903568949, "grad_norm": 0.11399047076702118, "learning_rate": 7.031004467771274e-09, "loss": 0.0089, "step": 10500 }, { "epoch": 0.49468717272866336, "grad_norm": 0.10139255970716476, "learning_rate": 5.934333889794364e-09, "loss": 0.0091, "step": 10510 }, { "epoch": 0.49515785510043187, "grad_norm": 0.12150579690933228, "learning_rate": 4.93056634801925e-09, "loss": 0.0082, "step": 10520 }, { "epoch": 0.4956285374722003, "grad_norm": 0.13589785993099213, "learning_rate": 4.019711173289809e-09, "loss": 0.0095, "step": 10530 }, { "epoch": 0.4960992198439688, "grad_norm": 0.19519634544849396, "learning_rate": 3.201776832749692e-09, "loss": 0.0113, "step": 10540 }, { "epoch": 0.49656990221573727, "grad_norm": 0.11004017293453217, "learning_rate": 2.4767709297712772e-09, "loss": 0.01, "step": 10550 }, { "epoch": 0.4970405845875057, "grad_norm": 0.11647970974445343, "learning_rate": 1.8447002038779471e-09, "loss": 0.0106, "step": 10560 }, { "epoch": 0.4975112669592742, "grad_norm": 0.08774129301309586, "learning_rate": 1.305570530686362e-09, "loss": 0.0082, "step": 10570 }, { "epoch": 0.4979819493310427, "grad_norm": 0.13334845006465912, "learning_rate": 8.593869218487261e-10, "loss": 0.0128, "step": 10580 }, { "epoch": 0.4984526317028112, "grad_norm": 0.10003668814897537, "learning_rate": 5.061535250061589e-10, "loss": 0.0095, "step": 10590 }, { "epoch": 0.4989233140745796, "grad_norm": 0.09516848623752594, "learning_rate": 2.45873623754278e-10, "loss": 0.0081, "step": 10600 }, { "epoch": 0.4993939964463481, "grad_norm": 0.08035647869110107, "learning_rate": 7.85496376076722e-11, "loss": 0.0081, "step": 10610 }, { "epoch": 0.4998646788181166, "grad_norm": 0.07603952288627625, "learning_rate": 4.1831219799171044e-12, "loss": 0.0094, "step": 10620 }, { "epoch": 0.5000058835296471, "step": 10623, "total_flos": 5.473467596718211e+18, "train_loss": 0.01674340627431746, "train_runtime": 68174.0536, "train_samples_per_second": 1.247, "train_steps_per_second": 0.156 } ], "logging_steps": 10, "max_steps": 10623, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.473467596718211e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }