{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 721, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013879250520471894, "grad_norm": 0.2488352358341217, "learning_rate": 0.0, "loss": 0.0507, "step": 1 }, { "epoch": 0.002775850104094379, "grad_norm": 0.2802581489086151, "learning_rate": 4.5454545454545457e-07, "loss": 0.0448, "step": 2 }, { "epoch": 0.004163775156141568, "grad_norm": 0.20850145816802979, "learning_rate": 9.090909090909091e-07, "loss": 0.0382, "step": 3 }, { "epoch": 0.005551700208188758, "grad_norm": 0.24074043333530426, "learning_rate": 1.3636363636363636e-06, "loss": 0.0455, "step": 4 }, { "epoch": 0.006939625260235947, "grad_norm": 0.33691975474357605, "learning_rate": 1.8181818181818183e-06, "loss": 0.0663, "step": 5 }, { "epoch": 0.008327550312283136, "grad_norm": 0.24708178639411926, "learning_rate": 2.2727272727272728e-06, "loss": 0.0407, "step": 6 }, { "epoch": 0.009715475364330326, "grad_norm": 0.23655228316783905, "learning_rate": 2.7272727272727272e-06, "loss": 0.0348, "step": 7 }, { "epoch": 0.011103400416377515, "grad_norm": 0.2452535629272461, "learning_rate": 3.181818181818182e-06, "loss": 0.0507, "step": 8 }, { "epoch": 0.012491325468424705, "grad_norm": 0.31181129813194275, "learning_rate": 3.6363636363636366e-06, "loss": 0.0474, "step": 9 }, { "epoch": 0.013879250520471894, "grad_norm": 0.256910115480423, "learning_rate": 4.0909090909090915e-06, "loss": 0.0445, "step": 10 }, { "epoch": 0.015267175572519083, "grad_norm": 0.30851638317108154, "learning_rate": 4.5454545454545455e-06, "loss": 0.0472, "step": 11 }, { "epoch": 0.016655100624566273, "grad_norm": 0.31822746992111206, "learning_rate": 5e-06, "loss": 0.0611, "step": 12 }, { "epoch": 0.018043025676613464, "grad_norm": 0.25003501772880554, "learning_rate": 5.4545454545454545e-06, "loss": 0.0393, "step": 13 }, { "epoch": 0.01943095072866065, "grad_norm": 0.2273981273174286, "learning_rate": 5.90909090909091e-06, "loss": 0.0328, "step": 14 }, { "epoch": 0.020818875780707843, "grad_norm": 0.2772292196750641, "learning_rate": 6.363636363636364e-06, "loss": 0.0349, "step": 15 }, { "epoch": 0.02220680083275503, "grad_norm": 0.36983752250671387, "learning_rate": 6.818181818181818e-06, "loss": 0.0476, "step": 16 }, { "epoch": 0.02359472588480222, "grad_norm": 0.27299028635025024, "learning_rate": 7.272727272727273e-06, "loss": 0.04, "step": 17 }, { "epoch": 0.02498265093684941, "grad_norm": 0.27781346440315247, "learning_rate": 7.727272727272727e-06, "loss": 0.0356, "step": 18 }, { "epoch": 0.0263705759888966, "grad_norm": 0.313568651676178, "learning_rate": 8.181818181818183e-06, "loss": 0.0379, "step": 19 }, { "epoch": 0.027758501040943788, "grad_norm": 0.2946470081806183, "learning_rate": 8.636363636363637e-06, "loss": 0.0388, "step": 20 }, { "epoch": 0.02914642609299098, "grad_norm": 0.40478628873825073, "learning_rate": 9.090909090909091e-06, "loss": 0.036, "step": 21 }, { "epoch": 0.030534351145038167, "grad_norm": 0.26556551456451416, "learning_rate": 9.545454545454547e-06, "loss": 0.0483, "step": 22 }, { "epoch": 0.031922276197085354, "grad_norm": 0.24696266651153564, "learning_rate": 1e-05, "loss": 0.0408, "step": 23 }, { "epoch": 0.033310201249132546, "grad_norm": 0.26831310987472534, "learning_rate": 9.999949500779842e-06, "loss": 0.0408, "step": 24 }, { "epoch": 0.03469812630117974, "grad_norm": 0.27735695242881775, "learning_rate": 9.999798004139435e-06, "loss": 0.054, "step": 25 }, { "epoch": 0.03608605135322693, "grad_norm": 0.32683494687080383, "learning_rate": 9.999545513138964e-06, "loss": 0.0455, "step": 26 }, { "epoch": 0.03747397640527411, "grad_norm": 0.2764844596385956, "learning_rate": 9.999192032878667e-06, "loss": 0.0435, "step": 27 }, { "epoch": 0.0388619014573213, "grad_norm": 0.2449122816324234, "learning_rate": 9.998737570498737e-06, "loss": 0.0437, "step": 28 }, { "epoch": 0.040249826509368494, "grad_norm": 0.28722766041755676, "learning_rate": 9.998182135179173e-06, "loss": 0.0438, "step": 29 }, { "epoch": 0.041637751561415685, "grad_norm": 0.3285958468914032, "learning_rate": 9.997525738139595e-06, "loss": 0.0503, "step": 30 }, { "epoch": 0.04302567661346287, "grad_norm": 0.2966720163822174, "learning_rate": 9.996768392639015e-06, "loss": 0.045, "step": 31 }, { "epoch": 0.04441360166551006, "grad_norm": 0.30295976996421814, "learning_rate": 9.99591011397558e-06, "loss": 0.0468, "step": 32 }, { "epoch": 0.04580152671755725, "grad_norm": 0.2856103777885437, "learning_rate": 9.994950919486248e-06, "loss": 0.0467, "step": 33 }, { "epoch": 0.04718945176960444, "grad_norm": 0.3122520446777344, "learning_rate": 9.99389082854645e-06, "loss": 0.0519, "step": 34 }, { "epoch": 0.048577376821651634, "grad_norm": 0.34234896302223206, "learning_rate": 9.992729862569694e-06, "loss": 0.0539, "step": 35 }, { "epoch": 0.04996530187369882, "grad_norm": 0.23981930315494537, "learning_rate": 9.99146804500713e-06, "loss": 0.046, "step": 36 }, { "epoch": 0.05135322692574601, "grad_norm": 0.30006060004234314, "learning_rate": 9.990105401347075e-06, "loss": 0.0563, "step": 37 }, { "epoch": 0.0527411519777932, "grad_norm": 0.23050066828727722, "learning_rate": 9.988641959114512e-06, "loss": 0.0419, "step": 38 }, { "epoch": 0.05412907702984039, "grad_norm": 0.3269716501235962, "learning_rate": 9.987077747870512e-06, "loss": 0.0634, "step": 39 }, { "epoch": 0.055517002081887576, "grad_norm": 0.25331613421440125, "learning_rate": 9.985412799211658e-06, "loss": 0.0446, "step": 40 }, { "epoch": 0.05690492713393477, "grad_norm": 0.22769664227962494, "learning_rate": 9.98364714676939e-06, "loss": 0.0384, "step": 41 }, { "epoch": 0.05829285218598196, "grad_norm": 0.290194034576416, "learning_rate": 9.981780826209342e-06, "loss": 0.0502, "step": 42 }, { "epoch": 0.05968077723802915, "grad_norm": 0.2815046012401581, "learning_rate": 9.979813875230604e-06, "loss": 0.0472, "step": 43 }, { "epoch": 0.061068702290076333, "grad_norm": 0.25529077649116516, "learning_rate": 9.97774633356497e-06, "loss": 0.0551, "step": 44 }, { "epoch": 0.062456627342123525, "grad_norm": 0.24432392418384552, "learning_rate": 9.97557824297614e-06, "loss": 0.0376, "step": 45 }, { "epoch": 0.06384455239417071, "grad_norm": 0.29393213987350464, "learning_rate": 9.97330964725887e-06, "loss": 0.056, "step": 46 }, { "epoch": 0.0652324774462179, "grad_norm": 0.24784711003303528, "learning_rate": 9.970940592238077e-06, "loss": 0.0415, "step": 47 }, { "epoch": 0.06662040249826509, "grad_norm": 0.21179543435573578, "learning_rate": 9.968471125767942e-06, "loss": 0.0415, "step": 48 }, { "epoch": 0.06800832755031229, "grad_norm": 0.23930367827415466, "learning_rate": 9.965901297730914e-06, "loss": 0.0503, "step": 49 }, { "epoch": 0.06939625260235947, "grad_norm": 0.24700821936130524, "learning_rate": 9.963231160036716e-06, "loss": 0.0391, "step": 50 }, { "epoch": 0.07078417765440666, "grad_norm": 0.29365330934524536, "learning_rate": 9.960460766621299e-06, "loss": 0.0496, "step": 51 }, { "epoch": 0.07217210270645386, "grad_norm": 0.2956799566745758, "learning_rate": 9.957590173445746e-06, "loss": 0.0451, "step": 52 }, { "epoch": 0.07356002775850104, "grad_norm": 0.288196861743927, "learning_rate": 9.954619438495142e-06, "loss": 0.0512, "step": 53 }, { "epoch": 0.07494795281054822, "grad_norm": 0.24770434200763702, "learning_rate": 9.951548621777409e-06, "loss": 0.0437, "step": 54 }, { "epoch": 0.07633587786259542, "grad_norm": 0.23926012217998505, "learning_rate": 9.948377785322082e-06, "loss": 0.0325, "step": 55 }, { "epoch": 0.0777238029146426, "grad_norm": 0.25855472683906555, "learning_rate": 9.945106993179074e-06, "loss": 0.0426, "step": 56 }, { "epoch": 0.0791117279666898, "grad_norm": 0.3099120259284973, "learning_rate": 9.941736311417362e-06, "loss": 0.0474, "step": 57 }, { "epoch": 0.08049965301873699, "grad_norm": 0.22296354174613953, "learning_rate": 9.938265808123667e-06, "loss": 0.0377, "step": 58 }, { "epoch": 0.08188757807078417, "grad_norm": 0.2595604658126831, "learning_rate": 9.934695553401076e-06, "loss": 0.0492, "step": 59 }, { "epoch": 0.08327550312283137, "grad_norm": 0.2444402575492859, "learning_rate": 9.931025619367617e-06, "loss": 0.0489, "step": 60 }, { "epoch": 0.08466342817487855, "grad_norm": 0.2848093807697296, "learning_rate": 9.927256080154813e-06, "loss": 0.0601, "step": 61 }, { "epoch": 0.08605135322692574, "grad_norm": 0.24992454051971436, "learning_rate": 9.923387011906183e-06, "loss": 0.0392, "step": 62 }, { "epoch": 0.08743927827897294, "grad_norm": 0.28840649127960205, "learning_rate": 9.919418492775694e-06, "loss": 0.052, "step": 63 }, { "epoch": 0.08882720333102012, "grad_norm": 0.2522892951965332, "learning_rate": 9.915350602926198e-06, "loss": 0.0499, "step": 64 }, { "epoch": 0.09021512838306732, "grad_norm": 0.6776983737945557, "learning_rate": 9.911183424527802e-06, "loss": 0.0491, "step": 65 }, { "epoch": 0.0916030534351145, "grad_norm": 0.2539517283439636, "learning_rate": 9.906917041756208e-06, "loss": 0.0456, "step": 66 }, { "epoch": 0.09299097848716169, "grad_norm": 0.2568037509918213, "learning_rate": 9.902551540791016e-06, "loss": 0.0416, "step": 67 }, { "epoch": 0.09437890353920889, "grad_norm": 0.2917027771472931, "learning_rate": 9.898087009813985e-06, "loss": 0.0559, "step": 68 }, { "epoch": 0.09576682859125607, "grad_norm": 0.20621994137763977, "learning_rate": 9.893523539007248e-06, "loss": 0.0301, "step": 69 }, { "epoch": 0.09715475364330327, "grad_norm": 0.2382073998451233, "learning_rate": 9.888861220551494e-06, "loss": 0.0457, "step": 70 }, { "epoch": 0.09854267869535045, "grad_norm": 0.27149689197540283, "learning_rate": 9.884100148624096e-06, "loss": 0.0507, "step": 71 }, { "epoch": 0.09993060374739764, "grad_norm": 0.30372723937034607, "learning_rate": 9.879240419397227e-06, "loss": 0.0604, "step": 72 }, { "epoch": 0.10131852879944483, "grad_norm": 0.32783612608909607, "learning_rate": 9.874282131035899e-06, "loss": 0.0538, "step": 73 }, { "epoch": 0.10270645385149202, "grad_norm": 0.2645081877708435, "learning_rate": 9.86922538369599e-06, "loss": 0.0486, "step": 74 }, { "epoch": 0.1040943789035392, "grad_norm": 0.26425695419311523, "learning_rate": 9.864070279522222e-06, "loss": 0.0459, "step": 75 }, { "epoch": 0.1054823039555864, "grad_norm": 0.25774726271629333, "learning_rate": 9.858816922646088e-06, "loss": 0.0461, "step": 76 }, { "epoch": 0.10687022900763359, "grad_norm": 0.2583499550819397, "learning_rate": 9.853465419183759e-06, "loss": 0.0389, "step": 77 }, { "epoch": 0.10825815405968078, "grad_norm": 0.28452831506729126, "learning_rate": 9.848015877233935e-06, "loss": 0.0569, "step": 78 }, { "epoch": 0.10964607911172797, "grad_norm": 0.357303261756897, "learning_rate": 9.842468406875665e-06, "loss": 0.0636, "step": 79 }, { "epoch": 0.11103400416377515, "grad_norm": 0.25684916973114014, "learning_rate": 9.836823120166116e-06, "loss": 0.0518, "step": 80 }, { "epoch": 0.11242192921582235, "grad_norm": 0.2970696687698364, "learning_rate": 9.831080131138325e-06, "loss": 0.0517, "step": 81 }, { "epoch": 0.11380985426786953, "grad_norm": 0.3112433850765228, "learning_rate": 9.825239555798875e-06, "loss": 0.0529, "step": 82 }, { "epoch": 0.11519777931991672, "grad_norm": 0.23133604228496552, "learning_rate": 9.819301512125565e-06, "loss": 0.0418, "step": 83 }, { "epoch": 0.11658570437196392, "grad_norm": 0.3827936053276062, "learning_rate": 9.813266120065028e-06, "loss": 0.0558, "step": 84 }, { "epoch": 0.1179736294240111, "grad_norm": 0.4031021296977997, "learning_rate": 9.807133501530297e-06, "loss": 0.0582, "step": 85 }, { "epoch": 0.1193615544760583, "grad_norm": 0.2524511218070984, "learning_rate": 9.800903780398357e-06, "loss": 0.0405, "step": 86 }, { "epoch": 0.12074947952810548, "grad_norm": 0.25606897473335266, "learning_rate": 9.794577082507631e-06, "loss": 0.0517, "step": 87 }, { "epoch": 0.12213740458015267, "grad_norm": 0.24462860822677612, "learning_rate": 9.788153535655442e-06, "loss": 0.0465, "step": 88 }, { "epoch": 0.12352532963219987, "grad_norm": 0.2757568955421448, "learning_rate": 9.781633269595432e-06, "loss": 0.0439, "step": 89 }, { "epoch": 0.12491325468424705, "grad_norm": 0.3216729462146759, "learning_rate": 9.77501641603494e-06, "loss": 0.0493, "step": 90 }, { "epoch": 0.12630117973629423, "grad_norm": 0.3510143458843231, "learning_rate": 9.76830310863235e-06, "loss": 0.0427, "step": 91 }, { "epoch": 0.12768910478834142, "grad_norm": 0.3293008506298065, "learning_rate": 9.761493482994374e-06, "loss": 0.0478, "step": 92 }, { "epoch": 0.12907702984038863, "grad_norm": 0.24690201878547668, "learning_rate": 9.754587676673323e-06, "loss": 0.0428, "step": 93 }, { "epoch": 0.1304649548924358, "grad_norm": 0.2786445915699005, "learning_rate": 9.747585829164332e-06, "loss": 0.0434, "step": 94 }, { "epoch": 0.131852879944483, "grad_norm": 0.2033212035894394, "learning_rate": 9.74048808190254e-06, "loss": 0.0416, "step": 95 }, { "epoch": 0.13324080499653018, "grad_norm": 0.2768020033836365, "learning_rate": 9.733294578260224e-06, "loss": 0.0495, "step": 96 }, { "epoch": 0.13462873004857737, "grad_norm": 0.2745535671710968, "learning_rate": 9.726005463543913e-06, "loss": 0.0464, "step": 97 }, { "epoch": 0.13601665510062458, "grad_norm": 0.2892036736011505, "learning_rate": 9.718620884991455e-06, "loss": 0.0471, "step": 98 }, { "epoch": 0.13740458015267176, "grad_norm": 0.26574069261550903, "learning_rate": 9.711140991769028e-06, "loss": 0.0497, "step": 99 }, { "epoch": 0.13879250520471895, "grad_norm": 0.23448492586612701, "learning_rate": 9.703565934968146e-06, "loss": 0.041, "step": 100 }, { "epoch": 0.14018043025676613, "grad_norm": 0.21899428963661194, "learning_rate": 9.695895867602591e-06, "loss": 0.0425, "step": 101 }, { "epoch": 0.14156835530881332, "grad_norm": 0.30496442317962646, "learning_rate": 9.688130944605332e-06, "loss": 0.048, "step": 102 }, { "epoch": 0.14295628036086053, "grad_norm": 0.23567788302898407, "learning_rate": 9.680271322825392e-06, "loss": 0.0402, "step": 103 }, { "epoch": 0.1443442054129077, "grad_norm": 0.2686285674571991, "learning_rate": 9.672317161024679e-06, "loss": 0.0511, "step": 104 }, { "epoch": 0.1457321304649549, "grad_norm": 0.23937608301639557, "learning_rate": 9.664268619874776e-06, "loss": 0.0444, "step": 105 }, { "epoch": 0.14712005551700208, "grad_norm": 0.2692619264125824, "learning_rate": 9.656125861953711e-06, "loss": 0.0472, "step": 106 }, { "epoch": 0.14850798056904926, "grad_norm": 0.2656213641166687, "learning_rate": 9.647889051742649e-06, "loss": 0.0516, "step": 107 }, { "epoch": 0.14989590562109645, "grad_norm": 0.2606026530265808, "learning_rate": 9.639558355622589e-06, "loss": 0.0456, "step": 108 }, { "epoch": 0.15128383067314366, "grad_norm": 0.23198707401752472, "learning_rate": 9.631133941870993e-06, "loss": 0.05, "step": 109 }, { "epoch": 0.15267175572519084, "grad_norm": 0.25558924674987793, "learning_rate": 9.622615980658391e-06, "loss": 0.0535, "step": 110 }, { "epoch": 0.15405968077723803, "grad_norm": 0.2660232186317444, "learning_rate": 9.614004644044943e-06, "loss": 0.0481, "step": 111 }, { "epoch": 0.1554476058292852, "grad_norm": 0.21960465610027313, "learning_rate": 9.60530010597696e-06, "loss": 0.0424, "step": 112 }, { "epoch": 0.1568355308813324, "grad_norm": 0.26909562945365906, "learning_rate": 9.596502542283399e-06, "loss": 0.0621, "step": 113 }, { "epoch": 0.1582234559333796, "grad_norm": 0.21745258569717407, "learning_rate": 9.587612130672302e-06, "loss": 0.038, "step": 114 }, { "epoch": 0.1596113809854268, "grad_norm": 0.2411315143108368, "learning_rate": 9.578629050727208e-06, "loss": 0.0445, "step": 115 }, { "epoch": 0.16099930603747398, "grad_norm": 0.2715134918689728, "learning_rate": 9.569553483903531e-06, "loss": 0.0526, "step": 116 }, { "epoch": 0.16238723108952116, "grad_norm": 0.23721489310264587, "learning_rate": 9.56038561352489e-06, "loss": 0.0441, "step": 117 }, { "epoch": 0.16377515614156835, "grad_norm": 0.23496368527412415, "learning_rate": 9.551125624779407e-06, "loss": 0.0353, "step": 118 }, { "epoch": 0.16516308119361556, "grad_norm": 0.2572284936904907, "learning_rate": 9.541773704715966e-06, "loss": 0.0494, "step": 119 }, { "epoch": 0.16655100624566274, "grad_norm": 0.27511200308799744, "learning_rate": 9.532330042240434e-06, "loss": 0.0412, "step": 120 }, { "epoch": 0.16793893129770993, "grad_norm": 0.2772343158721924, "learning_rate": 9.522794828111849e-06, "loss": 0.0533, "step": 121 }, { "epoch": 0.1693268563497571, "grad_norm": 0.19398753345012665, "learning_rate": 9.51316825493856e-06, "loss": 0.0358, "step": 122 }, { "epoch": 0.1707147814018043, "grad_norm": 0.2360389232635498, "learning_rate": 9.503450517174344e-06, "loss": 0.0373, "step": 123 }, { "epoch": 0.17210270645385148, "grad_norm": 0.24422723054885864, "learning_rate": 9.493641811114472e-06, "loss": 0.04, "step": 124 }, { "epoch": 0.1734906315058987, "grad_norm": 0.2566449046134949, "learning_rate": 9.483742334891747e-06, "loss": 0.0455, "step": 125 }, { "epoch": 0.17487855655794587, "grad_norm": 0.33392754197120667, "learning_rate": 9.473752288472499e-06, "loss": 0.0656, "step": 126 }, { "epoch": 0.17626648160999306, "grad_norm": 0.23692674934864044, "learning_rate": 9.463671873652551e-06, "loss": 0.0462, "step": 127 }, { "epoch": 0.17765440666204024, "grad_norm": 0.24718192219734192, "learning_rate": 9.453501294053139e-06, "loss": 0.0483, "step": 128 }, { "epoch": 0.17904233171408743, "grad_norm": 0.24895285069942474, "learning_rate": 9.443240755116797e-06, "loss": 0.0392, "step": 129 }, { "epoch": 0.18043025676613464, "grad_norm": 0.23605003952980042, "learning_rate": 9.432890464103208e-06, "loss": 0.043, "step": 130 }, { "epoch": 0.18181818181818182, "grad_norm": 0.2124328911304474, "learning_rate": 9.422450630085026e-06, "loss": 0.0423, "step": 131 }, { "epoch": 0.183206106870229, "grad_norm": 0.29214364290237427, "learning_rate": 9.411921463943641e-06, "loss": 0.0579, "step": 132 }, { "epoch": 0.1845940319222762, "grad_norm": 0.26696211099624634, "learning_rate": 9.401303178364923e-06, "loss": 0.05, "step": 133 }, { "epoch": 0.18598195697432338, "grad_norm": 0.37398970127105713, "learning_rate": 9.39059598783493e-06, "loss": 0.0428, "step": 134 }, { "epoch": 0.1873698820263706, "grad_norm": 0.44276946783065796, "learning_rate": 9.37980010863557e-06, "loss": 0.0713, "step": 135 }, { "epoch": 0.18875780707841777, "grad_norm": 0.21642902493476868, "learning_rate": 9.368915758840235e-06, "loss": 0.0328, "step": 136 }, { "epoch": 0.19014573213046496, "grad_norm": 0.23396173119544983, "learning_rate": 9.357943158309396e-06, "loss": 0.0316, "step": 137 }, { "epoch": 0.19153365718251214, "grad_norm": 0.29896441102027893, "learning_rate": 9.346882528686159e-06, "loss": 0.0547, "step": 138 }, { "epoch": 0.19292158223455932, "grad_norm": 0.30207115411758423, "learning_rate": 9.335734093391797e-06, "loss": 0.0611, "step": 139 }, { "epoch": 0.19430950728660654, "grad_norm": 0.21651242673397064, "learning_rate": 9.32449807762122e-06, "loss": 0.0363, "step": 140 }, { "epoch": 0.19569743233865372, "grad_norm": 0.2455293834209442, "learning_rate": 9.313174708338446e-06, "loss": 0.0378, "step": 141 }, { "epoch": 0.1970853573907009, "grad_norm": 0.27794840931892395, "learning_rate": 9.301764214272e-06, "loss": 0.0472, "step": 142 }, { "epoch": 0.1984732824427481, "grad_norm": 0.4069722294807434, "learning_rate": 9.2902668259103e-06, "loss": 0.0336, "step": 143 }, { "epoch": 0.19986120749479527, "grad_norm": 0.2580963671207428, "learning_rate": 9.278682775497012e-06, "loss": 0.0388, "step": 144 }, { "epoch": 0.20124913254684246, "grad_norm": 0.25055116415023804, "learning_rate": 9.267012297026334e-06, "loss": 0.044, "step": 145 }, { "epoch": 0.20263705759888967, "grad_norm": 0.20974084734916687, "learning_rate": 9.255255626238295e-06, "loss": 0.0403, "step": 146 }, { "epoch": 0.20402498265093685, "grad_norm": 0.25812828540802, "learning_rate": 9.243413000613974e-06, "loss": 0.05, "step": 147 }, { "epoch": 0.20541290770298404, "grad_norm": 0.2810615003108978, "learning_rate": 9.231484659370717e-06, "loss": 0.0475, "step": 148 }, { "epoch": 0.20680083275503122, "grad_norm": 0.3402911126613617, "learning_rate": 9.219470843457294e-06, "loss": 0.0574, "step": 149 }, { "epoch": 0.2081887578070784, "grad_norm": 0.25966858863830566, "learning_rate": 9.207371795549043e-06, "loss": 0.0465, "step": 150 }, { "epoch": 0.20957668285912562, "grad_norm": 0.2727181911468506, "learning_rate": 9.195187760042952e-06, "loss": 0.0472, "step": 151 }, { "epoch": 0.2109646079111728, "grad_norm": 0.27386289834976196, "learning_rate": 9.182918983052743e-06, "loss": 0.0526, "step": 152 }, { "epoch": 0.21235253296322, "grad_norm": 0.2608488202095032, "learning_rate": 9.17056571240388e-06, "loss": 0.0593, "step": 153 }, { "epoch": 0.21374045801526717, "grad_norm": 0.30400654673576355, "learning_rate": 9.158128197628578e-06, "loss": 0.0539, "step": 154 }, { "epoch": 0.21512838306731435, "grad_norm": 0.23927266895771027, "learning_rate": 9.145606689960756e-06, "loss": 0.0465, "step": 155 }, { "epoch": 0.21651630811936157, "grad_norm": 0.24992169439792633, "learning_rate": 9.133001442330964e-06, "loss": 0.0483, "step": 156 }, { "epoch": 0.21790423317140875, "grad_norm": 0.2401110827922821, "learning_rate": 9.120312709361271e-06, "loss": 0.038, "step": 157 }, { "epoch": 0.21929215822345594, "grad_norm": 0.516508936882019, "learning_rate": 9.107540747360124e-06, "loss": 0.0522, "step": 158 }, { "epoch": 0.22068008327550312, "grad_norm": 0.23530790209770203, "learning_rate": 9.094685814317174e-06, "loss": 0.0392, "step": 159 }, { "epoch": 0.2220680083275503, "grad_norm": 0.24144016206264496, "learning_rate": 9.081748169898054e-06, "loss": 0.0426, "step": 160 }, { "epoch": 0.22345593337959752, "grad_norm": 0.2304311841726303, "learning_rate": 9.068728075439153e-06, "loss": 0.0429, "step": 161 }, { "epoch": 0.2248438584316447, "grad_norm": 0.22649236023426056, "learning_rate": 9.055625793942308e-06, "loss": 0.0432, "step": 162 }, { "epoch": 0.22623178348369188, "grad_norm": 0.2122083157300949, "learning_rate": 9.042441590069526e-06, "loss": 0.0423, "step": 163 }, { "epoch": 0.22761970853573907, "grad_norm": 0.19142234325408936, "learning_rate": 9.029175730137611e-06, "loss": 0.0396, "step": 164 }, { "epoch": 0.22900763358778625, "grad_norm": 0.29952624440193176, "learning_rate": 9.015828482112793e-06, "loss": 0.0662, "step": 165 }, { "epoch": 0.23039555863983344, "grad_norm": 0.2410702407360077, "learning_rate": 9.002400115605319e-06, "loss": 0.0439, "step": 166 }, { "epoch": 0.23178348369188065, "grad_norm": 0.2898229956626892, "learning_rate": 8.988890901864006e-06, "loss": 0.0488, "step": 167 }, { "epoch": 0.23317140874392783, "grad_norm": 0.547232449054718, "learning_rate": 8.975301113770756e-06, "loss": 0.0487, "step": 168 }, { "epoch": 0.23455933379597502, "grad_norm": 0.2565780282020569, "learning_rate": 8.96163102583505e-06, "loss": 0.0501, "step": 169 }, { "epoch": 0.2359472588480222, "grad_norm": 0.31040069460868835, "learning_rate": 8.947880914188397e-06, "loss": 0.0602, "step": 170 }, { "epoch": 0.23733518390006939, "grad_norm": 0.2812497913837433, "learning_rate": 8.934051056578768e-06, "loss": 0.0507, "step": 171 }, { "epoch": 0.2387231089521166, "grad_norm": 0.2024613618850708, "learning_rate": 8.920141732364971e-06, "loss": 0.0346, "step": 172 }, { "epoch": 0.24011103400416378, "grad_norm": 0.21827936172485352, "learning_rate": 8.906153222511014e-06, "loss": 0.0501, "step": 173 }, { "epoch": 0.24149895905621097, "grad_norm": 0.222194641828537, "learning_rate": 8.892085809580435e-06, "loss": 0.0447, "step": 174 }, { "epoch": 0.24288688410825815, "grad_norm": 0.2933279275894165, "learning_rate": 8.877939777730585e-06, "loss": 0.0561, "step": 175 }, { "epoch": 0.24427480916030533, "grad_norm": 0.26495489478111267, "learning_rate": 8.863715412706897e-06, "loss": 0.0488, "step": 176 }, { "epoch": 0.24566273421235255, "grad_norm": 0.25571662187576294, "learning_rate": 8.849413001837105e-06, "loss": 0.0469, "step": 177 }, { "epoch": 0.24705065926439973, "grad_norm": 0.24550214409828186, "learning_rate": 8.83503283402545e-06, "loss": 0.0516, "step": 178 }, { "epoch": 0.24843858431644691, "grad_norm": 0.25382155179977417, "learning_rate": 8.820575199746835e-06, "loss": 0.0531, "step": 179 }, { "epoch": 0.2498265093684941, "grad_norm": 0.3099108338356018, "learning_rate": 8.806040391040962e-06, "loss": 0.0507, "step": 180 }, { "epoch": 0.2512144344205413, "grad_norm": 0.25325724482536316, "learning_rate": 8.791428701506433e-06, "loss": 0.0473, "step": 181 }, { "epoch": 0.25260235947258847, "grad_norm": 0.24286960065364838, "learning_rate": 8.776740426294818e-06, "loss": 0.0531, "step": 182 }, { "epoch": 0.2539902845246357, "grad_norm": 0.2681328356266022, "learning_rate": 8.761975862104694e-06, "loss": 0.0506, "step": 183 }, { "epoch": 0.25537820957668284, "grad_norm": 0.2976507544517517, "learning_rate": 8.747135307175657e-06, "loss": 0.0464, "step": 184 }, { "epoch": 0.25676613462873005, "grad_norm": 0.2756042182445526, "learning_rate": 8.73221906128228e-06, "loss": 0.0454, "step": 185 }, { "epoch": 0.25815405968077726, "grad_norm": 0.24677051603794098, "learning_rate": 8.71722742572808e-06, "loss": 0.0443, "step": 186 }, { "epoch": 0.2595419847328244, "grad_norm": 0.2520540654659271, "learning_rate": 8.702160703339422e-06, "loss": 0.0411, "step": 187 }, { "epoch": 0.2609299097848716, "grad_norm": 0.26869460940361023, "learning_rate": 8.687019198459395e-06, "loss": 0.0538, "step": 188 }, { "epoch": 0.2623178348369188, "grad_norm": 0.2537235915660858, "learning_rate": 8.671803216941674e-06, "loss": 0.045, "step": 189 }, { "epoch": 0.263705759888966, "grad_norm": 0.25532054901123047, "learning_rate": 8.656513066144342e-06, "loss": 0.0421, "step": 190 }, { "epoch": 0.2650936849410132, "grad_norm": 0.23937132954597473, "learning_rate": 8.641149054923673e-06, "loss": 0.0488, "step": 191 }, { "epoch": 0.26648160999306036, "grad_norm": 0.22795534133911133, "learning_rate": 8.625711493627902e-06, "loss": 0.047, "step": 192 }, { "epoch": 0.2678695350451076, "grad_norm": 0.24898919463157654, "learning_rate": 8.610200694090951e-06, "loss": 0.0496, "step": 193 }, { "epoch": 0.26925746009715473, "grad_norm": 0.26497387886047363, "learning_rate": 8.594616969626134e-06, "loss": 0.0485, "step": 194 }, { "epoch": 0.27064538514920194, "grad_norm": 0.24637307226657867, "learning_rate": 8.578960635019822e-06, "loss": 0.0497, "step": 195 }, { "epoch": 0.27203331020124916, "grad_norm": 0.3122495412826538, "learning_rate": 8.563232006525093e-06, "loss": 0.0512, "step": 196 }, { "epoch": 0.2734212352532963, "grad_norm": 0.23246419429779053, "learning_rate": 8.547431401855333e-06, "loss": 0.0428, "step": 197 }, { "epoch": 0.2748091603053435, "grad_norm": 0.24396541714668274, "learning_rate": 8.531559140177828e-06, "loss": 0.0487, "step": 198 }, { "epoch": 0.2761970853573907, "grad_norm": 0.22116141021251678, "learning_rate": 8.515615542107317e-06, "loss": 0.0395, "step": 199 }, { "epoch": 0.2775850104094379, "grad_norm": 0.28872185945510864, "learning_rate": 8.499600929699501e-06, "loss": 0.048, "step": 200 }, { "epoch": 0.2789729354614851, "grad_norm": 0.2683909833431244, "learning_rate": 8.48351562644456e-06, "loss": 0.0462, "step": 201 }, { "epoch": 0.28036086051353226, "grad_norm": 0.22411032021045685, "learning_rate": 8.4673599572606e-06, "loss": 0.0485, "step": 202 }, { "epoch": 0.2817487855655795, "grad_norm": 0.27028921246528625, "learning_rate": 8.4511342484871e-06, "loss": 0.0591, "step": 203 }, { "epoch": 0.28313671061762663, "grad_norm": 0.25318291783332825, "learning_rate": 8.434838827878315e-06, "loss": 0.0402, "step": 204 }, { "epoch": 0.28452463566967384, "grad_norm": 0.2501783072948456, "learning_rate": 8.418474024596659e-06, "loss": 0.0456, "step": 205 }, { "epoch": 0.28591256072172105, "grad_norm": 0.24944418668746948, "learning_rate": 8.402040169206054e-06, "loss": 0.0439, "step": 206 }, { "epoch": 0.2873004857737682, "grad_norm": 0.33963543176651, "learning_rate": 8.38553759366525e-06, "loss": 0.0639, "step": 207 }, { "epoch": 0.2886884108258154, "grad_norm": 0.32349342107772827, "learning_rate": 8.36896663132113e-06, "loss": 0.0541, "step": 208 }, { "epoch": 0.2900763358778626, "grad_norm": 0.23536472022533417, "learning_rate": 8.352327616901956e-06, "loss": 0.0487, "step": 209 }, { "epoch": 0.2914642609299098, "grad_norm": 0.23875856399536133, "learning_rate": 8.335620886510637e-06, "loss": 0.048, "step": 210 }, { "epoch": 0.29285218598195695, "grad_norm": 0.25536614656448364, "learning_rate": 8.318846777617913e-06, "loss": 0.0538, "step": 211 }, { "epoch": 0.29424011103400416, "grad_norm": 0.37551552057266235, "learning_rate": 8.302005629055549e-06, "loss": 0.0464, "step": 212 }, { "epoch": 0.29562803608605137, "grad_norm": 0.25611814856529236, "learning_rate": 8.285097781009497e-06, "loss": 0.0451, "step": 213 }, { "epoch": 0.2970159611380985, "grad_norm": 0.26617372035980225, "learning_rate": 8.268123575013008e-06, "loss": 0.0567, "step": 214 }, { "epoch": 0.29840388619014574, "grad_norm": 0.2143125832080841, "learning_rate": 8.251083353939752e-06, "loss": 0.0481, "step": 215 }, { "epoch": 0.2997918112421929, "grad_norm": 0.2626519203186035, "learning_rate": 8.233977461996879e-06, "loss": 0.0474, "step": 216 }, { "epoch": 0.3011797362942401, "grad_norm": 0.20418819785118103, "learning_rate": 8.216806244718068e-06, "loss": 0.0396, "step": 217 }, { "epoch": 0.3025676613462873, "grad_norm": 0.30222561955451965, "learning_rate": 8.199570048956553e-06, "loss": 0.0494, "step": 218 }, { "epoch": 0.3039555863983345, "grad_norm": 0.37407004833221436, "learning_rate": 8.182269222878112e-06, "loss": 0.0536, "step": 219 }, { "epoch": 0.3053435114503817, "grad_norm": 0.3121405839920044, "learning_rate": 8.164904115954036e-06, "loss": 0.0448, "step": 220 }, { "epoch": 0.30673143650242884, "grad_norm": 0.31007713079452515, "learning_rate": 8.147475078954067e-06, "loss": 0.046, "step": 221 }, { "epoch": 0.30811936155447606, "grad_norm": 0.2768557369709015, "learning_rate": 8.129982463939313e-06, "loss": 0.0517, "step": 222 }, { "epoch": 0.30950728660652327, "grad_norm": 0.2517678141593933, "learning_rate": 8.112426624255145e-06, "loss": 0.0553, "step": 223 }, { "epoch": 0.3108952116585704, "grad_norm": 0.2336336374282837, "learning_rate": 8.094807914524048e-06, "loss": 0.0403, "step": 224 }, { "epoch": 0.31228313671061764, "grad_norm": 0.23134920001029968, "learning_rate": 8.07712669063846e-06, "loss": 0.043, "step": 225 }, { "epoch": 0.3136710617626648, "grad_norm": 0.21085438132286072, "learning_rate": 8.059383309753587e-06, "loss": 0.0377, "step": 226 }, { "epoch": 0.315058986814712, "grad_norm": 0.24013970792293549, "learning_rate": 8.041578130280194e-06, "loss": 0.0481, "step": 227 }, { "epoch": 0.3164469118667592, "grad_norm": 0.24835489690303802, "learning_rate": 8.023711511877347e-06, "loss": 0.0463, "step": 228 }, { "epoch": 0.3178348369188064, "grad_norm": 0.24152745306491852, "learning_rate": 8.005783815445168e-06, "loss": 0.0394, "step": 229 }, { "epoch": 0.3192227619708536, "grad_norm": 0.259420245885849, "learning_rate": 7.987795403117528e-06, "loss": 0.0468, "step": 230 }, { "epoch": 0.32061068702290074, "grad_norm": 0.2520180940628052, "learning_rate": 7.96974663825475e-06, "loss": 0.0474, "step": 231 }, { "epoch": 0.32199861207494795, "grad_norm": 0.22478239238262177, "learning_rate": 7.95163788543625e-06, "loss": 0.0446, "step": 232 }, { "epoch": 0.32338653712699517, "grad_norm": 0.3189745247364044, "learning_rate": 7.933469510453189e-06, "loss": 0.0621, "step": 233 }, { "epoch": 0.3247744621790423, "grad_norm": 0.18363076448440552, "learning_rate": 7.915241880301075e-06, "loss": 0.034, "step": 234 }, { "epoch": 0.32616238723108953, "grad_norm": 0.2726271450519562, "learning_rate": 7.896955363172347e-06, "loss": 0.0375, "step": 235 }, { "epoch": 0.3275503122831367, "grad_norm": 0.26370498538017273, "learning_rate": 7.878610328448948e-06, "loss": 0.0464, "step": 236 }, { "epoch": 0.3289382373351839, "grad_norm": 0.30265697836875916, "learning_rate": 7.86020714669486e-06, "loss": 0.061, "step": 237 }, { "epoch": 0.3303261623872311, "grad_norm": 0.20978808403015137, "learning_rate": 7.84174618964861e-06, "loss": 0.0371, "step": 238 }, { "epoch": 0.33171408743927827, "grad_norm": 0.22528868913650513, "learning_rate": 7.823227830215776e-06, "loss": 0.0432, "step": 239 }, { "epoch": 0.3331020124913255, "grad_norm": 0.22642700374126434, "learning_rate": 7.804652442461438e-06, "loss": 0.0492, "step": 240 }, { "epoch": 0.33448993754337264, "grad_norm": 0.25173112750053406, "learning_rate": 7.786020401602638e-06, "loss": 0.0489, "step": 241 }, { "epoch": 0.33587786259541985, "grad_norm": 0.2896901071071625, "learning_rate": 7.767332084000784e-06, "loss": 0.0481, "step": 242 }, { "epoch": 0.33726578764746706, "grad_norm": 0.2184886485338211, "learning_rate": 7.748587867154068e-06, "loss": 0.0356, "step": 243 }, { "epoch": 0.3386537126995142, "grad_norm": 0.22447901964187622, "learning_rate": 7.72978812968982e-06, "loss": 0.0347, "step": 244 }, { "epoch": 0.34004163775156143, "grad_norm": 0.20314303040504456, "learning_rate": 7.71093325135687e-06, "loss": 0.0431, "step": 245 }, { "epoch": 0.3414295628036086, "grad_norm": 0.22147540748119354, "learning_rate": 7.692023613017884e-06, "loss": 0.0423, "step": 246 }, { "epoch": 0.3428174878556558, "grad_norm": 0.2724509537220001, "learning_rate": 7.673059596641657e-06, "loss": 0.0414, "step": 247 }, { "epoch": 0.34420541290770296, "grad_norm": 0.22747007012367249, "learning_rate": 7.6540415852954e-06, "loss": 0.04, "step": 248 }, { "epoch": 0.34559333795975017, "grad_norm": 0.3057887852191925, "learning_rate": 7.634969963137015e-06, "loss": 0.0545, "step": 249 }, { "epoch": 0.3469812630117974, "grad_norm": 0.3109971582889557, "learning_rate": 7.615845115407316e-06, "loss": 0.0475, "step": 250 }, { "epoch": 0.34836918806384454, "grad_norm": 0.19809511303901672, "learning_rate": 7.596667428422264e-06, "loss": 0.0312, "step": 251 }, { "epoch": 0.34975711311589175, "grad_norm": 0.26312026381492615, "learning_rate": 7.5774372895651545e-06, "loss": 0.0494, "step": 252 }, { "epoch": 0.3511450381679389, "grad_norm": 0.23003171384334564, "learning_rate": 7.558155087278791e-06, "loss": 0.0542, "step": 253 }, { "epoch": 0.3525329632199861, "grad_norm": 0.24017004668712616, "learning_rate": 7.538821211057648e-06, "loss": 0.0478, "step": 254 }, { "epoch": 0.35392088827203333, "grad_norm": 0.20186010003089905, "learning_rate": 7.519436051439991e-06, "loss": 0.0326, "step": 255 }, { "epoch": 0.3553088133240805, "grad_norm": 0.2597668170928955, "learning_rate": 7.500000000000001e-06, "loss": 0.0501, "step": 256 }, { "epoch": 0.3566967383761277, "grad_norm": 0.23714697360992432, "learning_rate": 7.480513449339851e-06, "loss": 0.0483, "step": 257 }, { "epoch": 0.35808466342817485, "grad_norm": 0.3031267821788788, "learning_rate": 7.460976793081789e-06, "loss": 0.0507, "step": 258 }, { "epoch": 0.35947258848022207, "grad_norm": 0.2820730209350586, "learning_rate": 7.441390425860172e-06, "loss": 0.0502, "step": 259 }, { "epoch": 0.3608605135322693, "grad_norm": 0.3192010223865509, "learning_rate": 7.421754743313514e-06, "loss": 0.0619, "step": 260 }, { "epoch": 0.36224843858431643, "grad_norm": 0.2101418673992157, "learning_rate": 7.402070142076475e-06, "loss": 0.0411, "step": 261 }, { "epoch": 0.36363636363636365, "grad_norm": 0.2577596604824066, "learning_rate": 7.382337019771859e-06, "loss": 0.0422, "step": 262 }, { "epoch": 0.3650242886884108, "grad_norm": 0.20709669589996338, "learning_rate": 7.36255577500258e-06, "loss": 0.0382, "step": 263 }, { "epoch": 0.366412213740458, "grad_norm": 0.22359001636505127, "learning_rate": 7.342726807343615e-06, "loss": 0.0318, "step": 264 }, { "epoch": 0.3678001387925052, "grad_norm": 0.2126387655735016, "learning_rate": 7.322850517333924e-06, "loss": 0.0355, "step": 265 }, { "epoch": 0.3691880638445524, "grad_norm": 0.252213716506958, "learning_rate": 7.302927306468365e-06, "loss": 0.0451, "step": 266 }, { "epoch": 0.3705759888965996, "grad_norm": 0.21543753147125244, "learning_rate": 7.282957577189581e-06, "loss": 0.0531, "step": 267 }, { "epoch": 0.37196391394864675, "grad_norm": 0.2670553922653198, "learning_rate": 7.2629417328798755e-06, "loss": 0.0508, "step": 268 }, { "epoch": 0.37335183900069396, "grad_norm": 0.24284611642360687, "learning_rate": 7.242880177853062e-06, "loss": 0.0397, "step": 269 }, { "epoch": 0.3747397640527412, "grad_norm": 0.3197598159313202, "learning_rate": 7.222773317346291e-06, "loss": 0.0577, "step": 270 }, { "epoch": 0.37612768910478833, "grad_norm": 0.21058222651481628, "learning_rate": 7.202621557511874e-06, "loss": 0.0366, "step": 271 }, { "epoch": 0.37751561415683554, "grad_norm": 0.33534789085388184, "learning_rate": 7.1824253054090735e-06, "loss": 0.0445, "step": 272 }, { "epoch": 0.3789035392088827, "grad_norm": 0.26425862312316895, "learning_rate": 7.162184968995882e-06, "loss": 0.0513, "step": 273 }, { "epoch": 0.3802914642609299, "grad_norm": 0.19292885065078735, "learning_rate": 7.141900957120781e-06, "loss": 0.0376, "step": 274 }, { "epoch": 0.3816793893129771, "grad_norm": 0.25801414251327515, "learning_rate": 7.121573679514484e-06, "loss": 0.0468, "step": 275 }, { "epoch": 0.3830673143650243, "grad_norm": 0.22623024880886078, "learning_rate": 7.101203546781655e-06, "loss": 0.043, "step": 276 }, { "epoch": 0.3844552394170715, "grad_norm": 0.36003199219703674, "learning_rate": 7.080790970392626e-06, "loss": 0.0637, "step": 277 }, { "epoch": 0.38584316446911865, "grad_norm": 0.22922679781913757, "learning_rate": 7.060336362675069e-06, "loss": 0.0504, "step": 278 }, { "epoch": 0.38723108952116586, "grad_norm": 0.25158169865608215, "learning_rate": 7.039840136805679e-06, "loss": 0.039, "step": 279 }, { "epoch": 0.3886190145732131, "grad_norm": 0.3726769685745239, "learning_rate": 7.019302706801826e-06, "loss": 0.0577, "step": 280 }, { "epoch": 0.39000693962526023, "grad_norm": 0.2862564027309418, "learning_rate": 6.998724487513191e-06, "loss": 0.0446, "step": 281 }, { "epoch": 0.39139486467730744, "grad_norm": 0.2609567642211914, "learning_rate": 6.978105894613385e-06, "loss": 0.0574, "step": 282 }, { "epoch": 0.3927827897293546, "grad_norm": 0.3218703866004944, "learning_rate": 6.9574473445915495e-06, "loss": 0.0582, "step": 283 }, { "epoch": 0.3941707147814018, "grad_norm": 0.23220689594745636, "learning_rate": 6.936749254743951e-06, "loss": 0.0499, "step": 284 }, { "epoch": 0.39555863983344897, "grad_norm": 0.2209801822900772, "learning_rate": 6.916012043165552e-06, "loss": 0.0482, "step": 285 }, { "epoch": 0.3969465648854962, "grad_norm": 0.2856426239013672, "learning_rate": 6.895236128741554e-06, "loss": 0.0426, "step": 286 }, { "epoch": 0.3983344899375434, "grad_norm": 0.2667888402938843, "learning_rate": 6.87442193113895e-06, "loss": 0.0542, "step": 287 }, { "epoch": 0.39972241498959055, "grad_norm": 0.2344602793455124, "learning_rate": 6.8535698707980356e-06, "loss": 0.0434, "step": 288 }, { "epoch": 0.40111034004163776, "grad_norm": 0.24343138933181763, "learning_rate": 6.83268036892393e-06, "loss": 0.0508, "step": 289 }, { "epoch": 0.4024982650936849, "grad_norm": 0.22512559592723846, "learning_rate": 6.811753847478051e-06, "loss": 0.0396, "step": 290 }, { "epoch": 0.4038861901457321, "grad_norm": 0.1864270120859146, "learning_rate": 6.790790729169604e-06, "loss": 0.0388, "step": 291 }, { "epoch": 0.40527411519777934, "grad_norm": 0.19488897919654846, "learning_rate": 6.769791437447042e-06, "loss": 0.035, "step": 292 }, { "epoch": 0.4066620402498265, "grad_norm": 0.2777364253997803, "learning_rate": 6.7487563964895066e-06, "loss": 0.0519, "step": 293 }, { "epoch": 0.4080499653018737, "grad_norm": 0.1941017359495163, "learning_rate": 6.7276860311982614e-06, "loss": 0.0361, "step": 294 }, { "epoch": 0.40943789035392086, "grad_norm": 0.20925886929035187, "learning_rate": 6.7065807671881155e-06, "loss": 0.0353, "step": 295 }, { "epoch": 0.4108258154059681, "grad_norm": 0.27289116382598877, "learning_rate": 6.6854410307788175e-06, "loss": 0.0533, "step": 296 }, { "epoch": 0.4122137404580153, "grad_norm": 0.27907559275627136, "learning_rate": 6.664267248986447e-06, "loss": 0.0415, "step": 297 }, { "epoch": 0.41360166551006244, "grad_norm": 0.24534979462623596, "learning_rate": 6.643059849514795e-06, "loss": 0.0445, "step": 298 }, { "epoch": 0.41498959056210966, "grad_norm": 0.18391579389572144, "learning_rate": 6.621819260746713e-06, "loss": 0.0335, "step": 299 }, { "epoch": 0.4163775156141568, "grad_norm": 0.24801869690418243, "learning_rate": 6.600545911735468e-06, "loss": 0.044, "step": 300 }, { "epoch": 0.417765440666204, "grad_norm": 0.2516506314277649, "learning_rate": 6.579240232196073e-06, "loss": 0.0507, "step": 301 }, { "epoch": 0.41915336571825124, "grad_norm": 0.2211104929447174, "learning_rate": 6.5579026524966106e-06, "loss": 0.0397, "step": 302 }, { "epoch": 0.4205412907702984, "grad_norm": 0.3019264042377472, "learning_rate": 6.536533603649536e-06, "loss": 0.0554, "step": 303 }, { "epoch": 0.4219292158223456, "grad_norm": 0.2607039213180542, "learning_rate": 6.515133517302969e-06, "loss": 0.0443, "step": 304 }, { "epoch": 0.42331714087439276, "grad_norm": 0.26423439383506775, "learning_rate": 6.493702825731977e-06, "loss": 0.052, "step": 305 }, { "epoch": 0.42470506592644, "grad_norm": 0.2512415647506714, "learning_rate": 6.472241961829846e-06, "loss": 0.0437, "step": 306 }, { "epoch": 0.4260929909784872, "grad_norm": 0.2525016963481903, "learning_rate": 6.450751359099332e-06, "loss": 0.0372, "step": 307 }, { "epoch": 0.42748091603053434, "grad_norm": 0.2588847577571869, "learning_rate": 6.429231451643907e-06, "loss": 0.0428, "step": 308 }, { "epoch": 0.42886884108258155, "grad_norm": 0.27101561427116394, "learning_rate": 6.407682674158988e-06, "loss": 0.0497, "step": 309 }, { "epoch": 0.4302567661346287, "grad_norm": 0.2133539319038391, "learning_rate": 6.386105461923159e-06, "loss": 0.0362, "step": 310 }, { "epoch": 0.4316446911866759, "grad_norm": 0.2591789960861206, "learning_rate": 6.364500250789375e-06, "loss": 0.0434, "step": 311 }, { "epoch": 0.43303261623872313, "grad_norm": 0.2357206642627716, "learning_rate": 6.342867477176164e-06, "loss": 0.0425, "step": 312 }, { "epoch": 0.4344205412907703, "grad_norm": 0.22705315053462982, "learning_rate": 6.321207578058803e-06, "loss": 0.049, "step": 313 }, { "epoch": 0.4358084663428175, "grad_norm": 0.22295016050338745, "learning_rate": 6.299520990960497e-06, "loss": 0.0455, "step": 314 }, { "epoch": 0.43719639139486466, "grad_norm": 0.2628712058067322, "learning_rate": 6.2778081539435436e-06, "loss": 0.0481, "step": 315 }, { "epoch": 0.43858431644691187, "grad_norm": 0.26134777069091797, "learning_rate": 6.256069505600474e-06, "loss": 0.0526, "step": 316 }, { "epoch": 0.4399722414989591, "grad_norm": 0.16641436517238617, "learning_rate": 6.234305485045205e-06, "loss": 0.0376, "step": 317 }, { "epoch": 0.44136016655100624, "grad_norm": 0.2876615822315216, "learning_rate": 6.212516531904164e-06, "loss": 0.0474, "step": 318 }, { "epoch": 0.44274809160305345, "grad_norm": 0.2932173013687134, "learning_rate": 6.1907030863074055e-06, "loss": 0.0508, "step": 319 }, { "epoch": 0.4441360166551006, "grad_norm": 0.2550623118877411, "learning_rate": 6.16886558887973e-06, "loss": 0.0457, "step": 320 }, { "epoch": 0.4455239417071478, "grad_norm": 0.22620978951454163, "learning_rate": 6.1470044807317695e-06, "loss": 0.0445, "step": 321 }, { "epoch": 0.44691186675919503, "grad_norm": 0.2603720724582672, "learning_rate": 6.1251202034510905e-06, "loss": 0.0519, "step": 322 }, { "epoch": 0.4482997918112422, "grad_norm": 0.2070014923810959, "learning_rate": 6.103213199093267e-06, "loss": 0.0365, "step": 323 }, { "epoch": 0.4496877168632894, "grad_norm": 0.2892493009567261, "learning_rate": 6.081283910172956e-06, "loss": 0.0586, "step": 324 }, { "epoch": 0.45107564191533656, "grad_norm": 0.2348898947238922, "learning_rate": 6.059332779654953e-06, "loss": 0.0426, "step": 325 }, { "epoch": 0.45246356696738377, "grad_norm": 0.24212618172168732, "learning_rate": 6.037360250945243e-06, "loss": 0.0378, "step": 326 }, { "epoch": 0.4538514920194309, "grad_norm": 0.2537144422531128, "learning_rate": 6.015366767882054e-06, "loss": 0.0412, "step": 327 }, { "epoch": 0.45523941707147814, "grad_norm": 0.21163740754127502, "learning_rate": 5.993352774726885e-06, "loss": 0.0358, "step": 328 }, { "epoch": 0.45662734212352535, "grad_norm": 0.340040385723114, "learning_rate": 5.97131871615553e-06, "loss": 0.0428, "step": 329 }, { "epoch": 0.4580152671755725, "grad_norm": 0.28573116660118103, "learning_rate": 5.949265037249096e-06, "loss": 0.0557, "step": 330 }, { "epoch": 0.4594031922276197, "grad_norm": 0.2429424375295639, "learning_rate": 5.927192183485023e-06, "loss": 0.0444, "step": 331 }, { "epoch": 0.4607911172796669, "grad_norm": 0.19028761982917786, "learning_rate": 5.905100600728067e-06, "loss": 0.0368, "step": 332 }, { "epoch": 0.4621790423317141, "grad_norm": 0.24256815016269684, "learning_rate": 5.882990735221312e-06, "loss": 0.0501, "step": 333 }, { "epoch": 0.4635669673837613, "grad_norm": 0.2468879520893097, "learning_rate": 5.860863033577141e-06, "loss": 0.0551, "step": 334 }, { "epoch": 0.46495489243580845, "grad_norm": 0.26638200879096985, "learning_rate": 5.8387179427682265e-06, "loss": 0.038, "step": 335 }, { "epoch": 0.46634281748785567, "grad_norm": 0.2620946764945984, "learning_rate": 5.8165559101184955e-06, "loss": 0.0596, "step": 336 }, { "epoch": 0.4677307425399028, "grad_norm": 0.3298340439796448, "learning_rate": 5.794377383294094e-06, "loss": 0.0422, "step": 337 }, { "epoch": 0.46911866759195003, "grad_norm": 0.2864164710044861, "learning_rate": 5.7721828102943445e-06, "loss": 0.0557, "step": 338 }, { "epoch": 0.47050659264399725, "grad_norm": 0.19313201308250427, "learning_rate": 5.749972639442698e-06, "loss": 0.0352, "step": 339 }, { "epoch": 0.4718945176960444, "grad_norm": 0.2098756730556488, "learning_rate": 5.72774731937768e-06, "loss": 0.0465, "step": 340 }, { "epoch": 0.4732824427480916, "grad_norm": 0.23879243433475494, "learning_rate": 5.705507299043822e-06, "loss": 0.0407, "step": 341 }, { "epoch": 0.47467036780013877, "grad_norm": 0.24368953704833984, "learning_rate": 5.683253027682597e-06, "loss": 0.047, "step": 342 }, { "epoch": 0.476058292852186, "grad_norm": 0.22611577808856964, "learning_rate": 5.660984954823342e-06, "loss": 0.0444, "step": 343 }, { "epoch": 0.4774462179042332, "grad_norm": 0.21530263125896454, "learning_rate": 5.638703530274187e-06, "loss": 0.0391, "step": 344 }, { "epoch": 0.47883414295628035, "grad_norm": 0.2654690444469452, "learning_rate": 5.6164092041129544e-06, "loss": 0.0537, "step": 345 }, { "epoch": 0.48022206800832756, "grad_norm": 0.24219410121440887, "learning_rate": 5.594102426678082e-06, "loss": 0.0469, "step": 346 }, { "epoch": 0.4816099930603747, "grad_norm": 0.2516026496887207, "learning_rate": 5.57178364855951e-06, "loss": 0.0438, "step": 347 }, { "epoch": 0.48299791811242193, "grad_norm": 0.18920086324214935, "learning_rate": 5.549453320589598e-06, "loss": 0.0383, "step": 348 }, { "epoch": 0.48438584316446914, "grad_norm": 0.25243017077445984, "learning_rate": 5.527111893834004e-06, "loss": 0.0473, "step": 349 }, { "epoch": 0.4857737682165163, "grad_norm": 0.18175047636032104, "learning_rate": 5.504759819582581e-06, "loss": 0.039, "step": 350 }, { "epoch": 0.4871616932685635, "grad_norm": 0.262374609708786, "learning_rate": 5.482397549340256e-06, "loss": 0.0502, "step": 351 }, { "epoch": 0.48854961832061067, "grad_norm": 0.22592027485370636, "learning_rate": 5.460025534817911e-06, "loss": 0.0463, "step": 352 }, { "epoch": 0.4899375433726579, "grad_norm": 0.24996748566627502, "learning_rate": 5.437644227923261e-06, "loss": 0.0492, "step": 353 }, { "epoch": 0.4913254684247051, "grad_norm": 0.2771015167236328, "learning_rate": 5.415254080751725e-06, "loss": 0.0441, "step": 354 }, { "epoch": 0.49271339347675225, "grad_norm": 0.21063049137592316, "learning_rate": 5.39285554557729e-06, "loss": 0.041, "step": 355 }, { "epoch": 0.49410131852879946, "grad_norm": 0.2659851014614105, "learning_rate": 5.37044907484338e-06, "loss": 0.051, "step": 356 }, { "epoch": 0.4954892435808466, "grad_norm": 0.24859070777893066, "learning_rate": 5.348035121153716e-06, "loss": 0.0454, "step": 357 }, { "epoch": 0.49687716863289383, "grad_norm": 0.2629816234111786, "learning_rate": 5.32561413726317e-06, "loss": 0.0433, "step": 358 }, { "epoch": 0.49826509368494104, "grad_norm": 0.21318283677101135, "learning_rate": 5.303186576068621e-06, "loss": 0.0408, "step": 359 }, { "epoch": 0.4996530187369882, "grad_norm": 0.28312963247299194, "learning_rate": 5.28075289059981e-06, "loss": 0.0464, "step": 360 }, { "epoch": 0.5010409437890354, "grad_norm": 0.2362765371799469, "learning_rate": 5.258313534010187e-06, "loss": 0.0365, "step": 361 }, { "epoch": 0.5024288688410826, "grad_norm": 0.17733339965343475, "learning_rate": 5.235868959567755e-06, "loss": 0.0348, "step": 362 }, { "epoch": 0.5038167938931297, "grad_norm": 0.29030370712280273, "learning_rate": 5.213419620645914e-06, "loss": 0.0451, "step": 363 }, { "epoch": 0.5052047189451769, "grad_norm": 0.26337236166000366, "learning_rate": 5.1909659707143105e-06, "loss": 0.0403, "step": 364 }, { "epoch": 0.5065926439972241, "grad_norm": 0.27861809730529785, "learning_rate": 5.1685084633296665e-06, "loss": 0.0489, "step": 365 }, { "epoch": 0.5079805690492714, "grad_norm": 0.26633739471435547, "learning_rate": 5.14604755212663e-06, "loss": 0.0539, "step": 366 }, { "epoch": 0.5093684941013186, "grad_norm": 0.2497876137495041, "learning_rate": 5.123583690808596e-06, "loss": 0.0469, "step": 367 }, { "epoch": 0.5107564191533657, "grad_norm": 0.2367750108242035, "learning_rate": 5.101117333138558e-06, "loss": 0.0455, "step": 368 }, { "epoch": 0.5121443442054129, "grad_norm": 0.21793133020401, "learning_rate": 5.078648932929933e-06, "loss": 0.0441, "step": 369 }, { "epoch": 0.5135322692574601, "grad_norm": 0.2375047653913498, "learning_rate": 5.056178944037396e-06, "loss": 0.0536, "step": 370 }, { "epoch": 0.5149201943095073, "grad_norm": 0.24593202769756317, "learning_rate": 5.033707820347715e-06, "loss": 0.0488, "step": 371 }, { "epoch": 0.5163081193615545, "grad_norm": 0.2014894336462021, "learning_rate": 5.011236015770577e-06, "loss": 0.0438, "step": 372 }, { "epoch": 0.5176960444136016, "grad_norm": 0.2295396327972412, "learning_rate": 4.988763984229425e-06, "loss": 0.0426, "step": 373 }, { "epoch": 0.5190839694656488, "grad_norm": 0.23237484693527222, "learning_rate": 4.9662921796522856e-06, "loss": 0.0432, "step": 374 }, { "epoch": 0.520471894517696, "grad_norm": 0.18679746985435486, "learning_rate": 4.9438210559626045e-06, "loss": 0.0314, "step": 375 }, { "epoch": 0.5218598195697433, "grad_norm": 0.2613310217857361, "learning_rate": 4.921351067070068e-06, "loss": 0.0517, "step": 376 }, { "epoch": 0.5232477446217905, "grad_norm": 0.21653001010417938, "learning_rate": 4.898882666861444e-06, "loss": 0.0374, "step": 377 }, { "epoch": 0.5246356696738376, "grad_norm": 0.23039276897907257, "learning_rate": 4.876416309191406e-06, "loss": 0.0353, "step": 378 }, { "epoch": 0.5260235947258848, "grad_norm": 0.2714105248451233, "learning_rate": 4.853952447873371e-06, "loss": 0.0552, "step": 379 }, { "epoch": 0.527411519777932, "grad_norm": 0.25004643201828003, "learning_rate": 4.831491536670334e-06, "loss": 0.0462, "step": 380 }, { "epoch": 0.5287994448299792, "grad_norm": 0.22087691724300385, "learning_rate": 4.809034029285691e-06, "loss": 0.0358, "step": 381 }, { "epoch": 0.5301873698820264, "grad_norm": 0.20914320647716522, "learning_rate": 4.786580379354087e-06, "loss": 0.0362, "step": 382 }, { "epoch": 0.5315752949340735, "grad_norm": 0.1910157948732376, "learning_rate": 4.7641310404322475e-06, "loss": 0.0388, "step": 383 }, { "epoch": 0.5329632199861207, "grad_norm": 0.2663300037384033, "learning_rate": 4.741686465989814e-06, "loss": 0.0458, "step": 384 }, { "epoch": 0.5343511450381679, "grad_norm": 0.19584941864013672, "learning_rate": 4.719247109400192e-06, "loss": 0.042, "step": 385 }, { "epoch": 0.5357390700902152, "grad_norm": 0.21342402696609497, "learning_rate": 4.696813423931381e-06, "loss": 0.0441, "step": 386 }, { "epoch": 0.5371269951422624, "grad_norm": 0.2350674718618393, "learning_rate": 4.674385862736832e-06, "loss": 0.0479, "step": 387 }, { "epoch": 0.5385149201943095, "grad_norm": 0.27498018741607666, "learning_rate": 4.651964878846285e-06, "loss": 0.0419, "step": 388 }, { "epoch": 0.5399028452463567, "grad_norm": 0.19162392616271973, "learning_rate": 4.62955092515662e-06, "loss": 0.0396, "step": 389 }, { "epoch": 0.5412907702984039, "grad_norm": 0.2587217688560486, "learning_rate": 4.607144454422711e-06, "loss": 0.048, "step": 390 }, { "epoch": 0.5426786953504511, "grad_norm": 0.3030252456665039, "learning_rate": 4.584745919248275e-06, "loss": 0.0629, "step": 391 }, { "epoch": 0.5440666204024983, "grad_norm": 0.27365297079086304, "learning_rate": 4.56235577207674e-06, "loss": 0.0356, "step": 392 }, { "epoch": 0.5454545454545454, "grad_norm": 0.22301366925239563, "learning_rate": 4.5399744651820915e-06, "loss": 0.0399, "step": 393 }, { "epoch": 0.5468424705065926, "grad_norm": 0.23638130724430084, "learning_rate": 4.517602450659746e-06, "loss": 0.0362, "step": 394 }, { "epoch": 0.5482303955586398, "grad_norm": 0.23380909860134125, "learning_rate": 4.49524018041742e-06, "loss": 0.0439, "step": 395 }, { "epoch": 0.549618320610687, "grad_norm": 0.1905248910188675, "learning_rate": 4.472888106165995e-06, "loss": 0.042, "step": 396 }, { "epoch": 0.5510062456627343, "grad_norm": 0.25941696763038635, "learning_rate": 4.450546679410403e-06, "loss": 0.046, "step": 397 }, { "epoch": 0.5523941707147814, "grad_norm": 0.25850844383239746, "learning_rate": 4.428216351440492e-06, "loss": 0.0457, "step": 398 }, { "epoch": 0.5537820957668286, "grad_norm": 0.19940395653247833, "learning_rate": 4.40589757332192e-06, "loss": 0.0416, "step": 399 }, { "epoch": 0.5551700208188758, "grad_norm": 0.2690625786781311, "learning_rate": 4.383590795887046e-06, "loss": 0.0546, "step": 400 }, { "epoch": 0.556557945870923, "grad_norm": 0.21887092292308807, "learning_rate": 4.361296469725813e-06, "loss": 0.0458, "step": 401 }, { "epoch": 0.5579458709229702, "grad_norm": 0.19797225296497345, "learning_rate": 4.339015045176659e-06, "loss": 0.038, "step": 402 }, { "epoch": 0.5593337959750173, "grad_norm": 0.2749207317829132, "learning_rate": 4.316746972317406e-06, "loss": 0.0617, "step": 403 }, { "epoch": 0.5607217210270645, "grad_norm": 0.26302123069763184, "learning_rate": 4.2944927009561786e-06, "loss": 0.0404, "step": 404 }, { "epoch": 0.5621096460791117, "grad_norm": 0.2845047116279602, "learning_rate": 4.272252680622321e-06, "loss": 0.0537, "step": 405 }, { "epoch": 0.563497571131159, "grad_norm": 0.20317070186138153, "learning_rate": 4.250027360557302e-06, "loss": 0.039, "step": 406 }, { "epoch": 0.5648854961832062, "grad_norm": 0.3297349214553833, "learning_rate": 4.227817189705657e-06, "loss": 0.0659, "step": 407 }, { "epoch": 0.5662734212352533, "grad_norm": 0.19175824522972107, "learning_rate": 4.205622616705909e-06, "loss": 0.0344, "step": 408 }, { "epoch": 0.5676613462873005, "grad_norm": 0.21104075014591217, "learning_rate": 4.183444089881506e-06, "loss": 0.0403, "step": 409 }, { "epoch": 0.5690492713393477, "grad_norm": 0.23763036727905273, "learning_rate": 4.161282057231776e-06, "loss": 0.0425, "step": 410 }, { "epoch": 0.5704371963913949, "grad_norm": 0.27829548716545105, "learning_rate": 4.13913696642286e-06, "loss": 0.0379, "step": 411 }, { "epoch": 0.5718251214434421, "grad_norm": 0.2803460359573364, "learning_rate": 4.1170092647786895e-06, "loss": 0.0566, "step": 412 }, { "epoch": 0.5732130464954892, "grad_norm": 0.2148689329624176, "learning_rate": 4.094899399271935e-06, "loss": 0.0497, "step": 413 }, { "epoch": 0.5746009715475364, "grad_norm": 0.25749555230140686, "learning_rate": 4.072807816514978e-06, "loss": 0.0543, "step": 414 }, { "epoch": 0.5759888965995836, "grad_norm": 0.21719405055046082, "learning_rate": 4.0507349627509045e-06, "loss": 0.0368, "step": 415 }, { "epoch": 0.5773768216516308, "grad_norm": 0.1985744833946228, "learning_rate": 4.028681283844471e-06, "loss": 0.0326, "step": 416 }, { "epoch": 0.5787647467036781, "grad_norm": 0.33434179425239563, "learning_rate": 4.006647225273116e-06, "loss": 0.0477, "step": 417 }, { "epoch": 0.5801526717557252, "grad_norm": 0.2449856400489807, "learning_rate": 3.984633232117948e-06, "loss": 0.0495, "step": 418 }, { "epoch": 0.5815405968077724, "grad_norm": 0.2528287172317505, "learning_rate": 3.96263974905476e-06, "loss": 0.0463, "step": 419 }, { "epoch": 0.5829285218598196, "grad_norm": 0.19862471520900726, "learning_rate": 3.94066722034505e-06, "loss": 0.0404, "step": 420 }, { "epoch": 0.5843164469118668, "grad_norm": 0.22756393253803253, "learning_rate": 3.9187160898270435e-06, "loss": 0.0463, "step": 421 }, { "epoch": 0.5857043719639139, "grad_norm": 0.23390746116638184, "learning_rate": 3.896786800906734e-06, "loss": 0.0478, "step": 422 }, { "epoch": 0.5870922970159611, "grad_norm": 0.302847295999527, "learning_rate": 3.87487979654891e-06, "loss": 0.0481, "step": 423 }, { "epoch": 0.5884802220680083, "grad_norm": 0.27103695273399353, "learning_rate": 3.852995519268231e-06, "loss": 0.0501, "step": 424 }, { "epoch": 0.5898681471200555, "grad_norm": 0.2562784254550934, "learning_rate": 3.831134411120273e-06, "loss": 0.0443, "step": 425 }, { "epoch": 0.5912560721721027, "grad_norm": 0.2507801949977875, "learning_rate": 3.809296913692594e-06, "loss": 0.0419, "step": 426 }, { "epoch": 0.5926439972241498, "grad_norm": 0.2666143476963043, "learning_rate": 3.787483468095838e-06, "loss": 0.0338, "step": 427 }, { "epoch": 0.594031922276197, "grad_norm": 0.20649617910385132, "learning_rate": 3.765694514954796e-06, "loss": 0.0495, "step": 428 }, { "epoch": 0.5954198473282443, "grad_norm": 0.25489917397499084, "learning_rate": 3.7439304943995274e-06, "loss": 0.0466, "step": 429 }, { "epoch": 0.5968077723802915, "grad_norm": 0.21290229260921478, "learning_rate": 3.72219184605646e-06, "loss": 0.0488, "step": 430 }, { "epoch": 0.5981956974323387, "grad_norm": 0.25812432169914246, "learning_rate": 3.7004790090395043e-06, "loss": 0.0473, "step": 431 }, { "epoch": 0.5995836224843858, "grad_norm": 0.2217107117176056, "learning_rate": 3.678792421941199e-06, "loss": 0.0415, "step": 432 }, { "epoch": 0.600971547536433, "grad_norm": 0.21589906513690948, "learning_rate": 3.657132522823837e-06, "loss": 0.0395, "step": 433 }, { "epoch": 0.6023594725884802, "grad_norm": 0.2054014950990677, "learning_rate": 3.6354997492106258e-06, "loss": 0.0478, "step": 434 }, { "epoch": 0.6037473976405274, "grad_norm": 0.2601326107978821, "learning_rate": 3.6138945380768442e-06, "loss": 0.0418, "step": 435 }, { "epoch": 0.6051353226925746, "grad_norm": 0.19384582340717316, "learning_rate": 3.592317325841014e-06, "loss": 0.0357, "step": 436 }, { "epoch": 0.6065232477446217, "grad_norm": 0.22405052185058594, "learning_rate": 3.5707685483560948e-06, "loss": 0.0373, "step": 437 }, { "epoch": 0.607911172796669, "grad_norm": 0.274188756942749, "learning_rate": 3.5492486409006684e-06, "loss": 0.0522, "step": 438 }, { "epoch": 0.6092990978487162, "grad_norm": 0.21671554446220398, "learning_rate": 3.5277580381701553e-06, "loss": 0.0417, "step": 439 }, { "epoch": 0.6106870229007634, "grad_norm": 0.2146933525800705, "learning_rate": 3.5062971742680244e-06, "loss": 0.04, "step": 440 }, { "epoch": 0.6120749479528106, "grad_norm": 0.20991787314414978, "learning_rate": 3.484866482697032e-06, "loss": 0.0365, "step": 441 }, { "epoch": 0.6134628730048577, "grad_norm": 0.26411962509155273, "learning_rate": 3.4634663963504654e-06, "loss": 0.0514, "step": 442 }, { "epoch": 0.6148507980569049, "grad_norm": 0.19666975736618042, "learning_rate": 3.4420973475033894e-06, "loss": 0.0377, "step": 443 }, { "epoch": 0.6162387231089521, "grad_norm": 0.21241791546344757, "learning_rate": 3.4207597678039293e-06, "loss": 0.0358, "step": 444 }, { "epoch": 0.6176266481609993, "grad_norm": 0.24057012796401978, "learning_rate": 3.3994540882645353e-06, "loss": 0.0514, "step": 445 }, { "epoch": 0.6190145732130465, "grad_norm": 0.24572890996932983, "learning_rate": 3.3781807392532893e-06, "loss": 0.0406, "step": 446 }, { "epoch": 0.6204024982650936, "grad_norm": 0.1983802616596222, "learning_rate": 3.3569401504852073e-06, "loss": 0.0371, "step": 447 }, { "epoch": 0.6217904233171409, "grad_norm": 0.25842270255088806, "learning_rate": 3.335732751013553e-06, "loss": 0.0476, "step": 448 }, { "epoch": 0.6231783483691881, "grad_norm": 0.19981160759925842, "learning_rate": 3.3145589692211837e-06, "loss": 0.0378, "step": 449 }, { "epoch": 0.6245662734212353, "grad_norm": 0.1566598266363144, "learning_rate": 3.2934192328118866e-06, "loss": 0.0295, "step": 450 }, { "epoch": 0.6259541984732825, "grad_norm": 0.2110665738582611, "learning_rate": 3.27231396880174e-06, "loss": 0.0406, "step": 451 }, { "epoch": 0.6273421235253296, "grad_norm": 0.21389417350292206, "learning_rate": 3.2512436035104968e-06, "loss": 0.0445, "step": 452 }, { "epoch": 0.6287300485773768, "grad_norm": 0.2280310094356537, "learning_rate": 3.2302085625529596e-06, "loss": 0.0407, "step": 453 }, { "epoch": 0.630117973629424, "grad_norm": 0.23333007097244263, "learning_rate": 3.2092092708303973e-06, "loss": 0.042, "step": 454 }, { "epoch": 0.6315058986814712, "grad_norm": 0.2135223001241684, "learning_rate": 3.18824615252195e-06, "loss": 0.0394, "step": 455 }, { "epoch": 0.6328938237335184, "grad_norm": 0.2351330667734146, "learning_rate": 3.1673196310760723e-06, "loss": 0.042, "step": 456 }, { "epoch": 0.6342817487855655, "grad_norm": 0.30510368943214417, "learning_rate": 3.146430129201965e-06, "loss": 0.064, "step": 457 }, { "epoch": 0.6356696738376127, "grad_norm": 0.232173889875412, "learning_rate": 3.125578068861051e-06, "loss": 0.0465, "step": 458 }, { "epoch": 0.63705759888966, "grad_norm": 0.2363738715648651, "learning_rate": 3.104763871258447e-06, "loss": 0.043, "step": 459 }, { "epoch": 0.6384455239417072, "grad_norm": 0.21220462024211884, "learning_rate": 3.083987956834449e-06, "loss": 0.0418, "step": 460 }, { "epoch": 0.6398334489937544, "grad_norm": 0.2919938862323761, "learning_rate": 3.06325074525605e-06, "loss": 0.0583, "step": 461 }, { "epoch": 0.6412213740458015, "grad_norm": 0.18480442464351654, "learning_rate": 3.0425526554084526e-06, "loss": 0.0376, "step": 462 }, { "epoch": 0.6426092990978487, "grad_norm": 0.21944300830364227, "learning_rate": 3.0218941053866167e-06, "loss": 0.035, "step": 463 }, { "epoch": 0.6439972241498959, "grad_norm": 0.2114299088716507, "learning_rate": 3.00127551248681e-06, "loss": 0.0374, "step": 464 }, { "epoch": 0.6453851492019431, "grad_norm": 0.222478449344635, "learning_rate": 2.980697293198174e-06, "loss": 0.038, "step": 465 }, { "epoch": 0.6467730742539903, "grad_norm": 0.2513335943222046, "learning_rate": 2.960159863194322e-06, "loss": 0.0433, "step": 466 }, { "epoch": 0.6481609993060374, "grad_norm": 0.2263910174369812, "learning_rate": 2.939663637324934e-06, "loss": 0.0425, "step": 467 }, { "epoch": 0.6495489243580846, "grad_norm": 0.2414645254611969, "learning_rate": 2.9192090296073755e-06, "loss": 0.0434, "step": 468 }, { "epoch": 0.6509368494101319, "grad_norm": 0.31796714663505554, "learning_rate": 2.8987964532183454e-06, "loss": 0.056, "step": 469 }, { "epoch": 0.6523247744621791, "grad_norm": 0.2518536448478699, "learning_rate": 2.878426320485518e-06, "loss": 0.0507, "step": 470 }, { "epoch": 0.6537126995142263, "grad_norm": 0.24848508834838867, "learning_rate": 2.8580990428792205e-06, "loss": 0.0432, "step": 471 }, { "epoch": 0.6551006245662734, "grad_norm": 0.2785174250602722, "learning_rate": 2.8378150310041197e-06, "loss": 0.0555, "step": 472 }, { "epoch": 0.6564885496183206, "grad_norm": 0.48383262753486633, "learning_rate": 2.8175746945909277e-06, "loss": 0.0437, "step": 473 }, { "epoch": 0.6578764746703678, "grad_norm": 0.2524990439414978, "learning_rate": 2.7973784424881273e-06, "loss": 0.0392, "step": 474 }, { "epoch": 0.659264399722415, "grad_norm": 0.22584012150764465, "learning_rate": 2.7772266826537103e-06, "loss": 0.0398, "step": 475 }, { "epoch": 0.6606523247744622, "grad_norm": 0.20362503826618195, "learning_rate": 2.75711982214694e-06, "loss": 0.0431, "step": 476 }, { "epoch": 0.6620402498265093, "grad_norm": 0.2138269990682602, "learning_rate": 2.7370582671201253e-06, "loss": 0.0336, "step": 477 }, { "epoch": 0.6634281748785565, "grad_norm": 0.1999903917312622, "learning_rate": 2.7170424228104207e-06, "loss": 0.0295, "step": 478 }, { "epoch": 0.6648160999306038, "grad_norm": 0.2650030553340912, "learning_rate": 2.697072693531637e-06, "loss": 0.0418, "step": 479 }, { "epoch": 0.666204024982651, "grad_norm": 0.26400989294052124, "learning_rate": 2.6771494826660782e-06, "loss": 0.0495, "step": 480 }, { "epoch": 0.6675919500346982, "grad_norm": 0.3058173358440399, "learning_rate": 2.6572731926563867e-06, "loss": 0.0366, "step": 481 }, { "epoch": 0.6689798750867453, "grad_norm": 0.26100462675094604, "learning_rate": 2.6374442249974214e-06, "loss": 0.0525, "step": 482 }, { "epoch": 0.6703678001387925, "grad_norm": 0.3068617284297943, "learning_rate": 2.617662980228144e-06, "loss": 0.0448, "step": 483 }, { "epoch": 0.6717557251908397, "grad_norm": 0.21236619353294373, "learning_rate": 2.5979298579235276e-06, "loss": 0.0395, "step": 484 }, { "epoch": 0.6731436502428869, "grad_norm": 0.213504821062088, "learning_rate": 2.578245256686488e-06, "loss": 0.0329, "step": 485 }, { "epoch": 0.6745315752949341, "grad_norm": 0.2812097668647766, "learning_rate": 2.558609574139829e-06, "loss": 0.0427, "step": 486 }, { "epoch": 0.6759195003469812, "grad_norm": 0.23506538569927216, "learning_rate": 2.539023206918212e-06, "loss": 0.0422, "step": 487 }, { "epoch": 0.6773074253990284, "grad_norm": 0.17870992422103882, "learning_rate": 2.5194865506601507e-06, "loss": 0.0388, "step": 488 }, { "epoch": 0.6786953504510757, "grad_norm": 0.284327894449234, "learning_rate": 2.5000000000000015e-06, "loss": 0.0488, "step": 489 }, { "epoch": 0.6800832755031229, "grad_norm": 0.23880119621753693, "learning_rate": 2.4805639485600087e-06, "loss": 0.0412, "step": 490 }, { "epoch": 0.6814712005551701, "grad_norm": 0.2482188642024994, "learning_rate": 2.4611787889423546e-06, "loss": 0.0404, "step": 491 }, { "epoch": 0.6828591256072172, "grad_norm": 0.24093759059906006, "learning_rate": 2.441844912721209e-06, "loss": 0.0488, "step": 492 }, { "epoch": 0.6842470506592644, "grad_norm": 0.2581704258918762, "learning_rate": 2.422562710434848e-06, "loss": 0.041, "step": 493 }, { "epoch": 0.6856349757113116, "grad_norm": 0.29225507378578186, "learning_rate": 2.403332571577738e-06, "loss": 0.0562, "step": 494 }, { "epoch": 0.6870229007633588, "grad_norm": 0.23954467475414276, "learning_rate": 2.3841548845926844e-06, "loss": 0.0468, "step": 495 }, { "epoch": 0.6884108258154059, "grad_norm": 0.23937787115573883, "learning_rate": 2.365030036862988e-06, "loss": 0.0405, "step": 496 }, { "epoch": 0.6897987508674531, "grad_norm": 0.28101256489753723, "learning_rate": 2.3459584147046e-06, "loss": 0.0555, "step": 497 }, { "epoch": 0.6911866759195003, "grad_norm": 0.1975279599428177, "learning_rate": 2.3269404033583443e-06, "loss": 0.0409, "step": 498 }, { "epoch": 0.6925746009715475, "grad_norm": 0.19648875296115875, "learning_rate": 2.3079763869821176e-06, "loss": 0.0317, "step": 499 }, { "epoch": 0.6939625260235948, "grad_norm": 0.1975594460964203, "learning_rate": 2.2890667486431296e-06, "loss": 0.0346, "step": 500 }, { "epoch": 0.6953504510756419, "grad_norm": 0.2302437722682953, "learning_rate": 2.270211870310184e-06, "loss": 0.0427, "step": 501 }, { "epoch": 0.6967383761276891, "grad_norm": 0.2960100769996643, "learning_rate": 2.251412132845933e-06, "loss": 0.0535, "step": 502 }, { "epoch": 0.6981263011797363, "grad_norm": 0.18378931283950806, "learning_rate": 2.232667915999216e-06, "loss": 0.0375, "step": 503 }, { "epoch": 0.6995142262317835, "grad_norm": 0.20398057997226715, "learning_rate": 2.2139795983973654e-06, "loss": 0.0383, "step": 504 }, { "epoch": 0.7009021512838307, "grad_norm": 0.6562597155570984, "learning_rate": 2.1953475575385618e-06, "loss": 0.0437, "step": 505 }, { "epoch": 0.7022900763358778, "grad_norm": 0.2080502212047577, "learning_rate": 2.1767721697842244e-06, "loss": 0.0396, "step": 506 }, { "epoch": 0.703678001387925, "grad_norm": 0.23737893998622894, "learning_rate": 2.1582538103513896e-06, "loss": 0.0446, "step": 507 }, { "epoch": 0.7050659264399722, "grad_norm": 0.24636588990688324, "learning_rate": 2.139792853305141e-06, "loss": 0.0462, "step": 508 }, { "epoch": 0.7064538514920194, "grad_norm": 0.2152203917503357, "learning_rate": 2.121389671551054e-06, "loss": 0.0492, "step": 509 }, { "epoch": 0.7078417765440667, "grad_norm": 0.26464414596557617, "learning_rate": 2.1030446368276547e-06, "loss": 0.0534, "step": 510 }, { "epoch": 0.7092297015961138, "grad_norm": 0.27359458804130554, "learning_rate": 2.0847581196989277e-06, "loss": 0.0506, "step": 511 }, { "epoch": 0.710617626648161, "grad_norm": 0.20019769668579102, "learning_rate": 2.0665304895468114e-06, "loss": 0.0392, "step": 512 }, { "epoch": 0.7120055517002082, "grad_norm": 0.4770808815956116, "learning_rate": 2.04836211456375e-06, "loss": 0.0518, "step": 513 }, { "epoch": 0.7133934767522554, "grad_norm": 0.26947441697120667, "learning_rate": 2.030253361745251e-06, "loss": 0.0434, "step": 514 }, { "epoch": 0.7147814018043026, "grad_norm": 0.2415596842765808, "learning_rate": 2.012204596882472e-06, "loss": 0.0425, "step": 515 }, { "epoch": 0.7161693268563497, "grad_norm": 0.19842304289340973, "learning_rate": 1.9942161845548334e-06, "loss": 0.0361, "step": 516 }, { "epoch": 0.7175572519083969, "grad_norm": 0.27983754873275757, "learning_rate": 1.9762884881226535e-06, "loss": 0.0447, "step": 517 }, { "epoch": 0.7189451769604441, "grad_norm": 0.24323104321956635, "learning_rate": 1.958421869719807e-06, "loss": 0.0498, "step": 518 }, { "epoch": 0.7203331020124913, "grad_norm": 0.18736103177070618, "learning_rate": 1.9406166902464128e-06, "loss": 0.028, "step": 519 }, { "epoch": 0.7217210270645386, "grad_norm": 0.29117128252983093, "learning_rate": 1.922873309361542e-06, "loss": 0.0571, "step": 520 }, { "epoch": 0.7231089521165857, "grad_norm": 0.4879210591316223, "learning_rate": 1.9051920854759543e-06, "loss": 0.0387, "step": 521 }, { "epoch": 0.7244968771686329, "grad_norm": 0.18582020699977875, "learning_rate": 1.887573375744856e-06, "loss": 0.0302, "step": 522 }, { "epoch": 0.7258848022206801, "grad_norm": 0.1959291249513626, "learning_rate": 1.8700175360606882e-06, "loss": 0.0339, "step": 523 }, { "epoch": 0.7272727272727273, "grad_norm": 0.26895925402641296, "learning_rate": 1.8525249210459345e-06, "loss": 0.0473, "step": 524 }, { "epoch": 0.7286606523247745, "grad_norm": 0.30201104283332825, "learning_rate": 1.8350958840459665e-06, "loss": 0.0491, "step": 525 }, { "epoch": 0.7300485773768216, "grad_norm": 0.2387959212064743, "learning_rate": 1.8177307771218894e-06, "loss": 0.0358, "step": 526 }, { "epoch": 0.7314365024288688, "grad_norm": 0.21746529638767242, "learning_rate": 1.8004299510434493e-06, "loss": 0.0377, "step": 527 }, { "epoch": 0.732824427480916, "grad_norm": 0.23417864739894867, "learning_rate": 1.7831937552819345e-06, "loss": 0.0426, "step": 528 }, { "epoch": 0.7342123525329632, "grad_norm": 0.18516947329044342, "learning_rate": 1.766022538003122e-06, "loss": 0.0279, "step": 529 }, { "epoch": 0.7356002775850105, "grad_norm": 0.24591688811779022, "learning_rate": 1.7489166460602496e-06, "loss": 0.0451, "step": 530 }, { "epoch": 0.7369882026370576, "grad_norm": 0.19787262380123138, "learning_rate": 1.7318764249869934e-06, "loss": 0.0354, "step": 531 }, { "epoch": 0.7383761276891048, "grad_norm": 0.1984841376543045, "learning_rate": 1.7149022189905041e-06, "loss": 0.0411, "step": 532 }, { "epoch": 0.739764052741152, "grad_norm": 0.26569145917892456, "learning_rate": 1.697994370944452e-06, "loss": 0.0555, "step": 533 }, { "epoch": 0.7411519777931992, "grad_norm": 0.25187060236930847, "learning_rate": 1.6811532223820875e-06, "loss": 0.0423, "step": 534 }, { "epoch": 0.7425399028452464, "grad_norm": 0.21106332540512085, "learning_rate": 1.6643791134893644e-06, "loss": 0.0368, "step": 535 }, { "epoch": 0.7439278278972935, "grad_norm": 0.20370684564113617, "learning_rate": 1.6476723830980451e-06, "loss": 0.038, "step": 536 }, { "epoch": 0.7453157529493407, "grad_norm": 0.2407875657081604, "learning_rate": 1.631033368678872e-06, "loss": 0.0431, "step": 537 }, { "epoch": 0.7467036780013879, "grad_norm": 0.2957558333873749, "learning_rate": 1.6144624063347514e-06, "loss": 0.0308, "step": 538 }, { "epoch": 0.7480916030534351, "grad_norm": 0.19274376332759857, "learning_rate": 1.597959830793947e-06, "loss": 0.0298, "step": 539 }, { "epoch": 0.7494795281054824, "grad_norm": 0.2099481225013733, "learning_rate": 1.5815259754033407e-06, "loss": 0.0343, "step": 540 }, { "epoch": 0.7508674531575295, "grad_norm": 0.2350345253944397, "learning_rate": 1.5651611721216865e-06, "loss": 0.0387, "step": 541 }, { "epoch": 0.7522553782095767, "grad_norm": 0.23420676589012146, "learning_rate": 1.5488657515129001e-06, "loss": 0.0422, "step": 542 }, { "epoch": 0.7536433032616239, "grad_norm": 0.26220807433128357, "learning_rate": 1.5326400427394023e-06, "loss": 0.0444, "step": 543 }, { "epoch": 0.7550312283136711, "grad_norm": 0.19518734514713287, "learning_rate": 1.5164843735554408e-06, "loss": 0.0412, "step": 544 }, { "epoch": 0.7564191533657183, "grad_norm": 0.2225373089313507, "learning_rate": 1.5003990703004994e-06, "loss": 0.0357, "step": 545 }, { "epoch": 0.7578070784177654, "grad_norm": 0.16066500544548035, "learning_rate": 1.4843844578926863e-06, "loss": 0.0308, "step": 546 }, { "epoch": 0.7591950034698126, "grad_norm": 0.27240243554115295, "learning_rate": 1.4684408598221722e-06, "loss": 0.0535, "step": 547 }, { "epoch": 0.7605829285218598, "grad_norm": 0.27581334114074707, "learning_rate": 1.452568598144668e-06, "loss": 0.0387, "step": 548 }, { "epoch": 0.761970853573907, "grad_norm": 0.20905457437038422, "learning_rate": 1.4367679934749085e-06, "loss": 0.0388, "step": 549 }, { "epoch": 0.7633587786259542, "grad_norm": 0.20028021931648254, "learning_rate": 1.421039364980178e-06, "loss": 0.0328, "step": 550 }, { "epoch": 0.7647467036780013, "grad_norm": 0.24100172519683838, "learning_rate": 1.405383030373867e-06, "loss": 0.0371, "step": 551 }, { "epoch": 0.7661346287300486, "grad_norm": 0.233174666762352, "learning_rate": 1.3897993059090492e-06, "loss": 0.0402, "step": 552 }, { "epoch": 0.7675225537820958, "grad_norm": 0.2091444879770279, "learning_rate": 1.374288506372099e-06, "loss": 0.0442, "step": 553 }, { "epoch": 0.768910478834143, "grad_norm": 0.20137923955917358, "learning_rate": 1.3588509450763281e-06, "loss": 0.04, "step": 554 }, { "epoch": 0.7702984038861902, "grad_norm": 0.24194765090942383, "learning_rate": 1.3434869338556594e-06, "loss": 0.0366, "step": 555 }, { "epoch": 0.7716863289382373, "grad_norm": 0.24912679195404053, "learning_rate": 1.3281967830583264e-06, "loss": 0.0416, "step": 556 }, { "epoch": 0.7730742539902845, "grad_norm": 0.6993520259857178, "learning_rate": 1.3129808015406064e-06, "loss": 0.0495, "step": 557 }, { "epoch": 0.7744621790423317, "grad_norm": 0.2714241147041321, "learning_rate": 1.297839296660579e-06, "loss": 0.0429, "step": 558 }, { "epoch": 0.7758501040943789, "grad_norm": 0.26390236616134644, "learning_rate": 1.2827725742719205e-06, "loss": 0.0403, "step": 559 }, { "epoch": 0.7772380291464261, "grad_norm": 0.19193479418754578, "learning_rate": 1.267780938717722e-06, "loss": 0.0316, "step": 560 }, { "epoch": 0.7786259541984732, "grad_norm": 0.31760939955711365, "learning_rate": 1.252864692824346e-06, "loss": 0.0506, "step": 561 }, { "epoch": 0.7800138792505205, "grad_norm": 0.22748614847660065, "learning_rate": 1.2380241378953067e-06, "loss": 0.0465, "step": 562 }, { "epoch": 0.7814018043025677, "grad_norm": 0.17882980406284332, "learning_rate": 1.223259573705184e-06, "loss": 0.0356, "step": 563 }, { "epoch": 0.7827897293546149, "grad_norm": 0.20773129165172577, "learning_rate": 1.2085712984935693e-06, "loss": 0.0346, "step": 564 }, { "epoch": 0.7841776544066621, "grad_norm": 0.2179643213748932, "learning_rate": 1.1939596089590394e-06, "loss": 0.0395, "step": 565 }, { "epoch": 0.7855655794587092, "grad_norm": 0.19541744887828827, "learning_rate": 1.1794248002531644e-06, "loss": 0.0348, "step": 566 }, { "epoch": 0.7869535045107564, "grad_norm": 0.24888668954372406, "learning_rate": 1.1649671659745504e-06, "loss": 0.0481, "step": 567 }, { "epoch": 0.7883414295628036, "grad_norm": 0.20198383927345276, "learning_rate": 1.1505869981628953e-06, "loss": 0.0406, "step": 568 }, { "epoch": 0.7897293546148508, "grad_norm": 0.2493666559457779, "learning_rate": 1.1362845872931044e-06, "loss": 0.037, "step": 569 }, { "epoch": 0.7911172796668979, "grad_norm": 0.22709275782108307, "learning_rate": 1.1220602222694166e-06, "loss": 0.0426, "step": 570 }, { "epoch": 0.7925052047189451, "grad_norm": 0.25838497281074524, "learning_rate": 1.1079141904195662e-06, "loss": 0.0486, "step": 571 }, { "epoch": 0.7938931297709924, "grad_norm": 0.2124512940645218, "learning_rate": 1.0938467774889883e-06, "loss": 0.034, "step": 572 }, { "epoch": 0.7952810548230396, "grad_norm": 0.22879303991794586, "learning_rate": 1.0798582676350316e-06, "loss": 0.0444, "step": 573 }, { "epoch": 0.7966689798750868, "grad_norm": 0.19821453094482422, "learning_rate": 1.0659489434212323e-06, "loss": 0.0365, "step": 574 }, { "epoch": 0.7980569049271339, "grad_norm": 0.184886172413826, "learning_rate": 1.0521190858116042e-06, "loss": 0.031, "step": 575 }, { "epoch": 0.7994448299791811, "grad_norm": 0.22053441405296326, "learning_rate": 1.0383689741649516e-06, "loss": 0.0477, "step": 576 }, { "epoch": 0.8008327550312283, "grad_norm": 0.21095865964889526, "learning_rate": 1.0246988862292462e-06, "loss": 0.0315, "step": 577 }, { "epoch": 0.8022206800832755, "grad_norm": 0.21625402569770813, "learning_rate": 1.0111090981359961e-06, "loss": 0.044, "step": 578 }, { "epoch": 0.8036086051353227, "grad_norm": 0.2409215271472931, "learning_rate": 9.975998843946811e-07, "loss": 0.0504, "step": 579 }, { "epoch": 0.8049965301873698, "grad_norm": 0.18764494359493256, "learning_rate": 9.841715178872092e-07, "loss": 0.035, "step": 580 }, { "epoch": 0.806384455239417, "grad_norm": 0.25301676988601685, "learning_rate": 9.708242698623898e-07, "loss": 0.0476, "step": 581 }, { "epoch": 0.8077723802914643, "grad_norm": 0.23698049783706665, "learning_rate": 9.575584099304735e-07, "loss": 0.0334, "step": 582 }, { "epoch": 0.8091603053435115, "grad_norm": 0.2794058918952942, "learning_rate": 9.443742060576916e-07, "loss": 0.0515, "step": 583 }, { "epoch": 0.8105482303955587, "grad_norm": 0.31980571150779724, "learning_rate": 9.312719245608487e-07, "loss": 0.0617, "step": 584 }, { "epoch": 0.8119361554476058, "grad_norm": 0.21514901518821716, "learning_rate": 9.182518301019466e-07, "loss": 0.0445, "step": 585 }, { "epoch": 0.813324080499653, "grad_norm": 0.1985836774110794, "learning_rate": 9.053141856828274e-07, "loss": 0.036, "step": 586 }, { "epoch": 0.8147120055517002, "grad_norm": 0.22017304599285126, "learning_rate": 8.924592526398762e-07, "loss": 0.0412, "step": 587 }, { "epoch": 0.8160999306037474, "grad_norm": 0.18507330119609833, "learning_rate": 8.796872906387299e-07, "loss": 0.0374, "step": 588 }, { "epoch": 0.8174878556557946, "grad_norm": 0.16561415791511536, "learning_rate": 8.669985576690371e-07, "loss": 0.0286, "step": 589 }, { "epoch": 0.8188757807078417, "grad_norm": 0.2667452096939087, "learning_rate": 8.54393310039246e-07, "loss": 0.0429, "step": 590 }, { "epoch": 0.8202637057598889, "grad_norm": 0.28538012504577637, "learning_rate": 8.418718023714235e-07, "loss": 0.0354, "step": 591 }, { "epoch": 0.8216516308119362, "grad_norm": 0.25441908836364746, "learning_rate": 8.29434287596122e-07, "loss": 0.0485, "step": 592 }, { "epoch": 0.8230395558639834, "grad_norm": 0.24638378620147705, "learning_rate": 8.170810169472593e-07, "loss": 0.0403, "step": 593 }, { "epoch": 0.8244274809160306, "grad_norm": 0.19534482061862946, "learning_rate": 8.04812239957049e-07, "loss": 0.0365, "step": 594 }, { "epoch": 0.8258154059680777, "grad_norm": 0.17373257875442505, "learning_rate": 7.926282044509593e-07, "loss": 0.0271, "step": 595 }, { "epoch": 0.8272033310201249, "grad_norm": 0.2546538710594177, "learning_rate": 7.805291565427065e-07, "loss": 0.0451, "step": 596 }, { "epoch": 0.8285912560721721, "grad_norm": 0.19341765344142914, "learning_rate": 7.685153406292845e-07, "loss": 0.0365, "step": 597 }, { "epoch": 0.8299791811242193, "grad_norm": 0.20034624636173248, "learning_rate": 7.56586999386027e-07, "loss": 0.033, "step": 598 }, { "epoch": 0.8313671061762665, "grad_norm": 0.201642245054245, "learning_rate": 7.447443737617066e-07, "loss": 0.0263, "step": 599 }, { "epoch": 0.8327550312283136, "grad_norm": 0.2718977928161621, "learning_rate": 7.329877029736665e-07, "loss": 0.0461, "step": 600 }, { "epoch": 0.8341429562803608, "grad_norm": 0.21784666180610657, "learning_rate": 7.213172245029892e-07, "loss": 0.0336, "step": 601 }, { "epoch": 0.835530881332408, "grad_norm": 0.21657319366931915, "learning_rate": 7.097331740896995e-07, "loss": 0.0384, "step": 602 }, { "epoch": 0.8369188063844553, "grad_norm": 0.17798687517642975, "learning_rate": 6.98235785728002e-07, "loss": 0.0325, "step": 603 }, { "epoch": 0.8383067314365025, "grad_norm": 0.26031696796417236, "learning_rate": 6.868252916615553e-07, "loss": 0.0392, "step": 604 }, { "epoch": 0.8396946564885496, "grad_norm": 0.26929226517677307, "learning_rate": 6.755019223787807e-07, "loss": 0.0454, "step": 605 }, { "epoch": 0.8410825815405968, "grad_norm": 0.22964255511760712, "learning_rate": 6.642659066082046e-07, "loss": 0.0392, "step": 606 }, { "epoch": 0.842470506592644, "grad_norm": 0.2255278080701828, "learning_rate": 6.531174713138416e-07, "loss": 0.0466, "step": 607 }, { "epoch": 0.8438584316446912, "grad_norm": 0.25335291028022766, "learning_rate": 6.420568416906059e-07, "loss": 0.0422, "step": 608 }, { "epoch": 0.8452463566967384, "grad_norm": 0.22428208589553833, "learning_rate": 6.310842411597667e-07, "loss": 0.0426, "step": 609 }, { "epoch": 0.8466342817487855, "grad_norm": 0.20912961661815643, "learning_rate": 6.201998913644319e-07, "loss": 0.0356, "step": 610 }, { "epoch": 0.8480222068008327, "grad_norm": 0.27327394485473633, "learning_rate": 6.094040121650719e-07, "loss": 0.0462, "step": 611 }, { "epoch": 0.84941013185288, "grad_norm": 0.2542797029018402, "learning_rate": 5.986968216350786e-07, "loss": 0.0304, "step": 612 }, { "epoch": 0.8507980569049272, "grad_norm": 0.18343311548233032, "learning_rate": 5.880785360563596e-07, "loss": 0.0357, "step": 613 }, { "epoch": 0.8521859819569744, "grad_norm": 0.1442384272813797, "learning_rate": 5.775493699149754e-07, "loss": 0.0243, "step": 614 }, { "epoch": 0.8535739070090215, "grad_norm": 0.23244093358516693, "learning_rate": 5.671095358967926e-07, "loss": 0.0391, "step": 615 }, { "epoch": 0.8549618320610687, "grad_norm": 0.20659306645393372, "learning_rate": 5.56759244883206e-07, "loss": 0.0315, "step": 616 }, { "epoch": 0.8563497571131159, "grad_norm": 0.2342347949743271, "learning_rate": 5.464987059468629e-07, "loss": 0.0483, "step": 617 }, { "epoch": 0.8577376821651631, "grad_norm": 0.2304675132036209, "learning_rate": 5.36328126347449e-07, "loss": 0.047, "step": 618 }, { "epoch": 0.8591256072172103, "grad_norm": 0.24737447500228882, "learning_rate": 5.262477115275022e-07, "loss": 0.0407, "step": 619 }, { "epoch": 0.8605135322692574, "grad_norm": 0.22765293717384338, "learning_rate": 5.162576651082541e-07, "loss": 0.036, "step": 620 }, { "epoch": 0.8619014573213046, "grad_norm": 0.23714007437229156, "learning_rate": 5.063581888855285e-07, "loss": 0.0502, "step": 621 }, { "epoch": 0.8632893823733518, "grad_norm": 0.1933499425649643, "learning_rate": 4.965494828256573e-07, "loss": 0.0313, "step": 622 }, { "epoch": 0.864677307425399, "grad_norm": 0.3374840021133423, "learning_rate": 4.868317450614407e-07, "loss": 0.042, "step": 623 }, { "epoch": 0.8660652324774463, "grad_norm": 0.24019359052181244, "learning_rate": 4.772051718881532e-07, "loss": 0.0438, "step": 624 }, { "epoch": 0.8674531575294934, "grad_norm": 0.17875275015830994, "learning_rate": 4.676699577595667e-07, "loss": 0.0375, "step": 625 }, { "epoch": 0.8688410825815406, "grad_norm": 0.22650332748889923, "learning_rate": 4.582262952840355e-07, "loss": 0.0362, "step": 626 }, { "epoch": 0.8702290076335878, "grad_norm": 0.19926731288433075, "learning_rate": 4.4887437522059487e-07, "loss": 0.0314, "step": 627 }, { "epoch": 0.871616932685635, "grad_norm": 0.22209225594997406, "learning_rate": 4.3961438647511066e-07, "loss": 0.0325, "step": 628 }, { "epoch": 0.8730048577376822, "grad_norm": 0.23820732533931732, "learning_rate": 4.304465160964699e-07, "loss": 0.0392, "step": 629 }, { "epoch": 0.8743927827897293, "grad_norm": 0.1484634429216385, "learning_rate": 4.2137094927279296e-07, "loss": 0.0271, "step": 630 }, { "epoch": 0.8757807078417765, "grad_norm": 0.19037142395973206, "learning_rate": 4.1238786932769947e-07, "loss": 0.0316, "step": 631 }, { "epoch": 0.8771686328938237, "grad_norm": 0.25099167227745056, "learning_rate": 4.0349745771660233e-07, "loss": 0.039, "step": 632 }, { "epoch": 0.878556557945871, "grad_norm": 0.28284958004951477, "learning_rate": 3.946998940230401e-07, "loss": 0.0475, "step": 633 }, { "epoch": 0.8799444829979182, "grad_norm": 0.24289296567440033, "learning_rate": 3.859953559550589e-07, "loss": 0.0457, "step": 634 }, { "epoch": 0.8813324080499653, "grad_norm": 0.17973592877388, "learning_rate": 3.7738401934161006e-07, "loss": 0.0342, "step": 635 }, { "epoch": 0.8827203331020125, "grad_norm": 0.24480481445789337, "learning_rate": 3.6886605812900766e-07, "loss": 0.0502, "step": 636 }, { "epoch": 0.8841082581540597, "grad_norm": 0.2745607793331146, "learning_rate": 3.604416443774117e-07, "loss": 0.0503, "step": 637 }, { "epoch": 0.8854961832061069, "grad_norm": 0.2708338499069214, "learning_rate": 3.5211094825735147e-07, "loss": 0.0434, "step": 638 }, { "epoch": 0.8868841082581541, "grad_norm": 0.2643817663192749, "learning_rate": 3.4387413804628955e-07, "loss": 0.0442, "step": 639 }, { "epoch": 0.8882720333102012, "grad_norm": 0.17992961406707764, "learning_rate": 3.357313801252238e-07, "loss": 0.0319, "step": 640 }, { "epoch": 0.8896599583622484, "grad_norm": 0.19240032136440277, "learning_rate": 3.276828389753234e-07, "loss": 0.0362, "step": 641 }, { "epoch": 0.8910478834142956, "grad_norm": 0.2068304866552353, "learning_rate": 3.197286771746094e-07, "loss": 0.0326, "step": 642 }, { "epoch": 0.8924358084663429, "grad_norm": 0.3868578374385834, "learning_rate": 3.118690553946685e-07, "loss": 0.0312, "step": 643 }, { "epoch": 0.8938237335183901, "grad_norm": 0.1948038786649704, "learning_rate": 3.041041323974098e-07, "loss": 0.0374, "step": 644 }, { "epoch": 0.8952116585704372, "grad_norm": 0.3001810908317566, "learning_rate": 2.964340650318548e-07, "loss": 0.0548, "step": 645 }, { "epoch": 0.8965995836224844, "grad_norm": 0.20546355843544006, "learning_rate": 2.8885900823097223e-07, "loss": 0.0412, "step": 646 }, { "epoch": 0.8979875086745316, "grad_norm": 0.19419418275356293, "learning_rate": 2.813791150085454e-07, "loss": 0.0401, "step": 647 }, { "epoch": 0.8993754337265788, "grad_norm": 0.1783968210220337, "learning_rate": 2.73994536456087e-07, "loss": 0.0308, "step": 648 }, { "epoch": 0.9007633587786259, "grad_norm": 0.22798137366771698, "learning_rate": 2.6670542173977745e-07, "loss": 0.0422, "step": 649 }, { "epoch": 0.9021512838306731, "grad_norm": 0.28471091389656067, "learning_rate": 2.5951191809746146e-07, "loss": 0.0583, "step": 650 }, { "epoch": 0.9035392088827203, "grad_norm": 0.239897683262825, "learning_rate": 2.524141708356681e-07, "loss": 0.0455, "step": 651 }, { "epoch": 0.9049271339347675, "grad_norm": 0.21840114891529083, "learning_rate": 2.454123233266781e-07, "loss": 0.0396, "step": 652 }, { "epoch": 0.9063150589868147, "grad_norm": 0.23024079203605652, "learning_rate": 2.385065170056283e-07, "loss": 0.0395, "step": 653 }, { "epoch": 0.9077029840388618, "grad_norm": 0.23643670976161957, "learning_rate": 2.3169689136765038e-07, "loss": 0.0447, "step": 654 }, { "epoch": 0.9090909090909091, "grad_norm": 0.24520286917686462, "learning_rate": 2.249835839650588e-07, "loss": 0.0476, "step": 655 }, { "epoch": 0.9104788341429563, "grad_norm": 0.21127338707447052, "learning_rate": 2.1836673040456947e-07, "loss": 0.0388, "step": 656 }, { "epoch": 0.9118667591950035, "grad_norm": 0.1968703418970108, "learning_rate": 2.1184646434455947e-07, "loss": 0.0328, "step": 657 }, { "epoch": 0.9132546842470507, "grad_norm": 0.2115575224161148, "learning_rate": 2.0542291749237053e-07, "loss": 0.0352, "step": 658 }, { "epoch": 0.9146426092990978, "grad_norm": 0.15520645678043365, "learning_rate": 1.9909621960164382e-07, "loss": 0.0296, "step": 659 }, { "epoch": 0.916030534351145, "grad_norm": 0.6320406198501587, "learning_rate": 1.9286649846970318e-07, "loss": 0.0363, "step": 660 }, { "epoch": 0.9174184594031922, "grad_norm": 0.2276618331670761, "learning_rate": 1.8673387993497383e-07, "loss": 0.0359, "step": 661 }, { "epoch": 0.9188063844552394, "grad_norm": 0.2655383050441742, "learning_rate": 1.8069848787443556e-07, "loss": 0.0506, "step": 662 }, { "epoch": 0.9201943095072866, "grad_norm": 0.22439704835414886, "learning_rate": 1.7476044420112637e-07, "loss": 0.0292, "step": 663 }, { "epoch": 0.9215822345593337, "grad_norm": 0.18324856460094452, "learning_rate": 1.689198688616761e-07, "loss": 0.0285, "step": 664 }, { "epoch": 0.922970159611381, "grad_norm": 0.2790418267250061, "learning_rate": 1.631768798338834e-07, "loss": 0.0417, "step": 665 }, { "epoch": 0.9243580846634282, "grad_norm": 0.22626622021198273, "learning_rate": 1.5753159312433762e-07, "loss": 0.0432, "step": 666 }, { "epoch": 0.9257460097154754, "grad_norm": 0.22337311506271362, "learning_rate": 1.5198412276606622e-07, "loss": 0.0379, "step": 667 }, { "epoch": 0.9271339347675226, "grad_norm": 0.1884629875421524, "learning_rate": 1.465345808162427e-07, "loss": 0.0315, "step": 668 }, { "epoch": 0.9285218598195697, "grad_norm": 0.18185679614543915, "learning_rate": 1.4118307735391412e-07, "loss": 0.0322, "step": 669 }, { "epoch": 0.9299097848716169, "grad_norm": 0.20479874312877655, "learning_rate": 1.3592972047777874e-07, "loss": 0.0303, "step": 670 }, { "epoch": 0.9312977099236641, "grad_norm": 0.3006023168563843, "learning_rate": 1.3077461630400967e-07, "loss": 0.055, "step": 671 }, { "epoch": 0.9326856349757113, "grad_norm": 0.27527281641960144, "learning_rate": 1.2571786896410144e-07, "loss": 0.0491, "step": 672 }, { "epoch": 0.9340735600277585, "grad_norm": 0.23598523437976837, "learning_rate": 1.2075958060277394e-07, "loss": 0.0412, "step": 673 }, { "epoch": 0.9354614850798056, "grad_norm": 0.21559569239616394, "learning_rate": 1.158998513759052e-07, "loss": 0.04, "step": 674 }, { "epoch": 0.9368494101318529, "grad_norm": 0.18777711689472198, "learning_rate": 1.1113877944850804e-07, "loss": 0.0287, "step": 675 }, { "epoch": 0.9382373351839001, "grad_norm": 0.22608903050422668, "learning_rate": 1.0647646099275267e-07, "loss": 0.0501, "step": 676 }, { "epoch": 0.9396252602359473, "grad_norm": 0.23553456366062164, "learning_rate": 1.0191299018601608e-07, "loss": 0.0369, "step": 677 }, { "epoch": 0.9410131852879945, "grad_norm": 0.17608265578746796, "learning_rate": 9.744845920898527e-08, "loss": 0.0286, "step": 678 }, { "epoch": 0.9424011103400416, "grad_norm": 0.26702162623405457, "learning_rate": 9.308295824379365e-08, "loss": 0.0421, "step": 679 }, { "epoch": 0.9437890353920888, "grad_norm": 0.2531726360321045, "learning_rate": 8.881657547219869e-08, "loss": 0.0409, "step": 680 }, { "epoch": 0.945176960444136, "grad_norm": 0.18312399089336395, "learning_rate": 8.46493970738016e-08, "loss": 0.0284, "step": 681 }, { "epoch": 0.9465648854961832, "grad_norm": 0.20322586596012115, "learning_rate": 8.058150722430658e-08, "loss": 0.0353, "step": 682 }, { "epoch": 0.9479528105482304, "grad_norm": 0.23744148015975952, "learning_rate": 7.661298809381878e-08, "loss": 0.0381, "step": 683 }, { "epoch": 0.9493407356002775, "grad_norm": 0.17557460069656372, "learning_rate": 7.274391984518736e-08, "loss": 0.0311, "step": 684 }, { "epoch": 0.9507286606523248, "grad_norm": 0.2498820275068283, "learning_rate": 6.897438063238393e-08, "loss": 0.0446, "step": 685 }, { "epoch": 0.952116585704372, "grad_norm": 0.2372819185256958, "learning_rate": 6.530444659892443e-08, "loss": 0.0427, "step": 686 }, { "epoch": 0.9535045107564192, "grad_norm": 0.18727374076843262, "learning_rate": 6.173419187633201e-08, "loss": 0.0341, "step": 687 }, { "epoch": 0.9548924358084664, "grad_norm": 0.24864086508750916, "learning_rate": 5.82636885826382e-08, "loss": 0.045, "step": 688 }, { "epoch": 0.9562803608605135, "grad_norm": 0.2669983506202698, "learning_rate": 5.4893006820926355e-08, "loss": 0.0377, "step": 689 }, { "epoch": 0.9576682859125607, "grad_norm": 0.36765196919441223, "learning_rate": 5.162221467791772e-08, "loss": 0.0432, "step": 690 }, { "epoch": 0.9590562109646079, "grad_norm": 0.18304842710494995, "learning_rate": 4.8451378222592605e-08, "loss": 0.0356, "step": 691 }, { "epoch": 0.9604441360166551, "grad_norm": 0.24612656235694885, "learning_rate": 4.5380561504858586e-08, "loss": 0.0385, "step": 692 }, { "epoch": 0.9618320610687023, "grad_norm": 0.2270457148551941, "learning_rate": 4.240982655425552e-08, "loss": 0.0505, "step": 693 }, { "epoch": 0.9632199861207494, "grad_norm": 0.1908656507730484, "learning_rate": 3.953923337870147e-08, "loss": 0.0359, "step": 694 }, { "epoch": 0.9646079111727967, "grad_norm": 0.1745605766773224, "learning_rate": 3.6768839963285395e-08, "loss": 0.0304, "step": 695 }, { "epoch": 0.9659958362248439, "grad_norm": 0.23264414072036743, "learning_rate": 3.409870226908863e-08, "loss": 0.0437, "step": 696 }, { "epoch": 0.9673837612768911, "grad_norm": 0.2552529573440552, "learning_rate": 3.1528874232059635e-08, "loss": 0.0507, "step": 697 }, { "epoch": 0.9687716863289383, "grad_norm": 0.22192077338695526, "learning_rate": 2.905940776192384e-08, "loss": 0.0445, "step": 698 }, { "epoch": 0.9701596113809854, "grad_norm": 0.23040442168712616, "learning_rate": 2.669035274113274e-08, "loss": 0.0415, "step": 699 }, { "epoch": 0.9715475364330326, "grad_norm": 0.21938888728618622, "learning_rate": 2.4421757023859737e-08, "loss": 0.0369, "step": 700 }, { "epoch": 0.9729354614850798, "grad_norm": 0.19337037205696106, "learning_rate": 2.2253666435029797e-08, "loss": 0.0311, "step": 701 }, { "epoch": 0.974323386537127, "grad_norm": 0.20000146329402924, "learning_rate": 2.0186124769396855e-08, "loss": 0.0371, "step": 702 }, { "epoch": 0.9757113115891742, "grad_norm": 0.22154293954372406, "learning_rate": 1.8219173790658406e-08, "loss": 0.0336, "step": 703 }, { "epoch": 0.9770992366412213, "grad_norm": 0.234355166554451, "learning_rate": 1.6352853230609534e-08, "loss": 0.0438, "step": 704 }, { "epoch": 0.9784871616932685, "grad_norm": 0.20534580945968628, "learning_rate": 1.4587200788343524e-08, "loss": 0.0301, "step": 705 }, { "epoch": 0.9798750867453158, "grad_norm": 0.22104544937610626, "learning_rate": 1.2922252129489165e-08, "loss": 0.0442, "step": 706 }, { "epoch": 0.981263011797363, "grad_norm": 0.23223792016506195, "learning_rate": 1.1358040885490196e-08, "loss": 0.0369, "step": 707 }, { "epoch": 0.9826509368494102, "grad_norm": 0.1923191100358963, "learning_rate": 9.894598652925857e-09, "loss": 0.0356, "step": 708 }, { "epoch": 0.9840388619014573, "grad_norm": 0.2507604956626892, "learning_rate": 8.53195499287196e-09, "loss": 0.0472, "step": 709 }, { "epoch": 0.9854267869535045, "grad_norm": 0.2132437825202942, "learning_rate": 7.2701374303063565e-09, "loss": 0.0381, "step": 710 }, { "epoch": 0.9868147120055517, "grad_norm": 0.16438357532024384, "learning_rate": 6.109171453549944e-09, "loss": 0.028, "step": 711 }, { "epoch": 0.9882026370575989, "grad_norm": 0.24717937409877777, "learning_rate": 5.049080513752636e-09, "loss": 0.0439, "step": 712 }, { "epoch": 0.9895905621096461, "grad_norm": 0.2316623479127884, "learning_rate": 4.089886024421508e-09, "loss": 0.0378, "step": 713 }, { "epoch": 0.9909784871616932, "grad_norm": 0.2512715756893158, "learning_rate": 3.2316073609856e-09, "loss": 0.0378, "step": 714 }, { "epoch": 0.9923664122137404, "grad_norm": 0.27887260913848877, "learning_rate": 2.474261860406779e-09, "loss": 0.0537, "step": 715 }, { "epoch": 0.9937543372657877, "grad_norm": 0.19671159982681274, "learning_rate": 1.817864820827242e-09, "loss": 0.0277, "step": 716 }, { "epoch": 0.9951422623178349, "grad_norm": 0.22438108921051025, "learning_rate": 1.2624295012625409e-09, "loss": 0.0389, "step": 717 }, { "epoch": 0.9965301873698821, "grad_norm": 0.17432349920272827, "learning_rate": 8.079671213334639e-10, "loss": 0.0333, "step": 718 }, { "epoch": 0.9979181124219292, "grad_norm": 0.24869811534881592, "learning_rate": 4.5448686103732876e-10, "loss": 0.0411, "step": 719 }, { "epoch": 0.9993060374739764, "grad_norm": 0.24824249744415283, "learning_rate": 2.0199586056590669e-10, "loss": 0.0419, "step": 720 }, { "epoch": 1.0, "grad_norm": 0.2819853723049164, "learning_rate": 5.049922015887276e-11, "loss": 0.0334, "step": 721 }, { "epoch": 1.0, "step": 721, "total_flos": 140605265387520.0, "train_loss": 0.04385479942129488, "train_runtime": 20865.6154, "train_samples_per_second": 0.829, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 721, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 140605265387520.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }