{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 9.114428520202637, "learning_rate": 0.0, "loss": 1.562, "step": 1 }, { "epoch": 0.0064, "grad_norm": 8.773235321044922, "learning_rate": 2.0000000000000003e-06, "loss": 1.5296, "step": 2 }, { "epoch": 0.0096, "grad_norm": 9.807992935180664, "learning_rate": 4.000000000000001e-06, "loss": 1.591, "step": 3 }, { "epoch": 0.0128, "grad_norm": 8.871468544006348, "learning_rate": 6e-06, "loss": 1.6128, "step": 4 }, { "epoch": 0.016, "grad_norm": 6.351559638977051, "learning_rate": 8.000000000000001e-06, "loss": 1.3759, "step": 5 }, { "epoch": 0.0192, "grad_norm": 4.211467742919922, "learning_rate": 1e-05, "loss": 1.3595, "step": 6 }, { "epoch": 0.0224, "grad_norm": 2.549668550491333, "learning_rate": 9.967532467532468e-06, "loss": 1.2925, "step": 7 }, { "epoch": 0.0256, "grad_norm": 2.402937650680542, "learning_rate": 9.935064935064936e-06, "loss": 1.3283, "step": 8 }, { "epoch": 0.0288, "grad_norm": 2.199153184890747, "learning_rate": 9.902597402597403e-06, "loss": 1.1662, "step": 9 }, { "epoch": 0.032, "grad_norm": 1.8047702312469482, "learning_rate": 9.87012987012987e-06, "loss": 1.0762, "step": 10 }, { "epoch": 0.0352, "grad_norm": 2.2670371532440186, "learning_rate": 9.837662337662337e-06, "loss": 1.2495, "step": 11 }, { "epoch": 0.0384, "grad_norm": 1.9776052236557007, "learning_rate": 9.805194805194806e-06, "loss": 1.1189, "step": 12 }, { "epoch": 0.0416, "grad_norm": 1.8554439544677734, "learning_rate": 9.772727272727273e-06, "loss": 1.08, "step": 13 }, { "epoch": 0.0448, "grad_norm": 1.751678466796875, "learning_rate": 9.740259740259742e-06, "loss": 1.0997, "step": 14 }, { "epoch": 0.048, "grad_norm": 2.046874761581421, "learning_rate": 9.707792207792209e-06, "loss": 1.1096, "step": 15 }, { "epoch": 0.0512, "grad_norm": 1.7334591150283813, "learning_rate": 9.675324675324677e-06, "loss": 0.9895, "step": 16 }, { "epoch": 0.0544, "grad_norm": 1.6706339120864868, "learning_rate": 9.642857142857144e-06, "loss": 1.0069, "step": 17 }, { "epoch": 0.0576, "grad_norm": 1.762417197227478, "learning_rate": 9.610389610389611e-06, "loss": 1.0206, "step": 18 }, { "epoch": 0.0608, "grad_norm": 1.5199729204177856, "learning_rate": 9.577922077922078e-06, "loss": 0.9358, "step": 19 }, { "epoch": 0.064, "grad_norm": 1.4757474660873413, "learning_rate": 9.545454545454547e-06, "loss": 0.9717, "step": 20 }, { "epoch": 0.0672, "grad_norm": 1.7378476858139038, "learning_rate": 9.512987012987014e-06, "loss": 1.0139, "step": 21 }, { "epoch": 0.0704, "grad_norm": 1.7422256469726562, "learning_rate": 9.48051948051948e-06, "loss": 1.0029, "step": 22 }, { "epoch": 0.0736, "grad_norm": 1.4580937623977661, "learning_rate": 9.448051948051948e-06, "loss": 0.9449, "step": 23 }, { "epoch": 0.0768, "grad_norm": 1.8162392377853394, "learning_rate": 9.415584415584416e-06, "loss": 0.9479, "step": 24 }, { "epoch": 0.08, "grad_norm": 1.6786167621612549, "learning_rate": 9.383116883116883e-06, "loss": 0.9168, "step": 25 }, { "epoch": 0.0832, "grad_norm": 1.6172140836715698, "learning_rate": 9.350649350649352e-06, "loss": 0.936, "step": 26 }, { "epoch": 0.0864, "grad_norm": 1.503225326538086, "learning_rate": 9.318181818181819e-06, "loss": 0.8382, "step": 27 }, { "epoch": 0.0896, "grad_norm": 1.5848450660705566, "learning_rate": 9.285714285714288e-06, "loss": 0.8804, "step": 28 }, { "epoch": 0.0928, "grad_norm": 1.7622848749160767, "learning_rate": 9.253246753246755e-06, "loss": 0.8474, "step": 29 }, { "epoch": 0.096, "grad_norm": 1.6067888736724854, "learning_rate": 9.220779220779221e-06, "loss": 0.8316, "step": 30 }, { "epoch": 0.0992, "grad_norm": 1.6786375045776367, "learning_rate": 9.188311688311688e-06, "loss": 0.9484, "step": 31 }, { "epoch": 0.1024, "grad_norm": 2.331812858581543, "learning_rate": 9.155844155844157e-06, "loss": 0.9174, "step": 32 }, { "epoch": 0.1056, "grad_norm": 1.654259204864502, "learning_rate": 9.123376623376624e-06, "loss": 0.8153, "step": 33 }, { "epoch": 0.1088, "grad_norm": 2.215853452682495, "learning_rate": 9.090909090909091e-06, "loss": 0.9082, "step": 34 }, { "epoch": 0.112, "grad_norm": 1.8472061157226562, "learning_rate": 9.05844155844156e-06, "loss": 0.8669, "step": 35 }, { "epoch": 0.1152, "grad_norm": 1.5219857692718506, "learning_rate": 9.025974025974027e-06, "loss": 0.8297, "step": 36 }, { "epoch": 0.1184, "grad_norm": 1.587709665298462, "learning_rate": 8.993506493506494e-06, "loss": 0.7774, "step": 37 }, { "epoch": 0.1216, "grad_norm": 1.3187901973724365, "learning_rate": 8.96103896103896e-06, "loss": 0.8551, "step": 38 }, { "epoch": 0.1248, "grad_norm": 1.3549308776855469, "learning_rate": 8.92857142857143e-06, "loss": 0.8184, "step": 39 }, { "epoch": 0.128, "grad_norm": 1.3523176908493042, "learning_rate": 8.896103896103896e-06, "loss": 0.7871, "step": 40 }, { "epoch": 0.1312, "grad_norm": 1.5008044242858887, "learning_rate": 8.863636363636365e-06, "loss": 0.8173, "step": 41 }, { "epoch": 0.1344, "grad_norm": 1.548140048980713, "learning_rate": 8.831168831168832e-06, "loss": 0.7689, "step": 42 }, { "epoch": 0.1376, "grad_norm": 1.4833955764770508, "learning_rate": 8.7987012987013e-06, "loss": 0.774, "step": 43 }, { "epoch": 0.1408, "grad_norm": 1.3166873455047607, "learning_rate": 8.766233766233767e-06, "loss": 0.8248, "step": 44 }, { "epoch": 0.144, "grad_norm": 1.6397587060928345, "learning_rate": 8.733766233766234e-06, "loss": 0.8606, "step": 45 }, { "epoch": 0.1472, "grad_norm": 1.293379306793213, "learning_rate": 8.701298701298701e-06, "loss": 0.8185, "step": 46 }, { "epoch": 0.1504, "grad_norm": 1.3891499042510986, "learning_rate": 8.66883116883117e-06, "loss": 0.7558, "step": 47 }, { "epoch": 0.1536, "grad_norm": 1.4359731674194336, "learning_rate": 8.636363636363637e-06, "loss": 0.7746, "step": 48 }, { "epoch": 0.1568, "grad_norm": 1.6567460298538208, "learning_rate": 8.603896103896104e-06, "loss": 0.8247, "step": 49 }, { "epoch": 0.16, "grad_norm": 1.2832330465316772, "learning_rate": 8.571428571428571e-06, "loss": 0.7534, "step": 50 }, { "epoch": 0.1632, "grad_norm": 1.4563064575195312, "learning_rate": 8.53896103896104e-06, "loss": 0.7768, "step": 51 }, { "epoch": 0.1664, "grad_norm": 1.4121637344360352, "learning_rate": 8.506493506493507e-06, "loss": 0.7595, "step": 52 }, { "epoch": 0.1696, "grad_norm": 1.4510411024093628, "learning_rate": 8.474025974025975e-06, "loss": 0.8273, "step": 53 }, { "epoch": 0.1728, "grad_norm": 1.5629265308380127, "learning_rate": 8.441558441558442e-06, "loss": 0.757, "step": 54 }, { "epoch": 0.176, "grad_norm": 1.4735215902328491, "learning_rate": 8.40909090909091e-06, "loss": 0.7466, "step": 55 }, { "epoch": 0.1792, "grad_norm": 1.4382805824279785, "learning_rate": 8.376623376623378e-06, "loss": 0.7762, "step": 56 }, { "epoch": 0.1824, "grad_norm": 1.5057328939437866, "learning_rate": 8.344155844155845e-06, "loss": 0.8177, "step": 57 }, { "epoch": 0.1856, "grad_norm": 1.3751904964447021, "learning_rate": 8.311688311688313e-06, "loss": 0.8083, "step": 58 }, { "epoch": 0.1888, "grad_norm": 1.4311106204986572, "learning_rate": 8.27922077922078e-06, "loss": 0.7575, "step": 59 }, { "epoch": 0.192, "grad_norm": 1.5389622449874878, "learning_rate": 8.246753246753247e-06, "loss": 0.7711, "step": 60 }, { "epoch": 0.1952, "grad_norm": 1.5474581718444824, "learning_rate": 8.214285714285714e-06, "loss": 0.7572, "step": 61 }, { "epoch": 0.1984, "grad_norm": 1.5178961753845215, "learning_rate": 8.181818181818183e-06, "loss": 0.7849, "step": 62 }, { "epoch": 0.2016, "grad_norm": 1.3560807704925537, "learning_rate": 8.14935064935065e-06, "loss": 0.7433, "step": 63 }, { "epoch": 0.2048, "grad_norm": 1.6572718620300293, "learning_rate": 8.116883116883117e-06, "loss": 0.7343, "step": 64 }, { "epoch": 0.208, "grad_norm": 1.4690048694610596, "learning_rate": 8.084415584415586e-06, "loss": 0.6982, "step": 65 }, { "epoch": 0.2112, "grad_norm": 1.6963770389556885, "learning_rate": 8.051948051948052e-06, "loss": 0.7367, "step": 66 }, { "epoch": 0.2144, "grad_norm": 1.3336354494094849, "learning_rate": 8.019480519480521e-06, "loss": 0.7031, "step": 67 }, { "epoch": 0.2176, "grad_norm": 1.6367865800857544, "learning_rate": 7.987012987012988e-06, "loss": 0.7058, "step": 68 }, { "epoch": 0.2208, "grad_norm": 1.3497200012207031, "learning_rate": 7.954545454545455e-06, "loss": 0.6457, "step": 69 }, { "epoch": 0.224, "grad_norm": 1.8194010257720947, "learning_rate": 7.922077922077924e-06, "loss": 0.7454, "step": 70 }, { "epoch": 0.2272, "grad_norm": 1.7833340167999268, "learning_rate": 7.88961038961039e-06, "loss": 0.766, "step": 71 }, { "epoch": 0.2304, "grad_norm": 1.6490355730056763, "learning_rate": 7.857142857142858e-06, "loss": 0.7469, "step": 72 }, { "epoch": 0.2336, "grad_norm": 1.6471493244171143, "learning_rate": 7.824675324675325e-06, "loss": 0.7579, "step": 73 }, { "epoch": 0.2368, "grad_norm": 1.3370977640151978, "learning_rate": 7.792207792207793e-06, "loss": 0.6612, "step": 74 }, { "epoch": 0.24, "grad_norm": 1.584168791770935, "learning_rate": 7.75974025974026e-06, "loss": 0.6613, "step": 75 }, { "epoch": 0.2432, "grad_norm": 1.6082500219345093, "learning_rate": 7.727272727272727e-06, "loss": 0.6579, "step": 76 }, { "epoch": 0.2464, "grad_norm": 1.5998899936676025, "learning_rate": 7.694805194805194e-06, "loss": 0.7474, "step": 77 }, { "epoch": 0.2496, "grad_norm": 1.447123408317566, "learning_rate": 7.662337662337663e-06, "loss": 0.6495, "step": 78 }, { "epoch": 0.2528, "grad_norm": 1.5577723979949951, "learning_rate": 7.62987012987013e-06, "loss": 0.7509, "step": 79 }, { "epoch": 0.256, "grad_norm": 1.7872244119644165, "learning_rate": 7.597402597402598e-06, "loss": 0.6809, "step": 80 }, { "epoch": 0.2592, "grad_norm": 1.8440085649490356, "learning_rate": 7.564935064935065e-06, "loss": 0.7054, "step": 81 }, { "epoch": 0.2624, "grad_norm": 1.4694713354110718, "learning_rate": 7.532467532467533e-06, "loss": 0.71, "step": 82 }, { "epoch": 0.2656, "grad_norm": 1.8418391942977905, "learning_rate": 7.500000000000001e-06, "loss": 0.7606, "step": 83 }, { "epoch": 0.2688, "grad_norm": 1.6990721225738525, "learning_rate": 7.467532467532468e-06, "loss": 0.7687, "step": 84 }, { "epoch": 0.272, "grad_norm": 1.6245852708816528, "learning_rate": 7.435064935064936e-06, "loss": 0.7185, "step": 85 }, { "epoch": 0.2752, "grad_norm": 1.608574628829956, "learning_rate": 7.402597402597404e-06, "loss": 0.605, "step": 86 }, { "epoch": 0.2784, "grad_norm": 1.759828805923462, "learning_rate": 7.370129870129871e-06, "loss": 0.6541, "step": 87 }, { "epoch": 0.2816, "grad_norm": 1.7893486022949219, "learning_rate": 7.3376623376623375e-06, "loss": 0.6413, "step": 88 }, { "epoch": 0.2848, "grad_norm": 1.948918104171753, "learning_rate": 7.305194805194806e-06, "loss": 0.7028, "step": 89 }, { "epoch": 0.288, "grad_norm": 1.7952879667282104, "learning_rate": 7.272727272727273e-06, "loss": 0.6337, "step": 90 }, { "epoch": 0.2912, "grad_norm": 1.7043386697769165, "learning_rate": 7.240259740259741e-06, "loss": 0.6916, "step": 91 }, { "epoch": 0.2944, "grad_norm": 1.7043230533599854, "learning_rate": 7.207792207792208e-06, "loss": 0.6523, "step": 92 }, { "epoch": 0.2976, "grad_norm": 1.8149183988571167, "learning_rate": 7.175324675324677e-06, "loss": 0.7268, "step": 93 }, { "epoch": 0.3008, "grad_norm": 1.624100923538208, "learning_rate": 7.1428571428571436e-06, "loss": 0.6268, "step": 94 }, { "epoch": 0.304, "grad_norm": 1.7294209003448486, "learning_rate": 7.1103896103896105e-06, "loss": 0.5939, "step": 95 }, { "epoch": 0.3072, "grad_norm": 1.7482402324676514, "learning_rate": 7.077922077922078e-06, "loss": 0.5782, "step": 96 }, { "epoch": 0.3104, "grad_norm": 1.9230289459228516, "learning_rate": 7.045454545454546e-06, "loss": 0.6721, "step": 97 }, { "epoch": 0.3136, "grad_norm": 1.613495111465454, "learning_rate": 7.012987012987014e-06, "loss": 0.69, "step": 98 }, { "epoch": 0.3168, "grad_norm": 1.7930631637573242, "learning_rate": 6.980519480519481e-06, "loss": 0.6764, "step": 99 }, { "epoch": 0.32, "grad_norm": 1.6818891763687134, "learning_rate": 6.948051948051948e-06, "loss": 0.6481, "step": 100 }, { "epoch": 0.3232, "grad_norm": 1.7405375242233276, "learning_rate": 6.9155844155844165e-06, "loss": 0.6156, "step": 101 }, { "epoch": 0.3264, "grad_norm": 1.9247913360595703, "learning_rate": 6.8831168831168835e-06, "loss": 0.6262, "step": 102 }, { "epoch": 0.3296, "grad_norm": 1.7777565717697144, "learning_rate": 6.850649350649351e-06, "loss": 0.6276, "step": 103 }, { "epoch": 0.3328, "grad_norm": 1.8507918119430542, "learning_rate": 6.818181818181818e-06, "loss": 0.6644, "step": 104 }, { "epoch": 0.336, "grad_norm": 1.6047053337097168, "learning_rate": 6.785714285714287e-06, "loss": 0.6214, "step": 105 }, { "epoch": 0.3392, "grad_norm": 1.9592560529708862, "learning_rate": 6.753246753246754e-06, "loss": 0.5962, "step": 106 }, { "epoch": 0.3424, "grad_norm": 2.0305397510528564, "learning_rate": 6.720779220779221e-06, "loss": 0.6886, "step": 107 }, { "epoch": 0.3456, "grad_norm": 1.6223368644714355, "learning_rate": 6.688311688311689e-06, "loss": 0.5772, "step": 108 }, { "epoch": 0.3488, "grad_norm": 1.9203166961669922, "learning_rate": 6.6558441558441565e-06, "loss": 0.6469, "step": 109 }, { "epoch": 0.352, "grad_norm": 1.8619487285614014, "learning_rate": 6.623376623376624e-06, "loss": 0.5696, "step": 110 }, { "epoch": 0.3552, "grad_norm": 2.2053561210632324, "learning_rate": 6.590909090909091e-06, "loss": 0.6634, "step": 111 }, { "epoch": 0.3584, "grad_norm": 1.883378505706787, "learning_rate": 6.55844155844156e-06, "loss": 0.5707, "step": 112 }, { "epoch": 0.3616, "grad_norm": 2.2268807888031006, "learning_rate": 6.525974025974027e-06, "loss": 0.6507, "step": 113 }, { "epoch": 0.3648, "grad_norm": 2.0959272384643555, "learning_rate": 6.493506493506494e-06, "loss": 0.5803, "step": 114 }, { "epoch": 0.368, "grad_norm": 2.075263023376465, "learning_rate": 6.461038961038961e-06, "loss": 0.6287, "step": 115 }, { "epoch": 0.3712, "grad_norm": 2.277855157852173, "learning_rate": 6.4285714285714295e-06, "loss": 0.5876, "step": 116 }, { "epoch": 0.3744, "grad_norm": 1.7043213844299316, "learning_rate": 6.3961038961038964e-06, "loss": 0.5566, "step": 117 }, { "epoch": 0.3776, "grad_norm": 1.9531790018081665, "learning_rate": 6.363636363636364e-06, "loss": 0.6064, "step": 118 }, { "epoch": 0.3808, "grad_norm": 2.610369920730591, "learning_rate": 6.331168831168831e-06, "loss": 0.6368, "step": 119 }, { "epoch": 0.384, "grad_norm": 1.954626441001892, "learning_rate": 6.2987012987013e-06, "loss": 0.5574, "step": 120 }, { "epoch": 0.3872, "grad_norm": 1.761479139328003, "learning_rate": 6.266233766233767e-06, "loss": 0.5741, "step": 121 }, { "epoch": 0.3904, "grad_norm": 1.8345882892608643, "learning_rate": 6.233766233766234e-06, "loss": 0.5458, "step": 122 }, { "epoch": 0.3936, "grad_norm": 1.9553594589233398, "learning_rate": 6.201298701298702e-06, "loss": 0.5591, "step": 123 }, { "epoch": 0.3968, "grad_norm": 2.1521012783050537, "learning_rate": 6.168831168831169e-06, "loss": 0.5994, "step": 124 }, { "epoch": 0.4, "grad_norm": 1.955110788345337, "learning_rate": 6.136363636363637e-06, "loss": 0.5228, "step": 125 }, { "epoch": 0.4032, "grad_norm": 1.8666290044784546, "learning_rate": 6.103896103896104e-06, "loss": 0.4747, "step": 126 }, { "epoch": 0.4064, "grad_norm": 2.1669390201568604, "learning_rate": 6.071428571428571e-06, "loss": 0.6153, "step": 127 }, { "epoch": 0.4096, "grad_norm": 2.1832473278045654, "learning_rate": 6.03896103896104e-06, "loss": 0.5655, "step": 128 }, { "epoch": 0.4128, "grad_norm": 2.049572229385376, "learning_rate": 6.006493506493507e-06, "loss": 0.5377, "step": 129 }, { "epoch": 0.416, "grad_norm": 2.7029855251312256, "learning_rate": 5.9740259740259746e-06, "loss": 0.5978, "step": 130 }, { "epoch": 0.4192, "grad_norm": 2.115083932876587, "learning_rate": 5.9415584415584415e-06, "loss": 0.6316, "step": 131 }, { "epoch": 0.4224, "grad_norm": 2.350674629211426, "learning_rate": 5.90909090909091e-06, "loss": 0.5369, "step": 132 }, { "epoch": 0.4256, "grad_norm": 2.26525616645813, "learning_rate": 5.876623376623377e-06, "loss": 0.5906, "step": 133 }, { "epoch": 0.4288, "grad_norm": 2.239849328994751, "learning_rate": 5.844155844155844e-06, "loss": 0.5638, "step": 134 }, { "epoch": 0.432, "grad_norm": 1.9738165140151978, "learning_rate": 5.811688311688313e-06, "loss": 0.4722, "step": 135 }, { "epoch": 0.4352, "grad_norm": 1.8766188621520996, "learning_rate": 5.77922077922078e-06, "loss": 0.5301, "step": 136 }, { "epoch": 0.4384, "grad_norm": 2.4637691974639893, "learning_rate": 5.7467532467532475e-06, "loss": 0.5612, "step": 137 }, { "epoch": 0.4416, "grad_norm": 2.5778348445892334, "learning_rate": 5.7142857142857145e-06, "loss": 0.5339, "step": 138 }, { "epoch": 0.4448, "grad_norm": 1.9568742513656616, "learning_rate": 5.681818181818183e-06, "loss": 0.4766, "step": 139 }, { "epoch": 0.448, "grad_norm": 2.5698182582855225, "learning_rate": 5.64935064935065e-06, "loss": 0.5086, "step": 140 }, { "epoch": 0.4512, "grad_norm": 2.4523143768310547, "learning_rate": 5.616883116883117e-06, "loss": 0.4908, "step": 141 }, { "epoch": 0.4544, "grad_norm": 2.6425259113311768, "learning_rate": 5.584415584415585e-06, "loss": 0.5949, "step": 142 }, { "epoch": 0.4576, "grad_norm": 2.20942759513855, "learning_rate": 5.551948051948053e-06, "loss": 0.5159, "step": 143 }, { "epoch": 0.4608, "grad_norm": 2.5742132663726807, "learning_rate": 5.5194805194805205e-06, "loss": 0.5172, "step": 144 }, { "epoch": 0.464, "grad_norm": 2.8136837482452393, "learning_rate": 5.4870129870129875e-06, "loss": 0.5273, "step": 145 }, { "epoch": 0.4672, "grad_norm": 2.452361583709717, "learning_rate": 5.4545454545454545e-06, "loss": 0.6228, "step": 146 }, { "epoch": 0.4704, "grad_norm": 2.162668466567993, "learning_rate": 5.422077922077923e-06, "loss": 0.5063, "step": 147 }, { "epoch": 0.4736, "grad_norm": 2.8804166316986084, "learning_rate": 5.38961038961039e-06, "loss": 0.54, "step": 148 }, { "epoch": 0.4768, "grad_norm": 2.6202263832092285, "learning_rate": 5.357142857142857e-06, "loss": 0.5427, "step": 149 }, { "epoch": 0.48, "grad_norm": 2.7187700271606445, "learning_rate": 5.324675324675325e-06, "loss": 0.5385, "step": 150 }, { "epoch": 0.4832, "grad_norm": 2.8711507320404053, "learning_rate": 5.292207792207793e-06, "loss": 0.5583, "step": 151 }, { "epoch": 0.4864, "grad_norm": 2.556797981262207, "learning_rate": 5.2597402597402605e-06, "loss": 0.4728, "step": 152 }, { "epoch": 0.4896, "grad_norm": 2.4534521102905273, "learning_rate": 5.2272727272727274e-06, "loss": 0.4558, "step": 153 }, { "epoch": 0.4928, "grad_norm": 2.8788859844207764, "learning_rate": 5.194805194805194e-06, "loss": 0.4523, "step": 154 }, { "epoch": 0.496, "grad_norm": 2.3744523525238037, "learning_rate": 5.162337662337663e-06, "loss": 0.4648, "step": 155 }, { "epoch": 0.4992, "grad_norm": 2.3659892082214355, "learning_rate": 5.12987012987013e-06, "loss": 0.4865, "step": 156 }, { "epoch": 0.5024, "grad_norm": 2.6716127395629883, "learning_rate": 5.097402597402598e-06, "loss": 0.4858, "step": 157 }, { "epoch": 0.5056, "grad_norm": 2.7237024307250977, "learning_rate": 5.064935064935065e-06, "loss": 0.5172, "step": 158 }, { "epoch": 0.5088, "grad_norm": 2.4779224395751953, "learning_rate": 5.0324675324675334e-06, "loss": 0.496, "step": 159 }, { "epoch": 0.512, "grad_norm": 3.7067513465881348, "learning_rate": 5e-06, "loss": 0.5195, "step": 160 }, { "epoch": 0.5152, "grad_norm": 2.9037418365478516, "learning_rate": 4.967532467532468e-06, "loss": 0.5014, "step": 161 }, { "epoch": 0.5184, "grad_norm": 2.811875343322754, "learning_rate": 4.935064935064935e-06, "loss": 0.4667, "step": 162 }, { "epoch": 0.5216, "grad_norm": 2.891486406326294, "learning_rate": 4.902597402597403e-06, "loss": 0.5138, "step": 163 }, { "epoch": 0.5248, "grad_norm": 2.7912118434906006, "learning_rate": 4.870129870129871e-06, "loss": 0.4836, "step": 164 }, { "epoch": 0.528, "grad_norm": 2.7582876682281494, "learning_rate": 4.837662337662339e-06, "loss": 0.4498, "step": 165 }, { "epoch": 0.5312, "grad_norm": 2.8489789962768555, "learning_rate": 4.805194805194806e-06, "loss": 0.4955, "step": 166 }, { "epoch": 0.5344, "grad_norm": 3.1706783771514893, "learning_rate": 4.772727272727273e-06, "loss": 0.4739, "step": 167 }, { "epoch": 0.5376, "grad_norm": 3.1050240993499756, "learning_rate": 4.74025974025974e-06, "loss": 0.4909, "step": 168 }, { "epoch": 0.5408, "grad_norm": 3.0862069129943848, "learning_rate": 4.707792207792208e-06, "loss": 0.517, "step": 169 }, { "epoch": 0.544, "grad_norm": 2.733415365219116, "learning_rate": 4.675324675324676e-06, "loss": 0.4806, "step": 170 }, { "epoch": 0.5472, "grad_norm": 2.7830162048339844, "learning_rate": 4.642857142857144e-06, "loss": 0.419, "step": 171 }, { "epoch": 0.5504, "grad_norm": 3.1513049602508545, "learning_rate": 4.610389610389611e-06, "loss": 0.4169, "step": 172 }, { "epoch": 0.5536, "grad_norm": 2.7717857360839844, "learning_rate": 4.5779220779220786e-06, "loss": 0.4918, "step": 173 }, { "epoch": 0.5568, "grad_norm": 2.6569430828094482, "learning_rate": 4.5454545454545455e-06, "loss": 0.5055, "step": 174 }, { "epoch": 0.56, "grad_norm": 2.9318227767944336, "learning_rate": 4.512987012987013e-06, "loss": 0.4094, "step": 175 }, { "epoch": 0.5632, "grad_norm": 2.718247890472412, "learning_rate": 4.48051948051948e-06, "loss": 0.4, "step": 176 }, { "epoch": 0.5664, "grad_norm": 2.8019731044769287, "learning_rate": 4.448051948051948e-06, "loss": 0.395, "step": 177 }, { "epoch": 0.5696, "grad_norm": 3.344794511795044, "learning_rate": 4.415584415584416e-06, "loss": 0.4411, "step": 178 }, { "epoch": 0.5728, "grad_norm": 3.349193572998047, "learning_rate": 4.383116883116884e-06, "loss": 0.4763, "step": 179 }, { "epoch": 0.576, "grad_norm": 3.292356252670288, "learning_rate": 4.350649350649351e-06, "loss": 0.468, "step": 180 }, { "epoch": 0.5792, "grad_norm": 2.837277412414551, "learning_rate": 4.3181818181818185e-06, "loss": 0.424, "step": 181 }, { "epoch": 0.5824, "grad_norm": 3.776475429534912, "learning_rate": 4.2857142857142855e-06, "loss": 0.4657, "step": 182 }, { "epoch": 0.5856, "grad_norm": 2.8376808166503906, "learning_rate": 4.253246753246753e-06, "loss": 0.4174, "step": 183 }, { "epoch": 0.5888, "grad_norm": 3.3143470287323, "learning_rate": 4.220779220779221e-06, "loss": 0.4708, "step": 184 }, { "epoch": 0.592, "grad_norm": 2.8305394649505615, "learning_rate": 4.188311688311689e-06, "loss": 0.4161, "step": 185 }, { "epoch": 0.5952, "grad_norm": 2.988297462463379, "learning_rate": 4.155844155844157e-06, "loss": 0.4206, "step": 186 }, { "epoch": 0.5984, "grad_norm": 2.821812152862549, "learning_rate": 4.123376623376624e-06, "loss": 0.3768, "step": 187 }, { "epoch": 0.6016, "grad_norm": 3.8506267070770264, "learning_rate": 4.0909090909090915e-06, "loss": 0.4605, "step": 188 }, { "epoch": 0.6048, "grad_norm": 3.259915351867676, "learning_rate": 4.0584415584415584e-06, "loss": 0.4419, "step": 189 }, { "epoch": 0.608, "grad_norm": 4.268383979797363, "learning_rate": 4.025974025974026e-06, "loss": 0.4116, "step": 190 }, { "epoch": 0.6112, "grad_norm": 3.081031560897827, "learning_rate": 3.993506493506494e-06, "loss": 0.4345, "step": 191 }, { "epoch": 0.6144, "grad_norm": 3.4930484294891357, "learning_rate": 3.961038961038962e-06, "loss": 0.4765, "step": 192 }, { "epoch": 0.6176, "grad_norm": 2.9748096466064453, "learning_rate": 3.928571428571429e-06, "loss": 0.3707, "step": 193 }, { "epoch": 0.6208, "grad_norm": 3.076026678085327, "learning_rate": 3.896103896103897e-06, "loss": 0.4349, "step": 194 }, { "epoch": 0.624, "grad_norm": 3.174711227416992, "learning_rate": 3.863636363636364e-06, "loss": 0.4212, "step": 195 }, { "epoch": 0.6272, "grad_norm": 2.978343963623047, "learning_rate": 3.831168831168831e-06, "loss": 0.3806, "step": 196 }, { "epoch": 0.6304, "grad_norm": 3.3710131645202637, "learning_rate": 3.798701298701299e-06, "loss": 0.4786, "step": 197 }, { "epoch": 0.6336, "grad_norm": 3.396897077560425, "learning_rate": 3.7662337662337666e-06, "loss": 0.4117, "step": 198 }, { "epoch": 0.6368, "grad_norm": 3.5931994915008545, "learning_rate": 3.733766233766234e-06, "loss": 0.4491, "step": 199 }, { "epoch": 0.64, "grad_norm": 3.251948356628418, "learning_rate": 3.701298701298702e-06, "loss": 0.3456, "step": 200 }, { "epoch": 0.6432, "grad_norm": 3.2774784564971924, "learning_rate": 3.6688311688311688e-06, "loss": 0.3645, "step": 201 }, { "epoch": 0.6464, "grad_norm": 3.322416305541992, "learning_rate": 3.6363636363636366e-06, "loss": 0.3473, "step": 202 }, { "epoch": 0.6496, "grad_norm": 3.3718931674957275, "learning_rate": 3.603896103896104e-06, "loss": 0.4622, "step": 203 }, { "epoch": 0.6528, "grad_norm": 3.8636627197265625, "learning_rate": 3.5714285714285718e-06, "loss": 0.4172, "step": 204 }, { "epoch": 0.656, "grad_norm": 3.1563334465026855, "learning_rate": 3.538961038961039e-06, "loss": 0.3661, "step": 205 }, { "epoch": 0.6592, "grad_norm": 3.838324785232544, "learning_rate": 3.506493506493507e-06, "loss": 0.4516, "step": 206 }, { "epoch": 0.6624, "grad_norm": 3.9685139656066895, "learning_rate": 3.474025974025974e-06, "loss": 0.4031, "step": 207 }, { "epoch": 0.6656, "grad_norm": 3.288994789123535, "learning_rate": 3.4415584415584418e-06, "loss": 0.3857, "step": 208 }, { "epoch": 0.6688, "grad_norm": 3.7377102375030518, "learning_rate": 3.409090909090909e-06, "loss": 0.4074, "step": 209 }, { "epoch": 0.672, "grad_norm": 3.793630599975586, "learning_rate": 3.376623376623377e-06, "loss": 0.4302, "step": 210 }, { "epoch": 0.6752, "grad_norm": 3.891111373901367, "learning_rate": 3.3441558441558443e-06, "loss": 0.4471, "step": 211 }, { "epoch": 0.6784, "grad_norm": 4.572322368621826, "learning_rate": 3.311688311688312e-06, "loss": 0.4063, "step": 212 }, { "epoch": 0.6816, "grad_norm": 3.859729051589966, "learning_rate": 3.27922077922078e-06, "loss": 0.4104, "step": 213 }, { "epoch": 0.6848, "grad_norm": 3.663115978240967, "learning_rate": 3.246753246753247e-06, "loss": 0.4246, "step": 214 }, { "epoch": 0.688, "grad_norm": 3.357668876647949, "learning_rate": 3.2142857142857147e-06, "loss": 0.3821, "step": 215 }, { "epoch": 0.6912, "grad_norm": 2.821131944656372, "learning_rate": 3.181818181818182e-06, "loss": 0.344, "step": 216 }, { "epoch": 0.6944, "grad_norm": 3.3244221210479736, "learning_rate": 3.14935064935065e-06, "loss": 0.3346, "step": 217 }, { "epoch": 0.6976, "grad_norm": 3.7634806632995605, "learning_rate": 3.116883116883117e-06, "loss": 0.3368, "step": 218 }, { "epoch": 0.7008, "grad_norm": 3.9963314533233643, "learning_rate": 3.0844155844155847e-06, "loss": 0.37, "step": 219 }, { "epoch": 0.704, "grad_norm": 4.0982890129089355, "learning_rate": 3.051948051948052e-06, "loss": 0.3717, "step": 220 }, { "epoch": 0.7072, "grad_norm": 3.808129072189331, "learning_rate": 3.01948051948052e-06, "loss": 0.3604, "step": 221 }, { "epoch": 0.7104, "grad_norm": 3.781965732574463, "learning_rate": 2.9870129870129873e-06, "loss": 0.3388, "step": 222 }, { "epoch": 0.7136, "grad_norm": 3.4916951656341553, "learning_rate": 2.954545454545455e-06, "loss": 0.3535, "step": 223 }, { "epoch": 0.7168, "grad_norm": 4.166407108306885, "learning_rate": 2.922077922077922e-06, "loss": 0.3649, "step": 224 }, { "epoch": 0.72, "grad_norm": 3.813291311264038, "learning_rate": 2.88961038961039e-06, "loss": 0.4032, "step": 225 }, { "epoch": 0.7232, "grad_norm": 3.7685770988464355, "learning_rate": 2.8571428571428573e-06, "loss": 0.3721, "step": 226 }, { "epoch": 0.7264, "grad_norm": 4.274139404296875, "learning_rate": 2.824675324675325e-06, "loss": 0.3563, "step": 227 }, { "epoch": 0.7296, "grad_norm": 3.735616445541382, "learning_rate": 2.7922077922077925e-06, "loss": 0.3707, "step": 228 }, { "epoch": 0.7328, "grad_norm": 3.705681800842285, "learning_rate": 2.7597402597402603e-06, "loss": 0.3904, "step": 229 }, { "epoch": 0.736, "grad_norm": 3.6566576957702637, "learning_rate": 2.7272727272727272e-06, "loss": 0.4036, "step": 230 }, { "epoch": 0.7392, "grad_norm": 3.2774415016174316, "learning_rate": 2.694805194805195e-06, "loss": 0.3859, "step": 231 }, { "epoch": 0.7424, "grad_norm": 3.4545371532440186, "learning_rate": 2.6623376623376624e-06, "loss": 0.3154, "step": 232 }, { "epoch": 0.7456, "grad_norm": 3.9478533267974854, "learning_rate": 2.6298701298701302e-06, "loss": 0.3694, "step": 233 }, { "epoch": 0.7488, "grad_norm": 3.6408331394195557, "learning_rate": 2.597402597402597e-06, "loss": 0.3066, "step": 234 }, { "epoch": 0.752, "grad_norm": 3.2598726749420166, "learning_rate": 2.564935064935065e-06, "loss": 0.3307, "step": 235 }, { "epoch": 0.7552, "grad_norm": 3.67964768409729, "learning_rate": 2.5324675324675324e-06, "loss": 0.3166, "step": 236 }, { "epoch": 0.7584, "grad_norm": 3.9483423233032227, "learning_rate": 2.5e-06, "loss": 0.3362, "step": 237 }, { "epoch": 0.7616, "grad_norm": 3.4860994815826416, "learning_rate": 2.4675324675324676e-06, "loss": 0.3271, "step": 238 }, { "epoch": 0.7648, "grad_norm": 5.214137554168701, "learning_rate": 2.4350649350649354e-06, "loss": 0.3607, "step": 239 }, { "epoch": 0.768, "grad_norm": 3.991419792175293, "learning_rate": 2.402597402597403e-06, "loss": 0.3628, "step": 240 }, { "epoch": 0.7712, "grad_norm": 4.0303778648376465, "learning_rate": 2.37012987012987e-06, "loss": 0.2875, "step": 241 }, { "epoch": 0.7744, "grad_norm": 3.3276679515838623, "learning_rate": 2.337662337662338e-06, "loss": 0.2834, "step": 242 }, { "epoch": 0.7776, "grad_norm": 3.434382438659668, "learning_rate": 2.3051948051948054e-06, "loss": 0.3483, "step": 243 }, { "epoch": 0.7808, "grad_norm": 3.6131680011749268, "learning_rate": 2.2727272727272728e-06, "loss": 0.2756, "step": 244 }, { "epoch": 0.784, "grad_norm": 3.658128499984741, "learning_rate": 2.24025974025974e-06, "loss": 0.272, "step": 245 }, { "epoch": 0.7872, "grad_norm": 5.547757148742676, "learning_rate": 2.207792207792208e-06, "loss": 0.2987, "step": 246 }, { "epoch": 0.7904, "grad_norm": 4.140172958374023, "learning_rate": 2.1753246753246753e-06, "loss": 0.3237, "step": 247 }, { "epoch": 0.7936, "grad_norm": 3.3053691387176514, "learning_rate": 2.1428571428571427e-06, "loss": 0.2837, "step": 248 }, { "epoch": 0.7968, "grad_norm": 3.5432918071746826, "learning_rate": 2.1103896103896105e-06, "loss": 0.3476, "step": 249 }, { "epoch": 0.8, "grad_norm": 4.45695161819458, "learning_rate": 2.0779220779220784e-06, "loss": 0.2915, "step": 250 }, { "epoch": 0.8032, "grad_norm": 3.557931900024414, "learning_rate": 2.0454545454545457e-06, "loss": 0.3359, "step": 251 }, { "epoch": 0.8064, "grad_norm": 4.017322540283203, "learning_rate": 2.012987012987013e-06, "loss": 0.3171, "step": 252 }, { "epoch": 0.8096, "grad_norm": 3.919919490814209, "learning_rate": 1.980519480519481e-06, "loss": 0.3071, "step": 253 }, { "epoch": 0.8128, "grad_norm": 3.5345091819763184, "learning_rate": 1.9480519480519483e-06, "loss": 0.3649, "step": 254 }, { "epoch": 0.816, "grad_norm": 4.000681400299072, "learning_rate": 1.9155844155844157e-06, "loss": 0.3264, "step": 255 }, { "epoch": 0.8192, "grad_norm": 4.065380573272705, "learning_rate": 1.8831168831168833e-06, "loss": 0.3378, "step": 256 }, { "epoch": 0.8224, "grad_norm": 4.254638195037842, "learning_rate": 1.850649350649351e-06, "loss": 0.3792, "step": 257 }, { "epoch": 0.8256, "grad_norm": 4.198444366455078, "learning_rate": 1.8181818181818183e-06, "loss": 0.2847, "step": 258 }, { "epoch": 0.8288, "grad_norm": 4.403450012207031, "learning_rate": 1.7857142857142859e-06, "loss": 0.291, "step": 259 }, { "epoch": 0.832, "grad_norm": 3.7455475330352783, "learning_rate": 1.7532467532467535e-06, "loss": 0.2582, "step": 260 }, { "epoch": 0.8352, "grad_norm": 3.777442216873169, "learning_rate": 1.7207792207792209e-06, "loss": 0.2925, "step": 261 }, { "epoch": 0.8384, "grad_norm": 4.861299514770508, "learning_rate": 1.6883116883116885e-06, "loss": 0.3465, "step": 262 }, { "epoch": 0.8416, "grad_norm": 3.7868454456329346, "learning_rate": 1.655844155844156e-06, "loss": 0.3251, "step": 263 }, { "epoch": 0.8448, "grad_norm": 4.0611724853515625, "learning_rate": 1.6233766233766235e-06, "loss": 0.3107, "step": 264 }, { "epoch": 0.848, "grad_norm": 4.38793420791626, "learning_rate": 1.590909090909091e-06, "loss": 0.2946, "step": 265 }, { "epoch": 0.8512, "grad_norm": 4.053177833557129, "learning_rate": 1.5584415584415584e-06, "loss": 0.3414, "step": 266 }, { "epoch": 0.8544, "grad_norm": 4.455831050872803, "learning_rate": 1.525974025974026e-06, "loss": 0.3175, "step": 267 }, { "epoch": 0.8576, "grad_norm": 3.924949884414673, "learning_rate": 1.4935064935064936e-06, "loss": 0.2887, "step": 268 }, { "epoch": 0.8608, "grad_norm": 4.737699031829834, "learning_rate": 1.461038961038961e-06, "loss": 0.3288, "step": 269 }, { "epoch": 0.864, "grad_norm": 3.276146650314331, "learning_rate": 1.4285714285714286e-06, "loss": 0.2345, "step": 270 }, { "epoch": 0.8672, "grad_norm": 4.697585582733154, "learning_rate": 1.3961038961038962e-06, "loss": 0.3279, "step": 271 }, { "epoch": 0.8704, "grad_norm": 4.041415214538574, "learning_rate": 1.3636363636363636e-06, "loss": 0.2838, "step": 272 }, { "epoch": 0.8736, "grad_norm": 3.8998961448669434, "learning_rate": 1.3311688311688312e-06, "loss": 0.2866, "step": 273 }, { "epoch": 0.8768, "grad_norm": 4.033306121826172, "learning_rate": 1.2987012987012986e-06, "loss": 0.3085, "step": 274 }, { "epoch": 0.88, "grad_norm": 4.726825714111328, "learning_rate": 1.2662337662337662e-06, "loss": 0.3369, "step": 275 }, { "epoch": 0.8832, "grad_norm": 3.8073678016662598, "learning_rate": 1.2337662337662338e-06, "loss": 0.283, "step": 276 }, { "epoch": 0.8864, "grad_norm": 3.888046979904175, "learning_rate": 1.2012987012987014e-06, "loss": 0.275, "step": 277 }, { "epoch": 0.8896, "grad_norm": 3.4480137825012207, "learning_rate": 1.168831168831169e-06, "loss": 0.2789, "step": 278 }, { "epoch": 0.8928, "grad_norm": 4.157343864440918, "learning_rate": 1.1363636363636364e-06, "loss": 0.2847, "step": 279 }, { "epoch": 0.896, "grad_norm": 3.852142333984375, "learning_rate": 1.103896103896104e-06, "loss": 0.302, "step": 280 }, { "epoch": 0.8992, "grad_norm": 3.817131280899048, "learning_rate": 1.0714285714285714e-06, "loss": 0.2965, "step": 281 }, { "epoch": 0.9024, "grad_norm": 3.1981892585754395, "learning_rate": 1.0389610389610392e-06, "loss": 0.273, "step": 282 }, { "epoch": 0.9056, "grad_norm": 4.378617286682129, "learning_rate": 1.0064935064935066e-06, "loss": 0.269, "step": 283 }, { "epoch": 0.9088, "grad_norm": 4.007122039794922, "learning_rate": 9.740259740259742e-07, "loss": 0.2494, "step": 284 }, { "epoch": 0.912, "grad_norm": 4.203665256500244, "learning_rate": 9.415584415584417e-07, "loss": 0.2944, "step": 285 }, { "epoch": 0.9152, "grad_norm": 3.7177679538726807, "learning_rate": 9.090909090909091e-07, "loss": 0.2791, "step": 286 }, { "epoch": 0.9184, "grad_norm": 3.361874580383301, "learning_rate": 8.766233766233767e-07, "loss": 0.3019, "step": 287 }, { "epoch": 0.9216, "grad_norm": 4.043386459350586, "learning_rate": 8.441558441558442e-07, "loss": 0.2912, "step": 288 }, { "epoch": 0.9248, "grad_norm": 3.9391589164733887, "learning_rate": 8.116883116883117e-07, "loss": 0.2372, "step": 289 }, { "epoch": 0.928, "grad_norm": 3.9059672355651855, "learning_rate": 7.792207792207792e-07, "loss": 0.2625, "step": 290 }, { "epoch": 0.9312, "grad_norm": 3.78255558013916, "learning_rate": 7.467532467532468e-07, "loss": 0.2854, "step": 291 }, { "epoch": 0.9344, "grad_norm": 3.6052775382995605, "learning_rate": 7.142857142857143e-07, "loss": 0.2327, "step": 292 }, { "epoch": 0.9376, "grad_norm": 4.373789310455322, "learning_rate": 6.818181818181818e-07, "loss": 0.346, "step": 293 }, { "epoch": 0.9408, "grad_norm": 3.584813356399536, "learning_rate": 6.493506493506493e-07, "loss": 0.2845, "step": 294 }, { "epoch": 0.944, "grad_norm": 4.024709224700928, "learning_rate": 6.168831168831169e-07, "loss": 0.3263, "step": 295 }, { "epoch": 0.9472, "grad_norm": 3.5559256076812744, "learning_rate": 5.844155844155845e-07, "loss": 0.2466, "step": 296 }, { "epoch": 0.9504, "grad_norm": 4.1213274002075195, "learning_rate": 5.51948051948052e-07, "loss": 0.2988, "step": 297 }, { "epoch": 0.9536, "grad_norm": 4.450615406036377, "learning_rate": 5.194805194805196e-07, "loss": 0.3328, "step": 298 }, { "epoch": 0.9568, "grad_norm": 4.196364402770996, "learning_rate": 4.870129870129871e-07, "loss": 0.2889, "step": 299 }, { "epoch": 0.96, "grad_norm": 3.8642749786376953, "learning_rate": 4.5454545454545457e-07, "loss": 0.2406, "step": 300 }, { "epoch": 0.9632, "grad_norm": 4.460577964782715, "learning_rate": 4.220779220779221e-07, "loss": 0.301, "step": 301 }, { "epoch": 0.9664, "grad_norm": 3.5979244709014893, "learning_rate": 3.896103896103896e-07, "loss": 0.2194, "step": 302 }, { "epoch": 0.9696, "grad_norm": 3.9422249794006348, "learning_rate": 3.5714285714285716e-07, "loss": 0.2685, "step": 303 }, { "epoch": 0.9728, "grad_norm": 4.546063423156738, "learning_rate": 3.2467532467532465e-07, "loss": 0.321, "step": 304 }, { "epoch": 0.976, "grad_norm": 3.7650575637817383, "learning_rate": 2.9220779220779225e-07, "loss": 0.2769, "step": 305 }, { "epoch": 0.9792, "grad_norm": 5.088820457458496, "learning_rate": 2.597402597402598e-07, "loss": 0.2852, "step": 306 }, { "epoch": 0.9824, "grad_norm": 4.189055919647217, "learning_rate": 2.2727272727272729e-07, "loss": 0.2919, "step": 307 }, { "epoch": 0.9856, "grad_norm": 4.348959922790527, "learning_rate": 1.948051948051948e-07, "loss": 0.3112, "step": 308 }, { "epoch": 0.9888, "grad_norm": 4.837542533874512, "learning_rate": 1.6233766233766232e-07, "loss": 0.2468, "step": 309 }, { "epoch": 0.992, "grad_norm": 3.7436952590942383, "learning_rate": 1.298701298701299e-07, "loss": 0.2773, "step": 310 }, { "epoch": 0.9952, "grad_norm": 4.24392032623291, "learning_rate": 9.74025974025974e-08, "loss": 0.2797, "step": 311 }, { "epoch": 0.9984, "grad_norm": 3.743685007095337, "learning_rate": 6.493506493506495e-08, "loss": 0.2829, "step": 312 }, { "epoch": 1.0, "grad_norm": 6.174961566925049, "learning_rate": 3.2467532467532474e-08, "loss": 0.3284, "step": 313 } ], "logging_steps": 1, "max_steps": 313, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3015804912100966e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }