{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 9.548905372619629, "learning_rate": 0.0, "loss": 1.6041, "step": 1 }, { "epoch": 0.0064, "grad_norm": 8.360369682312012, "learning_rate": 2.0000000000000003e-06, "loss": 1.43, "step": 2 }, { "epoch": 0.0096, "grad_norm": 9.501611709594727, "learning_rate": 4.000000000000001e-06, "loss": 1.6751, "step": 3 }, { "epoch": 0.0128, "grad_norm": 8.665339469909668, "learning_rate": 6e-06, "loss": 1.6014, "step": 4 }, { "epoch": 0.016, "grad_norm": 6.536843776702881, "learning_rate": 8.000000000000001e-06, "loss": 1.3807, "step": 5 }, { "epoch": 0.0192, "grad_norm": 3.9980669021606445, "learning_rate": 1e-05, "loss": 1.2621, "step": 6 }, { "epoch": 0.0224, "grad_norm": 2.882711172103882, "learning_rate": 9.967532467532468e-06, "loss": 1.3812, "step": 7 }, { "epoch": 0.0256, "grad_norm": 2.067737340927124, "learning_rate": 9.935064935064936e-06, "loss": 1.1674, "step": 8 }, { "epoch": 0.0288, "grad_norm": 2.1736371517181396, "learning_rate": 9.902597402597403e-06, "loss": 1.2078, "step": 9 }, { "epoch": 0.032, "grad_norm": 2.2333147525787354, "learning_rate": 9.87012987012987e-06, "loss": 1.2477, "step": 10 }, { "epoch": 0.0352, "grad_norm": 1.869605541229248, "learning_rate": 9.837662337662337e-06, "loss": 1.1323, "step": 11 }, { "epoch": 0.0384, "grad_norm": 1.9734994173049927, "learning_rate": 9.805194805194806e-06, "loss": 1.258, "step": 12 }, { "epoch": 0.0416, "grad_norm": 2.027984142303467, "learning_rate": 9.772727272727273e-06, "loss": 1.1692, "step": 13 }, { "epoch": 0.0448, "grad_norm": 1.8902798891067505, "learning_rate": 9.740259740259742e-06, "loss": 1.1181, "step": 14 }, { "epoch": 0.048, "grad_norm": 1.6616740226745605, "learning_rate": 9.707792207792209e-06, "loss": 1.1082, "step": 15 }, { "epoch": 0.0512, "grad_norm": 2.1065773963928223, "learning_rate": 9.675324675324677e-06, "loss": 1.0953, "step": 16 }, { "epoch": 0.0544, "grad_norm": 1.6929274797439575, "learning_rate": 9.642857142857144e-06, "loss": 0.9976, "step": 17 }, { "epoch": 0.0576, "grad_norm": 1.7909464836120605, "learning_rate": 9.610389610389611e-06, "loss": 1.0779, "step": 18 }, { "epoch": 0.0608, "grad_norm": 1.987488031387329, "learning_rate": 9.577922077922078e-06, "loss": 1.0974, "step": 19 }, { "epoch": 0.064, "grad_norm": 1.535858154296875, "learning_rate": 9.545454545454547e-06, "loss": 1.0124, "step": 20 }, { "epoch": 0.0672, "grad_norm": 1.6738166809082031, "learning_rate": 9.512987012987014e-06, "loss": 0.9932, "step": 21 }, { "epoch": 0.0704, "grad_norm": 1.4918556213378906, "learning_rate": 9.48051948051948e-06, "loss": 0.9481, "step": 22 }, { "epoch": 0.0736, "grad_norm": 1.652398705482483, "learning_rate": 9.448051948051948e-06, "loss": 0.9542, "step": 23 }, { "epoch": 0.0768, "grad_norm": 1.567749261856079, "learning_rate": 9.415584415584416e-06, "loss": 0.8848, "step": 24 }, { "epoch": 0.08, "grad_norm": 1.6268699169158936, "learning_rate": 9.383116883116883e-06, "loss": 1.0485, "step": 25 }, { "epoch": 0.0832, "grad_norm": 1.528889775276184, "learning_rate": 9.350649350649352e-06, "loss": 0.941, "step": 26 }, { "epoch": 0.0864, "grad_norm": 1.8066990375518799, "learning_rate": 9.318181818181819e-06, "loss": 1.0402, "step": 27 }, { "epoch": 0.0896, "grad_norm": 1.2178040742874146, "learning_rate": 9.285714285714288e-06, "loss": 0.9057, "step": 28 }, { "epoch": 0.0928, "grad_norm": 1.5407291650772095, "learning_rate": 9.253246753246755e-06, "loss": 0.921, "step": 29 }, { "epoch": 0.096, "grad_norm": 1.5522805452346802, "learning_rate": 9.220779220779221e-06, "loss": 0.9231, "step": 30 }, { "epoch": 0.0992, "grad_norm": 1.1725654602050781, "learning_rate": 9.188311688311688e-06, "loss": 0.7628, "step": 31 }, { "epoch": 0.1024, "grad_norm": 1.5114904642105103, "learning_rate": 9.155844155844157e-06, "loss": 0.8904, "step": 32 }, { "epoch": 0.1056, "grad_norm": 2.2382700443267822, "learning_rate": 9.123376623376624e-06, "loss": 1.0447, "step": 33 }, { "epoch": 0.1088, "grad_norm": 1.2792514562606812, "learning_rate": 9.090909090909091e-06, "loss": 0.8466, "step": 34 }, { "epoch": 0.112, "grad_norm": 1.5901352167129517, "learning_rate": 9.05844155844156e-06, "loss": 0.8383, "step": 35 }, { "epoch": 0.1152, "grad_norm": 1.8127880096435547, "learning_rate": 9.025974025974027e-06, "loss": 0.9088, "step": 36 }, { "epoch": 0.1184, "grad_norm": 1.3327851295471191, "learning_rate": 8.993506493506494e-06, "loss": 0.8263, "step": 37 }, { "epoch": 0.1216, "grad_norm": 1.2125405073165894, "learning_rate": 8.96103896103896e-06, "loss": 0.846, "step": 38 }, { "epoch": 0.1248, "grad_norm": 1.4686274528503418, "learning_rate": 8.92857142857143e-06, "loss": 0.9285, "step": 39 }, { "epoch": 0.128, "grad_norm": 1.7080440521240234, "learning_rate": 8.896103896103896e-06, "loss": 0.9183, "step": 40 }, { "epoch": 0.1312, "grad_norm": 1.402506709098816, "learning_rate": 8.863636363636365e-06, "loss": 0.7918, "step": 41 }, { "epoch": 0.1344, "grad_norm": 1.311579942703247, "learning_rate": 8.831168831168832e-06, "loss": 0.8007, "step": 42 }, { "epoch": 0.1376, "grad_norm": 1.3540467023849487, "learning_rate": 8.7987012987013e-06, "loss": 0.792, "step": 43 }, { "epoch": 0.1408, "grad_norm": 1.3443299531936646, "learning_rate": 8.766233766233767e-06, "loss": 0.8446, "step": 44 }, { "epoch": 0.144, "grad_norm": 1.6124398708343506, "learning_rate": 8.733766233766234e-06, "loss": 0.873, "step": 45 }, { "epoch": 0.1472, "grad_norm": 1.4039077758789062, "learning_rate": 8.701298701298701e-06, "loss": 0.9067, "step": 46 }, { "epoch": 0.1504, "grad_norm": 1.3932620286941528, "learning_rate": 8.66883116883117e-06, "loss": 0.7678, "step": 47 }, { "epoch": 0.1536, "grad_norm": 1.4655569791793823, "learning_rate": 8.636363636363637e-06, "loss": 0.8846, "step": 48 }, { "epoch": 0.1568, "grad_norm": 1.6178548336029053, "learning_rate": 8.603896103896104e-06, "loss": 0.8972, "step": 49 }, { "epoch": 0.16, "grad_norm": 1.2229878902435303, "learning_rate": 8.571428571428571e-06, "loss": 0.7186, "step": 50 }, { "epoch": 0.1632, "grad_norm": 1.399183988571167, "learning_rate": 8.53896103896104e-06, "loss": 0.8156, "step": 51 }, { "epoch": 0.1664, "grad_norm": 1.337349534034729, "learning_rate": 8.506493506493507e-06, "loss": 0.7844, "step": 52 }, { "epoch": 0.1696, "grad_norm": 1.311928391456604, "learning_rate": 8.474025974025975e-06, "loss": 0.8665, "step": 53 }, { "epoch": 0.1728, "grad_norm": 1.3711683750152588, "learning_rate": 8.441558441558442e-06, "loss": 0.764, "step": 54 }, { "epoch": 0.176, "grad_norm": 1.4377758502960205, "learning_rate": 8.40909090909091e-06, "loss": 0.8027, "step": 55 }, { "epoch": 0.1792, "grad_norm": 1.3840374946594238, "learning_rate": 8.376623376623378e-06, "loss": 0.9163, "step": 56 }, { "epoch": 0.1824, "grad_norm": 1.5715651512145996, "learning_rate": 8.344155844155845e-06, "loss": 0.8111, "step": 57 }, { "epoch": 0.1856, "grad_norm": 1.2862898111343384, "learning_rate": 8.311688311688313e-06, "loss": 0.7984, "step": 58 }, { "epoch": 0.1888, "grad_norm": 1.4305953979492188, "learning_rate": 8.27922077922078e-06, "loss": 0.7762, "step": 59 }, { "epoch": 0.192, "grad_norm": 1.514124870300293, "learning_rate": 8.246753246753247e-06, "loss": 0.6909, "step": 60 }, { "epoch": 0.1952, "grad_norm": 1.4759447574615479, "learning_rate": 8.214285714285714e-06, "loss": 0.8051, "step": 61 }, { "epoch": 0.1984, "grad_norm": 1.3367384672164917, "learning_rate": 8.181818181818183e-06, "loss": 0.8096, "step": 62 }, { "epoch": 0.2016, "grad_norm": 1.262237310409546, "learning_rate": 8.14935064935065e-06, "loss": 0.7599, "step": 63 }, { "epoch": 0.2048, "grad_norm": 1.4441957473754883, "learning_rate": 8.116883116883117e-06, "loss": 0.7741, "step": 64 }, { "epoch": 0.208, "grad_norm": 1.4128937721252441, "learning_rate": 8.084415584415586e-06, "loss": 0.7929, "step": 65 }, { "epoch": 0.2112, "grad_norm": 1.3074737787246704, "learning_rate": 8.051948051948052e-06, "loss": 0.7569, "step": 66 }, { "epoch": 0.2144, "grad_norm": 1.2770249843597412, "learning_rate": 8.019480519480521e-06, "loss": 0.7327, "step": 67 }, { "epoch": 0.2176, "grad_norm": 1.3476999998092651, "learning_rate": 7.987012987012988e-06, "loss": 0.7398, "step": 68 }, { "epoch": 0.2208, "grad_norm": 1.3660588264465332, "learning_rate": 7.954545454545455e-06, "loss": 0.7428, "step": 69 }, { "epoch": 0.224, "grad_norm": 1.3757730722427368, "learning_rate": 7.922077922077924e-06, "loss": 0.772, "step": 70 }, { "epoch": 0.2272, "grad_norm": 1.418833613395691, "learning_rate": 7.88961038961039e-06, "loss": 0.7315, "step": 71 }, { "epoch": 0.2304, "grad_norm": 1.497719645500183, "learning_rate": 7.857142857142858e-06, "loss": 0.7603, "step": 72 }, { "epoch": 0.2336, "grad_norm": 1.6543736457824707, "learning_rate": 7.824675324675325e-06, "loss": 0.7426, "step": 73 }, { "epoch": 0.2368, "grad_norm": 1.2009284496307373, "learning_rate": 7.792207792207793e-06, "loss": 0.7149, "step": 74 }, { "epoch": 0.24, "grad_norm": 1.5950077772140503, "learning_rate": 7.75974025974026e-06, "loss": 0.6904, "step": 75 }, { "epoch": 0.2432, "grad_norm": 1.7591907978057861, "learning_rate": 7.727272727272727e-06, "loss": 0.8181, "step": 76 }, { "epoch": 0.2464, "grad_norm": 1.5061492919921875, "learning_rate": 7.694805194805194e-06, "loss": 0.6687, "step": 77 }, { "epoch": 0.2496, "grad_norm": 1.391204595565796, "learning_rate": 7.662337662337663e-06, "loss": 0.7099, "step": 78 }, { "epoch": 0.2528, "grad_norm": 1.6681373119354248, "learning_rate": 7.62987012987013e-06, "loss": 0.7715, "step": 79 }, { "epoch": 0.256, "grad_norm": 1.4587377309799194, "learning_rate": 7.597402597402598e-06, "loss": 0.7986, "step": 80 }, { "epoch": 0.2592, "grad_norm": 1.4327019453048706, "learning_rate": 7.564935064935065e-06, "loss": 0.7615, "step": 81 }, { "epoch": 0.2624, "grad_norm": 1.592335820198059, "learning_rate": 7.532467532467533e-06, "loss": 0.7194, "step": 82 }, { "epoch": 0.2656, "grad_norm": 1.818969488143921, "learning_rate": 7.500000000000001e-06, "loss": 0.7778, "step": 83 }, { "epoch": 0.2688, "grad_norm": 1.5635945796966553, "learning_rate": 7.467532467532468e-06, "loss": 0.6832, "step": 84 }, { "epoch": 0.272, "grad_norm": 1.6165601015090942, "learning_rate": 7.435064935064936e-06, "loss": 0.7262, "step": 85 }, { "epoch": 0.2752, "grad_norm": 1.6714136600494385, "learning_rate": 7.402597402597404e-06, "loss": 0.6945, "step": 86 }, { "epoch": 0.2784, "grad_norm": 1.4669193029403687, "learning_rate": 7.370129870129871e-06, "loss": 0.7597, "step": 87 }, { "epoch": 0.2816, "grad_norm": 1.6006850004196167, "learning_rate": 7.3376623376623375e-06, "loss": 0.6943, "step": 88 }, { "epoch": 0.2848, "grad_norm": 1.5513739585876465, "learning_rate": 7.305194805194806e-06, "loss": 0.6766, "step": 89 }, { "epoch": 0.288, "grad_norm": 1.5769884586334229, "learning_rate": 7.272727272727273e-06, "loss": 0.7036, "step": 90 }, { "epoch": 0.2912, "grad_norm": 1.698893666267395, "learning_rate": 7.240259740259741e-06, "loss": 0.7322, "step": 91 }, { "epoch": 0.2944, "grad_norm": 1.6237844228744507, "learning_rate": 7.207792207792208e-06, "loss": 0.756, "step": 92 }, { "epoch": 0.2976, "grad_norm": 1.5962598323822021, "learning_rate": 7.175324675324677e-06, "loss": 0.6466, "step": 93 }, { "epoch": 0.3008, "grad_norm": 1.3866199254989624, "learning_rate": 7.1428571428571436e-06, "loss": 0.5878, "step": 94 }, { "epoch": 0.304, "grad_norm": 1.7351548671722412, "learning_rate": 7.1103896103896105e-06, "loss": 0.7033, "step": 95 }, { "epoch": 0.3072, "grad_norm": 1.7491967678070068, "learning_rate": 7.077922077922078e-06, "loss": 0.7007, "step": 96 }, { "epoch": 0.3104, "grad_norm": 1.5318310260772705, "learning_rate": 7.045454545454546e-06, "loss": 0.578, "step": 97 }, { "epoch": 0.3136, "grad_norm": 1.6933001279830933, "learning_rate": 7.012987012987014e-06, "loss": 0.6657, "step": 98 }, { "epoch": 0.3168, "grad_norm": 1.7131991386413574, "learning_rate": 6.980519480519481e-06, "loss": 0.7085, "step": 99 }, { "epoch": 0.32, "grad_norm": 1.897113561630249, "learning_rate": 6.948051948051948e-06, "loss": 0.6948, "step": 100 }, { "epoch": 0.3232, "grad_norm": 1.5741852521896362, "learning_rate": 6.9155844155844165e-06, "loss": 0.6236, "step": 101 }, { "epoch": 0.3264, "grad_norm": 1.8281790018081665, "learning_rate": 6.8831168831168835e-06, "loss": 0.5968, "step": 102 }, { "epoch": 0.3296, "grad_norm": 1.6088893413543701, "learning_rate": 6.850649350649351e-06, "loss": 0.6262, "step": 103 }, { "epoch": 0.3328, "grad_norm": 1.8895440101623535, "learning_rate": 6.818181818181818e-06, "loss": 0.6655, "step": 104 }, { "epoch": 0.336, "grad_norm": 1.8878438472747803, "learning_rate": 6.785714285714287e-06, "loss": 0.5968, "step": 105 }, { "epoch": 0.3392, "grad_norm": 1.8233320713043213, "learning_rate": 6.753246753246754e-06, "loss": 0.6842, "step": 106 }, { "epoch": 0.3424, "grad_norm": 1.6819425821304321, "learning_rate": 6.720779220779221e-06, "loss": 0.6533, "step": 107 }, { "epoch": 0.3456, "grad_norm": 2.240962505340576, "learning_rate": 6.688311688311689e-06, "loss": 0.7833, "step": 108 }, { "epoch": 0.3488, "grad_norm": 1.8028863668441772, "learning_rate": 6.6558441558441565e-06, "loss": 0.5872, "step": 109 }, { "epoch": 0.352, "grad_norm": 2.085261106491089, "learning_rate": 6.623376623376624e-06, "loss": 0.6819, "step": 110 }, { "epoch": 0.3552, "grad_norm": 1.844897985458374, "learning_rate": 6.590909090909091e-06, "loss": 0.6996, "step": 111 }, { "epoch": 0.3584, "grad_norm": 1.9914166927337646, "learning_rate": 6.55844155844156e-06, "loss": 0.674, "step": 112 }, { "epoch": 0.3616, "grad_norm": 1.8009718656539917, "learning_rate": 6.525974025974027e-06, "loss": 0.625, "step": 113 }, { "epoch": 0.3648, "grad_norm": 2.120251178741455, "learning_rate": 6.493506493506494e-06, "loss": 0.6252, "step": 114 }, { "epoch": 0.368, "grad_norm": 1.9561840295791626, "learning_rate": 6.461038961038961e-06, "loss": 0.6218, "step": 115 }, { "epoch": 0.3712, "grad_norm": 1.9256573915481567, "learning_rate": 6.4285714285714295e-06, "loss": 0.6251, "step": 116 }, { "epoch": 0.3744, "grad_norm": 2.0886967182159424, "learning_rate": 6.3961038961038964e-06, "loss": 0.5972, "step": 117 }, { "epoch": 0.3776, "grad_norm": 1.90208101272583, "learning_rate": 6.363636363636364e-06, "loss": 0.6392, "step": 118 }, { "epoch": 0.3808, "grad_norm": 2.1293461322784424, "learning_rate": 6.331168831168831e-06, "loss": 0.6637, "step": 119 }, { "epoch": 0.384, "grad_norm": 1.7344480752944946, "learning_rate": 6.2987012987013e-06, "loss": 0.5419, "step": 120 }, { "epoch": 0.3872, "grad_norm": 2.242717742919922, "learning_rate": 6.266233766233767e-06, "loss": 0.69, "step": 121 }, { "epoch": 0.3904, "grad_norm": 1.9519375562667847, "learning_rate": 6.233766233766234e-06, "loss": 0.6116, "step": 122 }, { "epoch": 0.3936, "grad_norm": 2.716726541519165, "learning_rate": 6.201298701298702e-06, "loss": 0.6661, "step": 123 }, { "epoch": 0.3968, "grad_norm": 1.9421472549438477, "learning_rate": 6.168831168831169e-06, "loss": 0.6495, "step": 124 }, { "epoch": 0.4, "grad_norm": 1.7099575996398926, "learning_rate": 6.136363636363637e-06, "loss": 0.5694, "step": 125 }, { "epoch": 0.4032, "grad_norm": 1.9398987293243408, "learning_rate": 6.103896103896104e-06, "loss": 0.6332, "step": 126 }, { "epoch": 0.4064, "grad_norm": 1.8356436491012573, "learning_rate": 6.071428571428571e-06, "loss": 0.5879, "step": 127 }, { "epoch": 0.4096, "grad_norm": 2.187277317047119, "learning_rate": 6.03896103896104e-06, "loss": 0.5762, "step": 128 }, { "epoch": 0.4128, "grad_norm": 2.1066439151763916, "learning_rate": 6.006493506493507e-06, "loss": 0.6246, "step": 129 }, { "epoch": 0.416, "grad_norm": 2.073390483856201, "learning_rate": 5.9740259740259746e-06, "loss": 0.5941, "step": 130 }, { "epoch": 0.4192, "grad_norm": 2.0324959754943848, "learning_rate": 5.9415584415584415e-06, "loss": 0.6146, "step": 131 }, { "epoch": 0.4224, "grad_norm": 1.9881010055541992, "learning_rate": 5.90909090909091e-06, "loss": 0.5945, "step": 132 }, { "epoch": 0.4256, "grad_norm": 2.017544984817505, "learning_rate": 5.876623376623377e-06, "loss": 0.5837, "step": 133 }, { "epoch": 0.4288, "grad_norm": 2.583054780960083, "learning_rate": 5.844155844155844e-06, "loss": 0.6035, "step": 134 }, { "epoch": 0.432, "grad_norm": 2.6116082668304443, "learning_rate": 5.811688311688313e-06, "loss": 0.6499, "step": 135 }, { "epoch": 0.4352, "grad_norm": 2.4798545837402344, "learning_rate": 5.77922077922078e-06, "loss": 0.635, "step": 136 }, { "epoch": 0.4384, "grad_norm": 2.100008010864258, "learning_rate": 5.7467532467532475e-06, "loss": 0.5438, "step": 137 }, { "epoch": 0.4416, "grad_norm": 2.369692325592041, "learning_rate": 5.7142857142857145e-06, "loss": 0.5559, "step": 138 }, { "epoch": 0.4448, "grad_norm": 2.6044960021972656, "learning_rate": 5.681818181818183e-06, "loss": 0.6192, "step": 139 }, { "epoch": 0.448, "grad_norm": 2.3004958629608154, "learning_rate": 5.64935064935065e-06, "loss": 0.5644, "step": 140 }, { "epoch": 0.4512, "grad_norm": 2.30646014213562, "learning_rate": 5.616883116883117e-06, "loss": 0.6024, "step": 141 }, { "epoch": 0.4544, "grad_norm": 2.3673393726348877, "learning_rate": 5.584415584415585e-06, "loss": 0.5238, "step": 142 }, { "epoch": 0.4576, "grad_norm": 2.66957950592041, "learning_rate": 5.551948051948053e-06, "loss": 0.6148, "step": 143 }, { "epoch": 0.4608, "grad_norm": 2.209986925125122, "learning_rate": 5.5194805194805205e-06, "loss": 0.5579, "step": 144 }, { "epoch": 0.464, "grad_norm": 2.405188798904419, "learning_rate": 5.4870129870129875e-06, "loss": 0.5783, "step": 145 }, { "epoch": 0.4672, "grad_norm": 2.380112886428833, "learning_rate": 5.4545454545454545e-06, "loss": 0.5438, "step": 146 }, { "epoch": 0.4704, "grad_norm": 2.145167350769043, "learning_rate": 5.422077922077923e-06, "loss": 0.5081, "step": 147 }, { "epoch": 0.4736, "grad_norm": 2.42675518989563, "learning_rate": 5.38961038961039e-06, "loss": 0.5667, "step": 148 }, { "epoch": 0.4768, "grad_norm": 2.2153475284576416, "learning_rate": 5.357142857142857e-06, "loss": 0.495, "step": 149 }, { "epoch": 0.48, "grad_norm": 2.5394628047943115, "learning_rate": 5.324675324675325e-06, "loss": 0.4709, "step": 150 }, { "epoch": 0.4832, "grad_norm": 2.5719780921936035, "learning_rate": 5.292207792207793e-06, "loss": 0.5384, "step": 151 }, { "epoch": 0.4864, "grad_norm": 2.4524574279785156, "learning_rate": 5.2597402597402605e-06, "loss": 0.5615, "step": 152 }, { "epoch": 0.4896, "grad_norm": 2.5788793563842773, "learning_rate": 5.2272727272727274e-06, "loss": 0.4973, "step": 153 }, { "epoch": 0.4928, "grad_norm": 2.253735065460205, "learning_rate": 5.194805194805194e-06, "loss": 0.48, "step": 154 }, { "epoch": 0.496, "grad_norm": 2.8777077198028564, "learning_rate": 5.162337662337663e-06, "loss": 0.527, "step": 155 }, { "epoch": 0.4992, "grad_norm": 2.1467268466949463, "learning_rate": 5.12987012987013e-06, "loss": 0.5227, "step": 156 }, { "epoch": 0.5024, "grad_norm": 2.4068822860717773, "learning_rate": 5.097402597402598e-06, "loss": 0.4767, "step": 157 }, { "epoch": 0.5056, "grad_norm": 2.4299392700195312, "learning_rate": 5.064935064935065e-06, "loss": 0.508, "step": 158 }, { "epoch": 0.5088, "grad_norm": 2.200183868408203, "learning_rate": 5.0324675324675334e-06, "loss": 0.5159, "step": 159 }, { "epoch": 0.512, "grad_norm": 2.351933240890503, "learning_rate": 5e-06, "loss": 0.4603, "step": 160 }, { "epoch": 0.5152, "grad_norm": 2.501340389251709, "learning_rate": 4.967532467532468e-06, "loss": 0.4853, "step": 161 }, { "epoch": 0.5184, "grad_norm": 2.415330171585083, "learning_rate": 4.935064935064935e-06, "loss": 0.5096, "step": 162 }, { "epoch": 0.5216, "grad_norm": 3.2392561435699463, "learning_rate": 4.902597402597403e-06, "loss": 0.4772, "step": 163 }, { "epoch": 0.5248, "grad_norm": 2.5991477966308594, "learning_rate": 4.870129870129871e-06, "loss": 0.4631, "step": 164 }, { "epoch": 0.528, "grad_norm": 3.1602790355682373, "learning_rate": 4.837662337662339e-06, "loss": 0.5219, "step": 165 }, { "epoch": 0.5312, "grad_norm": 3.0979714393615723, "learning_rate": 4.805194805194806e-06, "loss": 0.5795, "step": 166 }, { "epoch": 0.5344, "grad_norm": 2.6634860038757324, "learning_rate": 4.772727272727273e-06, "loss": 0.5505, "step": 167 }, { "epoch": 0.5376, "grad_norm": 2.6327335834503174, "learning_rate": 4.74025974025974e-06, "loss": 0.482, "step": 168 }, { "epoch": 0.5408, "grad_norm": 2.570263624191284, "learning_rate": 4.707792207792208e-06, "loss": 0.4346, "step": 169 }, { "epoch": 0.544, "grad_norm": 2.69173526763916, "learning_rate": 4.675324675324676e-06, "loss": 0.5307, "step": 170 }, { "epoch": 0.5472, "grad_norm": 2.4208245277404785, "learning_rate": 4.642857142857144e-06, "loss": 0.4918, "step": 171 }, { "epoch": 0.5504, "grad_norm": 2.279787063598633, "learning_rate": 4.610389610389611e-06, "loss": 0.4563, "step": 172 }, { "epoch": 0.5536, "grad_norm": 2.5185399055480957, "learning_rate": 4.5779220779220786e-06, "loss": 0.4685, "step": 173 }, { "epoch": 0.5568, "grad_norm": 2.4868409633636475, "learning_rate": 4.5454545454545455e-06, "loss": 0.4616, "step": 174 }, { "epoch": 0.56, "grad_norm": 3.0586249828338623, "learning_rate": 4.512987012987013e-06, "loss": 0.4981, "step": 175 }, { "epoch": 0.5632, "grad_norm": 2.484553575515747, "learning_rate": 4.48051948051948e-06, "loss": 0.5173, "step": 176 }, { "epoch": 0.5664, "grad_norm": 2.8379547595977783, "learning_rate": 4.448051948051948e-06, "loss": 0.5163, "step": 177 }, { "epoch": 0.5696, "grad_norm": 3.016439914703369, "learning_rate": 4.415584415584416e-06, "loss": 0.4193, "step": 178 }, { "epoch": 0.5728, "grad_norm": 3.060291290283203, "learning_rate": 4.383116883116884e-06, "loss": 0.4838, "step": 179 }, { "epoch": 0.576, "grad_norm": 2.8870880603790283, "learning_rate": 4.350649350649351e-06, "loss": 0.4921, "step": 180 }, { "epoch": 0.5792, "grad_norm": 2.58658766746521, "learning_rate": 4.3181818181818185e-06, "loss": 0.4667, "step": 181 }, { "epoch": 0.5824, "grad_norm": 2.6489756107330322, "learning_rate": 4.2857142857142855e-06, "loss": 0.4268, "step": 182 }, { "epoch": 0.5856, "grad_norm": 3.263758897781372, "learning_rate": 4.253246753246753e-06, "loss": 0.4556, "step": 183 }, { "epoch": 0.5888, "grad_norm": 3.113347053527832, "learning_rate": 4.220779220779221e-06, "loss": 0.4584, "step": 184 }, { "epoch": 0.592, "grad_norm": 2.749751091003418, "learning_rate": 4.188311688311689e-06, "loss": 0.4366, "step": 185 }, { "epoch": 0.5952, "grad_norm": 2.726654052734375, "learning_rate": 4.155844155844157e-06, "loss": 0.4574, "step": 186 }, { "epoch": 0.5984, "grad_norm": 2.737936019897461, "learning_rate": 4.123376623376624e-06, "loss": 0.4458, "step": 187 }, { "epoch": 0.6016, "grad_norm": 2.5224568843841553, "learning_rate": 4.0909090909090915e-06, "loss": 0.4375, "step": 188 }, { "epoch": 0.6048, "grad_norm": 3.1188995838165283, "learning_rate": 4.0584415584415584e-06, "loss": 0.4735, "step": 189 }, { "epoch": 0.608, "grad_norm": 3.059140920639038, "learning_rate": 4.025974025974026e-06, "loss": 0.4596, "step": 190 }, { "epoch": 0.6112, "grad_norm": 2.5312676429748535, "learning_rate": 3.993506493506494e-06, "loss": 0.4069, "step": 191 }, { "epoch": 0.6144, "grad_norm": 2.821013927459717, "learning_rate": 3.961038961038962e-06, "loss": 0.4671, "step": 192 }, { "epoch": 0.6176, "grad_norm": 3.2510814666748047, "learning_rate": 3.928571428571429e-06, "loss": 0.4492, "step": 193 }, { "epoch": 0.6208, "grad_norm": 3.137798547744751, "learning_rate": 3.896103896103897e-06, "loss": 0.379, "step": 194 }, { "epoch": 0.624, "grad_norm": 2.846541404724121, "learning_rate": 3.863636363636364e-06, "loss": 0.3914, "step": 195 }, { "epoch": 0.6272, "grad_norm": 3.1899781227111816, "learning_rate": 3.831168831168831e-06, "loss": 0.4699, "step": 196 }, { "epoch": 0.6304, "grad_norm": 3.118363857269287, "learning_rate": 3.798701298701299e-06, "loss": 0.4115, "step": 197 }, { "epoch": 0.6336, "grad_norm": 3.8301401138305664, "learning_rate": 3.7662337662337666e-06, "loss": 0.4093, "step": 198 }, { "epoch": 0.6368, "grad_norm": 3.856610059738159, "learning_rate": 3.733766233766234e-06, "loss": 0.4603, "step": 199 }, { "epoch": 0.64, "grad_norm": 4.301602363586426, "learning_rate": 3.701298701298702e-06, "loss": 0.4026, "step": 200 }, { "epoch": 0.6432, "grad_norm": 3.5105443000793457, "learning_rate": 3.6688311688311688e-06, "loss": 0.5042, "step": 201 }, { "epoch": 0.6464, "grad_norm": 3.22503399848938, "learning_rate": 3.6363636363636366e-06, "loss": 0.4417, "step": 202 }, { "epoch": 0.6496, "grad_norm": 3.0266528129577637, "learning_rate": 3.603896103896104e-06, "loss": 0.4204, "step": 203 }, { "epoch": 0.6528, "grad_norm": 3.440232992172241, "learning_rate": 3.5714285714285718e-06, "loss": 0.4388, "step": 204 }, { "epoch": 0.656, "grad_norm": 3.1369011402130127, "learning_rate": 3.538961038961039e-06, "loss": 0.3727, "step": 205 }, { "epoch": 0.6592, "grad_norm": 3.406059503555298, "learning_rate": 3.506493506493507e-06, "loss": 0.3845, "step": 206 }, { "epoch": 0.6624, "grad_norm": 3.070361614227295, "learning_rate": 3.474025974025974e-06, "loss": 0.3876, "step": 207 }, { "epoch": 0.6656, "grad_norm": 3.1251604557037354, "learning_rate": 3.4415584415584418e-06, "loss": 0.4594, "step": 208 }, { "epoch": 0.6688, "grad_norm": 3.755077838897705, "learning_rate": 3.409090909090909e-06, "loss": 0.4547, "step": 209 }, { "epoch": 0.672, "grad_norm": 3.0717198848724365, "learning_rate": 3.376623376623377e-06, "loss": 0.3853, "step": 210 }, { "epoch": 0.6752, "grad_norm": 3.4697611331939697, "learning_rate": 3.3441558441558443e-06, "loss": 0.3974, "step": 211 }, { "epoch": 0.6784, "grad_norm": 3.2027981281280518, "learning_rate": 3.311688311688312e-06, "loss": 0.3518, "step": 212 }, { "epoch": 0.6816, "grad_norm": 3.3500893115997314, "learning_rate": 3.27922077922078e-06, "loss": 0.4752, "step": 213 }, { "epoch": 0.6848, "grad_norm": 3.9136343002319336, "learning_rate": 3.246753246753247e-06, "loss": 0.4509, "step": 214 }, { "epoch": 0.688, "grad_norm": 3.5852739810943604, "learning_rate": 3.2142857142857147e-06, "loss": 0.3698, "step": 215 }, { "epoch": 0.6912, "grad_norm": 2.958652973175049, "learning_rate": 3.181818181818182e-06, "loss": 0.404, "step": 216 }, { "epoch": 0.6944, "grad_norm": 2.812950611114502, "learning_rate": 3.14935064935065e-06, "loss": 0.323, "step": 217 }, { "epoch": 0.6976, "grad_norm": 3.6003265380859375, "learning_rate": 3.116883116883117e-06, "loss": 0.3955, "step": 218 }, { "epoch": 0.7008, "grad_norm": 3.6587672233581543, "learning_rate": 3.0844155844155847e-06, "loss": 0.386, "step": 219 }, { "epoch": 0.704, "grad_norm": 3.6365511417388916, "learning_rate": 3.051948051948052e-06, "loss": 0.3928, "step": 220 }, { "epoch": 0.7072, "grad_norm": 3.3309619426727295, "learning_rate": 3.01948051948052e-06, "loss": 0.4252, "step": 221 }, { "epoch": 0.7104, "grad_norm": 3.8079402446746826, "learning_rate": 2.9870129870129873e-06, "loss": 0.3709, "step": 222 }, { "epoch": 0.7136, "grad_norm": 3.5054643154144287, "learning_rate": 2.954545454545455e-06, "loss": 0.3754, "step": 223 }, { "epoch": 0.7168, "grad_norm": 3.159898519515991, "learning_rate": 2.922077922077922e-06, "loss": 0.3728, "step": 224 }, { "epoch": 0.72, "grad_norm": 3.584296226501465, "learning_rate": 2.88961038961039e-06, "loss": 0.4081, "step": 225 }, { "epoch": 0.7232, "grad_norm": 3.1787877082824707, "learning_rate": 2.8571428571428573e-06, "loss": 0.3665, "step": 226 }, { "epoch": 0.7264, "grad_norm": 2.677522897720337, "learning_rate": 2.824675324675325e-06, "loss": 0.2992, "step": 227 }, { "epoch": 0.7296, "grad_norm": 3.978600263595581, "learning_rate": 2.7922077922077925e-06, "loss": 0.3937, "step": 228 }, { "epoch": 0.7328, "grad_norm": 3.122856855392456, "learning_rate": 2.7597402597402603e-06, "loss": 0.3803, "step": 229 }, { "epoch": 0.736, "grad_norm": 3.339524507522583, "learning_rate": 2.7272727272727272e-06, "loss": 0.386, "step": 230 }, { "epoch": 0.7392, "grad_norm": 3.5689985752105713, "learning_rate": 2.694805194805195e-06, "loss": 0.3727, "step": 231 }, { "epoch": 0.7424, "grad_norm": 3.6701812744140625, "learning_rate": 2.6623376623376624e-06, "loss": 0.3603, "step": 232 }, { "epoch": 0.7456, "grad_norm": 3.027395486831665, "learning_rate": 2.6298701298701302e-06, "loss": 0.3522, "step": 233 }, { "epoch": 0.7488, "grad_norm": 3.4681172370910645, "learning_rate": 2.597402597402597e-06, "loss": 0.4836, "step": 234 }, { "epoch": 0.752, "grad_norm": 3.5065665245056152, "learning_rate": 2.564935064935065e-06, "loss": 0.3834, "step": 235 }, { "epoch": 0.7552, "grad_norm": 4.68280029296875, "learning_rate": 2.5324675324675324e-06, "loss": 0.3449, "step": 236 }, { "epoch": 0.7584, "grad_norm": 4.197664737701416, "learning_rate": 2.5e-06, "loss": 0.3756, "step": 237 }, { "epoch": 0.7616, "grad_norm": 3.1506834030151367, "learning_rate": 2.4675324675324676e-06, "loss": 0.3334, "step": 238 }, { "epoch": 0.7648, "grad_norm": 4.261498928070068, "learning_rate": 2.4350649350649354e-06, "loss": 0.3947, "step": 239 }, { "epoch": 0.768, "grad_norm": 3.613182783126831, "learning_rate": 2.402597402597403e-06, "loss": 0.291, "step": 240 }, { "epoch": 0.7712, "grad_norm": 3.1160547733306885, "learning_rate": 2.37012987012987e-06, "loss": 0.3419, "step": 241 }, { "epoch": 0.7744, "grad_norm": 3.765911102294922, "learning_rate": 2.337662337662338e-06, "loss": 0.3947, "step": 242 }, { "epoch": 0.7776, "grad_norm": 3.1016063690185547, "learning_rate": 2.3051948051948054e-06, "loss": 0.3245, "step": 243 }, { "epoch": 0.7808, "grad_norm": 3.860008955001831, "learning_rate": 2.2727272727272728e-06, "loss": 0.3445, "step": 244 }, { "epoch": 0.784, "grad_norm": 3.2103567123413086, "learning_rate": 2.24025974025974e-06, "loss": 0.2996, "step": 245 }, { "epoch": 0.7872, "grad_norm": 3.3398399353027344, "learning_rate": 2.207792207792208e-06, "loss": 0.378, "step": 246 }, { "epoch": 0.7904, "grad_norm": 3.529299020767212, "learning_rate": 2.1753246753246753e-06, "loss": 0.3466, "step": 247 }, { "epoch": 0.7936, "grad_norm": 3.3010692596435547, "learning_rate": 2.1428571428571427e-06, "loss": 0.2845, "step": 248 }, { "epoch": 0.7968, "grad_norm": 3.961313486099243, "learning_rate": 2.1103896103896105e-06, "loss": 0.3889, "step": 249 }, { "epoch": 0.8, "grad_norm": 3.386698007583618, "learning_rate": 2.0779220779220784e-06, "loss": 0.2975, "step": 250 }, { "epoch": 0.8032, "grad_norm": 3.718106269836426, "learning_rate": 2.0454545454545457e-06, "loss": 0.3472, "step": 251 }, { "epoch": 0.8064, "grad_norm": 3.8275556564331055, "learning_rate": 2.012987012987013e-06, "loss": 0.3176, "step": 252 }, { "epoch": 0.8096, "grad_norm": 3.5234034061431885, "learning_rate": 1.980519480519481e-06, "loss": 0.3279, "step": 253 }, { "epoch": 0.8128, "grad_norm": 4.082868576049805, "learning_rate": 1.9480519480519483e-06, "loss": 0.3111, "step": 254 }, { "epoch": 0.816, "grad_norm": 3.8824350833892822, "learning_rate": 1.9155844155844157e-06, "loss": 0.3334, "step": 255 }, { "epoch": 0.8192, "grad_norm": 3.439039707183838, "learning_rate": 1.8831168831168833e-06, "loss": 0.3136, "step": 256 }, { "epoch": 0.8224, "grad_norm": 4.407830715179443, "learning_rate": 1.850649350649351e-06, "loss": 0.3089, "step": 257 }, { "epoch": 0.8256, "grad_norm": 4.113597869873047, "learning_rate": 1.8181818181818183e-06, "loss": 0.3666, "step": 258 }, { "epoch": 0.8288, "grad_norm": 4.972888469696045, "learning_rate": 1.7857142857142859e-06, "loss": 0.335, "step": 259 }, { "epoch": 0.832, "grad_norm": 4.1458048820495605, "learning_rate": 1.7532467532467535e-06, "loss": 0.3818, "step": 260 }, { "epoch": 0.8352, "grad_norm": 4.115975379943848, "learning_rate": 1.7207792207792209e-06, "loss": 0.3342, "step": 261 }, { "epoch": 0.8384, "grad_norm": 3.344860076904297, "learning_rate": 1.6883116883116885e-06, "loss": 0.3054, "step": 262 }, { "epoch": 0.8416, "grad_norm": 3.544015407562256, "learning_rate": 1.655844155844156e-06, "loss": 0.3118, "step": 263 }, { "epoch": 0.8448, "grad_norm": 3.5486655235290527, "learning_rate": 1.6233766233766235e-06, "loss": 0.3515, "step": 264 }, { "epoch": 0.848, "grad_norm": 2.9218409061431885, "learning_rate": 1.590909090909091e-06, "loss": 0.3812, "step": 265 }, { "epoch": 0.8512, "grad_norm": 4.031822204589844, "learning_rate": 1.5584415584415584e-06, "loss": 0.3563, "step": 266 }, { "epoch": 0.8544, "grad_norm": 3.7376978397369385, "learning_rate": 1.525974025974026e-06, "loss": 0.3629, "step": 267 }, { "epoch": 0.8576, "grad_norm": 3.612359046936035, "learning_rate": 1.4935064935064936e-06, "loss": 0.3124, "step": 268 }, { "epoch": 0.8608, "grad_norm": 4.43179988861084, "learning_rate": 1.461038961038961e-06, "loss": 0.378, "step": 269 }, { "epoch": 0.864, "grad_norm": 4.0608391761779785, "learning_rate": 1.4285714285714286e-06, "loss": 0.342, "step": 270 }, { "epoch": 0.8672, "grad_norm": 2.922579050064087, "learning_rate": 1.3961038961038962e-06, "loss": 0.2483, "step": 271 }, { "epoch": 0.8704, "grad_norm": 4.325706958770752, "learning_rate": 1.3636363636363636e-06, "loss": 0.3178, "step": 272 }, { "epoch": 0.8736, "grad_norm": 3.271789312362671, "learning_rate": 1.3311688311688312e-06, "loss": 0.2458, "step": 273 }, { "epoch": 0.8768, "grad_norm": 3.6045310497283936, "learning_rate": 1.2987012987012986e-06, "loss": 0.3291, "step": 274 }, { "epoch": 0.88, "grad_norm": 3.202625036239624, "learning_rate": 1.2662337662337662e-06, "loss": 0.3202, "step": 275 }, { "epoch": 0.8832, "grad_norm": 3.570035696029663, "learning_rate": 1.2337662337662338e-06, "loss": 0.3006, "step": 276 }, { "epoch": 0.8864, "grad_norm": 3.309915065765381, "learning_rate": 1.2012987012987014e-06, "loss": 0.3388, "step": 277 }, { "epoch": 0.8896, "grad_norm": 4.571963310241699, "learning_rate": 1.168831168831169e-06, "loss": 0.3483, "step": 278 }, { "epoch": 0.8928, "grad_norm": 3.655355930328369, "learning_rate": 1.1363636363636364e-06, "loss": 0.2964, "step": 279 }, { "epoch": 0.896, "grad_norm": 3.7896673679351807, "learning_rate": 1.103896103896104e-06, "loss": 0.2922, "step": 280 }, { "epoch": 0.8992, "grad_norm": 4.307418346405029, "learning_rate": 1.0714285714285714e-06, "loss": 0.2949, "step": 281 }, { "epoch": 0.9024, "grad_norm": 3.3083884716033936, "learning_rate": 1.0389610389610392e-06, "loss": 0.3209, "step": 282 }, { "epoch": 0.9056, "grad_norm": 4.244633197784424, "learning_rate": 1.0064935064935066e-06, "loss": 0.2946, "step": 283 }, { "epoch": 0.9088, "grad_norm": 3.999847888946533, "learning_rate": 9.740259740259742e-07, "loss": 0.3211, "step": 284 }, { "epoch": 0.912, "grad_norm": 3.737175226211548, "learning_rate": 9.415584415584417e-07, "loss": 0.226, "step": 285 }, { "epoch": 0.9152, "grad_norm": 3.8389108180999756, "learning_rate": 9.090909090909091e-07, "loss": 0.278, "step": 286 }, { "epoch": 0.9184, "grad_norm": 3.7956488132476807, "learning_rate": 8.766233766233767e-07, "loss": 0.2682, "step": 287 }, { "epoch": 0.9216, "grad_norm": 3.7749340534210205, "learning_rate": 8.441558441558442e-07, "loss": 0.3251, "step": 288 }, { "epoch": 0.9248, "grad_norm": 5.137296676635742, "learning_rate": 8.116883116883117e-07, "loss": 0.401, "step": 289 }, { "epoch": 0.928, "grad_norm": 3.7601916790008545, "learning_rate": 7.792207792207792e-07, "loss": 0.3065, "step": 290 }, { "epoch": 0.9312, "grad_norm": 4.77811861038208, "learning_rate": 7.467532467532468e-07, "loss": 0.3074, "step": 291 }, { "epoch": 0.9344, "grad_norm": 3.95403790473938, "learning_rate": 7.142857142857143e-07, "loss": 0.3167, "step": 292 }, { "epoch": 0.9376, "grad_norm": 3.9109158515930176, "learning_rate": 6.818181818181818e-07, "loss": 0.354, "step": 293 }, { "epoch": 0.9408, "grad_norm": 3.9957079887390137, "learning_rate": 6.493506493506493e-07, "loss": 0.3079, "step": 294 }, { "epoch": 0.944, "grad_norm": 4.9614739418029785, "learning_rate": 6.168831168831169e-07, "loss": 0.3258, "step": 295 }, { "epoch": 0.9472, "grad_norm": 3.6851398944854736, "learning_rate": 5.844155844155845e-07, "loss": 0.2817, "step": 296 }, { "epoch": 0.9504, "grad_norm": 3.8794970512390137, "learning_rate": 5.51948051948052e-07, "loss": 0.2766, "step": 297 }, { "epoch": 0.9536, "grad_norm": 3.5052332878112793, "learning_rate": 5.194805194805196e-07, "loss": 0.306, "step": 298 }, { "epoch": 0.9568, "grad_norm": 3.526350259780884, "learning_rate": 4.870129870129871e-07, "loss": 0.3138, "step": 299 }, { "epoch": 0.96, "grad_norm": 4.306396007537842, "learning_rate": 4.5454545454545457e-07, "loss": 0.3048, "step": 300 }, { "epoch": 0.9632, "grad_norm": 3.977013111114502, "learning_rate": 4.220779220779221e-07, "loss": 0.2984, "step": 301 }, { "epoch": 0.9664, "grad_norm": 3.4203877449035645, "learning_rate": 3.896103896103896e-07, "loss": 0.3002, "step": 302 }, { "epoch": 0.9696, "grad_norm": 3.6374948024749756, "learning_rate": 3.5714285714285716e-07, "loss": 0.2961, "step": 303 }, { "epoch": 0.9728, "grad_norm": 3.726452350616455, "learning_rate": 3.2467532467532465e-07, "loss": 0.2844, "step": 304 }, { "epoch": 0.976, "grad_norm": 4.53134822845459, "learning_rate": 2.9220779220779225e-07, "loss": 0.2937, "step": 305 }, { "epoch": 0.9792, "grad_norm": 4.388988494873047, "learning_rate": 2.597402597402598e-07, "loss": 0.3295, "step": 306 }, { "epoch": 0.9824, "grad_norm": 3.973243236541748, "learning_rate": 2.2727272727272729e-07, "loss": 0.2991, "step": 307 }, { "epoch": 0.9856, "grad_norm": 4.007715225219727, "learning_rate": 1.948051948051948e-07, "loss": 0.2812, "step": 308 }, { "epoch": 0.9888, "grad_norm": 3.9321532249450684, "learning_rate": 1.6233766233766232e-07, "loss": 0.2825, "step": 309 }, { "epoch": 0.992, "grad_norm": 4.078246593475342, "learning_rate": 1.298701298701299e-07, "loss": 0.2848, "step": 310 }, { "epoch": 0.9952, "grad_norm": 4.222702980041504, "learning_rate": 9.74025974025974e-08, "loss": 0.2945, "step": 311 }, { "epoch": 0.9984, "grad_norm": 4.369868755340576, "learning_rate": 6.493506493506495e-08, "loss": 0.2944, "step": 312 }, { "epoch": 1.0, "grad_norm": 5.1346235275268555, "learning_rate": 3.2467532467532474e-08, "loss": 0.2697, "step": 313 } ], "logging_steps": 1, "max_steps": 313, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3318933877534515e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }