{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018140589569160998, "grad_norm": 0.95703125, "learning_rate": 0.0, "loss": 2.6996, "step": 1 }, { "epoch": 0.036281179138321996, "grad_norm": 0.9921875, "learning_rate": 4.000000000000001e-06, "loss": 2.66, "step": 2 }, { "epoch": 0.05442176870748299, "grad_norm": 1.0703125, "learning_rate": 8.000000000000001e-06, "loss": 2.6808, "step": 3 }, { "epoch": 0.07256235827664399, "grad_norm": 1.0078125, "learning_rate": 1.2e-05, "loss": 2.6952, "step": 4 }, { "epoch": 0.09070294784580499, "grad_norm": 0.94140625, "learning_rate": 1.6000000000000003e-05, "loss": 2.6385, "step": 5 }, { "epoch": 0.10884353741496598, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.6775, "step": 6 }, { "epoch": 0.12698412698412698, "grad_norm": 0.8828125, "learning_rate": 1.9877300613496935e-05, "loss": 2.6544, "step": 7 }, { "epoch": 0.14512471655328799, "grad_norm": 0.87109375, "learning_rate": 1.9754601226993868e-05, "loss": 2.6104, "step": 8 }, { "epoch": 0.16326530612244897, "grad_norm": 0.84375, "learning_rate": 1.96319018404908e-05, "loss": 2.5816, "step": 9 }, { "epoch": 0.18140589569160998, "grad_norm": 0.921875, "learning_rate": 1.9509202453987733e-05, "loss": 2.6088, "step": 10 }, { "epoch": 0.19954648526077098, "grad_norm": 0.85546875, "learning_rate": 1.9386503067484663e-05, "loss": 2.5572, "step": 11 }, { "epoch": 0.21768707482993196, "grad_norm": 0.8046875, "learning_rate": 1.9263803680981596e-05, "loss": 2.5599, "step": 12 }, { "epoch": 0.23582766439909297, "grad_norm": 0.8125, "learning_rate": 1.914110429447853e-05, "loss": 2.5125, "step": 13 }, { "epoch": 0.25396825396825395, "grad_norm": 0.78515625, "learning_rate": 1.9018404907975462e-05, "loss": 2.551, "step": 14 }, { "epoch": 0.272108843537415, "grad_norm": 0.73046875, "learning_rate": 1.8895705521472395e-05, "loss": 2.5148, "step": 15 }, { "epoch": 0.29024943310657597, "grad_norm": 0.74609375, "learning_rate": 1.8773006134969328e-05, "loss": 2.4706, "step": 16 }, { "epoch": 0.30839002267573695, "grad_norm": 0.76171875, "learning_rate": 1.8650306748466257e-05, "loss": 2.435, "step": 17 }, { "epoch": 0.32653061224489793, "grad_norm": 0.7109375, "learning_rate": 1.852760736196319e-05, "loss": 2.4895, "step": 18 }, { "epoch": 0.34467120181405897, "grad_norm": 0.6953125, "learning_rate": 1.8404907975460123e-05, "loss": 2.4438, "step": 19 }, { "epoch": 0.36281179138321995, "grad_norm": 0.703125, "learning_rate": 1.828220858895706e-05, "loss": 2.4875, "step": 20 }, { "epoch": 0.38095238095238093, "grad_norm": 0.67578125, "learning_rate": 1.815950920245399e-05, "loss": 2.4617, "step": 21 }, { "epoch": 0.39909297052154197, "grad_norm": 0.734375, "learning_rate": 1.8036809815950922e-05, "loss": 2.3989, "step": 22 }, { "epoch": 0.41723356009070295, "grad_norm": 0.703125, "learning_rate": 1.7914110429447855e-05, "loss": 2.3609, "step": 23 }, { "epoch": 0.43537414965986393, "grad_norm": 0.671875, "learning_rate": 1.7791411042944788e-05, "loss": 2.3001, "step": 24 }, { "epoch": 0.45351473922902497, "grad_norm": 0.66796875, "learning_rate": 1.766871165644172e-05, "loss": 2.3645, "step": 25 }, { "epoch": 0.47165532879818595, "grad_norm": 0.671875, "learning_rate": 1.7546012269938654e-05, "loss": 2.3013, "step": 26 }, { "epoch": 0.4897959183673469, "grad_norm": 0.69921875, "learning_rate": 1.7423312883435583e-05, "loss": 2.4127, "step": 27 }, { "epoch": 0.5079365079365079, "grad_norm": 0.6640625, "learning_rate": 1.7300613496932516e-05, "loss": 2.3384, "step": 28 }, { "epoch": 0.5260770975056689, "grad_norm": 0.66015625, "learning_rate": 1.717791411042945e-05, "loss": 2.2717, "step": 29 }, { "epoch": 0.54421768707483, "grad_norm": 0.6953125, "learning_rate": 1.7055214723926382e-05, "loss": 2.3407, "step": 30 }, { "epoch": 0.562358276643991, "grad_norm": 0.640625, "learning_rate": 1.6932515337423315e-05, "loss": 2.2607, "step": 31 }, { "epoch": 0.5804988662131519, "grad_norm": 0.671875, "learning_rate": 1.6809815950920248e-05, "loss": 2.2913, "step": 32 }, { "epoch": 0.5986394557823129, "grad_norm": 0.69921875, "learning_rate": 1.6687116564417178e-05, "loss": 2.3287, "step": 33 }, { "epoch": 0.6167800453514739, "grad_norm": 0.63671875, "learning_rate": 1.656441717791411e-05, "loss": 2.2778, "step": 34 }, { "epoch": 0.6349206349206349, "grad_norm": 0.6484375, "learning_rate": 1.6441717791411043e-05, "loss": 2.2741, "step": 35 }, { "epoch": 0.6530612244897959, "grad_norm": 0.63671875, "learning_rate": 1.6319018404907976e-05, "loss": 2.2311, "step": 36 }, { "epoch": 0.671201814058957, "grad_norm": 0.6796875, "learning_rate": 1.619631901840491e-05, "loss": 2.2988, "step": 37 }, { "epoch": 0.6893424036281179, "grad_norm": 0.6328125, "learning_rate": 1.6073619631901842e-05, "loss": 2.2643, "step": 38 }, { "epoch": 0.7074829931972789, "grad_norm": 0.6796875, "learning_rate": 1.5950920245398772e-05, "loss": 2.2546, "step": 39 }, { "epoch": 0.7256235827664399, "grad_norm": 0.671875, "learning_rate": 1.5828220858895708e-05, "loss": 2.2735, "step": 40 }, { "epoch": 0.7437641723356009, "grad_norm": 0.578125, "learning_rate": 1.570552147239264e-05, "loss": 2.2546, "step": 41 }, { "epoch": 0.7619047619047619, "grad_norm": 0.6484375, "learning_rate": 1.5582822085889574e-05, "loss": 2.2388, "step": 42 }, { "epoch": 0.780045351473923, "grad_norm": 0.68359375, "learning_rate": 1.5460122699386504e-05, "loss": 2.2796, "step": 43 }, { "epoch": 0.7981859410430839, "grad_norm": 0.62109375, "learning_rate": 1.5337423312883436e-05, "loss": 2.2356, "step": 44 }, { "epoch": 0.8163265306122449, "grad_norm": 0.6171875, "learning_rate": 1.5214723926380371e-05, "loss": 2.2562, "step": 45 }, { "epoch": 0.8344671201814059, "grad_norm": 0.66015625, "learning_rate": 1.50920245398773e-05, "loss": 2.2189, "step": 46 }, { "epoch": 0.8526077097505669, "grad_norm": 0.64453125, "learning_rate": 1.4969325153374235e-05, "loss": 2.2293, "step": 47 }, { "epoch": 0.8707482993197279, "grad_norm": 0.6328125, "learning_rate": 1.4846625766871168e-05, "loss": 2.1792, "step": 48 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6484375, "learning_rate": 1.47239263803681e-05, "loss": 2.2333, "step": 49 }, { "epoch": 0.9070294784580499, "grad_norm": 0.58984375, "learning_rate": 1.4601226993865032e-05, "loss": 2.2005, "step": 50 }, { "epoch": 0.9251700680272109, "grad_norm": 0.70703125, "learning_rate": 1.4478527607361965e-05, "loss": 2.2617, "step": 51 }, { "epoch": 0.9433106575963719, "grad_norm": 0.64453125, "learning_rate": 1.4355828220858897e-05, "loss": 2.2125, "step": 52 }, { "epoch": 0.9614512471655329, "grad_norm": 0.734375, "learning_rate": 1.423312883435583e-05, "loss": 2.237, "step": 53 }, { "epoch": 0.9795918367346939, "grad_norm": 0.703125, "learning_rate": 1.4110429447852763e-05, "loss": 2.1756, "step": 54 }, { "epoch": 0.9977324263038548, "grad_norm": 0.62890625, "learning_rate": 1.3987730061349694e-05, "loss": 2.2037, "step": 55 }, { "epoch": 1.0, "grad_norm": 1.9765625, "learning_rate": 1.3865030674846627e-05, "loss": 2.1214, "step": 56 }, { "epoch": 1.0, "eval_loss": 2.2088165283203125, "eval_model_preparation_time": 0.0224, "eval_runtime": 2.7857, "eval_samples_per_second": 35.179, "eval_steps_per_second": 17.59, "step": 56 }, { "epoch": 1.018140589569161, "grad_norm": 0.58984375, "learning_rate": 1.374233128834356e-05, "loss": 2.1552, "step": 57 }, { "epoch": 1.036281179138322, "grad_norm": 0.62890625, "learning_rate": 1.3619631901840491e-05, "loss": 2.1247, "step": 58 }, { "epoch": 1.054421768707483, "grad_norm": 0.6171875, "learning_rate": 1.3496932515337424e-05, "loss": 2.2268, "step": 59 }, { "epoch": 1.072562358276644, "grad_norm": 0.65234375, "learning_rate": 1.3374233128834357e-05, "loss": 2.1801, "step": 60 }, { "epoch": 1.090702947845805, "grad_norm": 0.65625, "learning_rate": 1.3251533742331288e-05, "loss": 2.1991, "step": 61 }, { "epoch": 1.1088435374149659, "grad_norm": 0.70703125, "learning_rate": 1.3128834355828221e-05, "loss": 2.1206, "step": 62 }, { "epoch": 1.126984126984127, "grad_norm": 0.73828125, "learning_rate": 1.3006134969325156e-05, "loss": 2.1545, "step": 63 }, { "epoch": 1.145124716553288, "grad_norm": 0.65234375, "learning_rate": 1.2883435582822085e-05, "loss": 2.1574, "step": 64 }, { "epoch": 1.163265306122449, "grad_norm": 0.62890625, "learning_rate": 1.276073619631902e-05, "loss": 2.1384, "step": 65 }, { "epoch": 1.18140589569161, "grad_norm": 0.66015625, "learning_rate": 1.2638036809815953e-05, "loss": 2.1563, "step": 66 }, { "epoch": 1.199546485260771, "grad_norm": 0.69921875, "learning_rate": 1.2515337423312886e-05, "loss": 2.1593, "step": 67 }, { "epoch": 1.217687074829932, "grad_norm": 0.76953125, "learning_rate": 1.2392638036809817e-05, "loss": 2.1628, "step": 68 }, { "epoch": 1.235827664399093, "grad_norm": 0.66796875, "learning_rate": 1.226993865030675e-05, "loss": 2.1392, "step": 69 }, { "epoch": 1.253968253968254, "grad_norm": 0.7578125, "learning_rate": 1.2147239263803683e-05, "loss": 2.2247, "step": 70 }, { "epoch": 1.272108843537415, "grad_norm": 0.71484375, "learning_rate": 1.2024539877300614e-05, "loss": 2.1673, "step": 71 }, { "epoch": 1.290249433106576, "grad_norm": 0.61328125, "learning_rate": 1.1901840490797547e-05, "loss": 2.1676, "step": 72 }, { "epoch": 1.308390022675737, "grad_norm": 0.69140625, "learning_rate": 1.177914110429448e-05, "loss": 2.0911, "step": 73 }, { "epoch": 1.3265306122448979, "grad_norm": 0.69140625, "learning_rate": 1.1656441717791411e-05, "loss": 2.1493, "step": 74 }, { "epoch": 1.344671201814059, "grad_norm": 0.6875, "learning_rate": 1.1533742331288344e-05, "loss": 2.1459, "step": 75 }, { "epoch": 1.36281179138322, "grad_norm": 0.7421875, "learning_rate": 1.1411042944785277e-05, "loss": 2.0973, "step": 76 }, { "epoch": 1.380952380952381, "grad_norm": 0.69921875, "learning_rate": 1.1288343558282208e-05, "loss": 2.0893, "step": 77 }, { "epoch": 1.399092970521542, "grad_norm": 0.73046875, "learning_rate": 1.1165644171779141e-05, "loss": 2.1779, "step": 78 }, { "epoch": 1.417233560090703, "grad_norm": 0.6796875, "learning_rate": 1.1042944785276076e-05, "loss": 2.0661, "step": 79 }, { "epoch": 1.435374149659864, "grad_norm": 0.6796875, "learning_rate": 1.0920245398773005e-05, "loss": 2.1201, "step": 80 }, { "epoch": 1.4535147392290249, "grad_norm": 0.65234375, "learning_rate": 1.079754601226994e-05, "loss": 2.0765, "step": 81 }, { "epoch": 1.471655328798186, "grad_norm": 0.66015625, "learning_rate": 1.0674846625766873e-05, "loss": 2.091, "step": 82 }, { "epoch": 1.489795918367347, "grad_norm": 0.6796875, "learning_rate": 1.0552147239263804e-05, "loss": 2.1094, "step": 83 }, { "epoch": 1.507936507936508, "grad_norm": 0.7109375, "learning_rate": 1.0429447852760737e-05, "loss": 2.2231, "step": 84 }, { "epoch": 1.5260770975056688, "grad_norm": 0.66015625, "learning_rate": 1.030674846625767e-05, "loss": 2.1197, "step": 85 }, { "epoch": 1.54421768707483, "grad_norm": 0.7421875, "learning_rate": 1.0184049079754601e-05, "loss": 2.1248, "step": 86 }, { "epoch": 1.562358276643991, "grad_norm": 0.671875, "learning_rate": 1.0061349693251534e-05, "loss": 2.157, "step": 87 }, { "epoch": 1.5804988662131518, "grad_norm": 0.67578125, "learning_rate": 9.938650306748467e-06, "loss": 2.1562, "step": 88 }, { "epoch": 1.598639455782313, "grad_norm": 0.6796875, "learning_rate": 9.8159509202454e-06, "loss": 2.095, "step": 89 }, { "epoch": 1.616780045351474, "grad_norm": 0.72265625, "learning_rate": 9.693251533742331e-06, "loss": 2.1363, "step": 90 }, { "epoch": 1.6349206349206349, "grad_norm": 0.6328125, "learning_rate": 9.570552147239264e-06, "loss": 2.0856, "step": 91 }, { "epoch": 1.6530612244897958, "grad_norm": 0.6796875, "learning_rate": 9.447852760736197e-06, "loss": 2.1142, "step": 92 }, { "epoch": 1.671201814058957, "grad_norm": 0.72265625, "learning_rate": 9.325153374233129e-06, "loss": 2.1567, "step": 93 }, { "epoch": 1.689342403628118, "grad_norm": 0.7890625, "learning_rate": 9.202453987730062e-06, "loss": 2.1214, "step": 94 }, { "epoch": 1.7074829931972788, "grad_norm": 0.703125, "learning_rate": 9.079754601226994e-06, "loss": 2.1152, "step": 95 }, { "epoch": 1.72562358276644, "grad_norm": 0.69921875, "learning_rate": 8.957055214723927e-06, "loss": 2.0999, "step": 96 }, { "epoch": 1.743764172335601, "grad_norm": 0.6484375, "learning_rate": 8.83435582822086e-06, "loss": 2.1361, "step": 97 }, { "epoch": 1.7619047619047619, "grad_norm": 0.75390625, "learning_rate": 8.711656441717792e-06, "loss": 2.1357, "step": 98 }, { "epoch": 1.780045351473923, "grad_norm": 0.7421875, "learning_rate": 8.588957055214725e-06, "loss": 2.1405, "step": 99 }, { "epoch": 1.798185941043084, "grad_norm": 0.73046875, "learning_rate": 8.466257668711658e-06, "loss": 2.0975, "step": 100 }, { "epoch": 1.816326530612245, "grad_norm": 0.68359375, "learning_rate": 8.343558282208589e-06, "loss": 2.1457, "step": 101 }, { "epoch": 1.8344671201814058, "grad_norm": 0.703125, "learning_rate": 8.220858895705522e-06, "loss": 2.068, "step": 102 }, { "epoch": 1.8526077097505669, "grad_norm": 0.703125, "learning_rate": 8.098159509202455e-06, "loss": 2.1473, "step": 103 }, { "epoch": 1.870748299319728, "grad_norm": 0.66015625, "learning_rate": 7.975460122699386e-06, "loss": 2.0983, "step": 104 }, { "epoch": 1.8888888888888888, "grad_norm": 0.7265625, "learning_rate": 7.85276073619632e-06, "loss": 2.0952, "step": 105 }, { "epoch": 1.90702947845805, "grad_norm": 0.7578125, "learning_rate": 7.730061349693252e-06, "loss": 2.086, "step": 106 }, { "epoch": 1.925170068027211, "grad_norm": 0.69140625, "learning_rate": 7.6073619631901856e-06, "loss": 2.1086, "step": 107 }, { "epoch": 1.943310657596372, "grad_norm": 0.671875, "learning_rate": 7.484662576687118e-06, "loss": 2.092, "step": 108 }, { "epoch": 1.9614512471655328, "grad_norm": 0.77734375, "learning_rate": 7.36196319018405e-06, "loss": 2.1334, "step": 109 }, { "epoch": 1.9795918367346939, "grad_norm": 0.77734375, "learning_rate": 7.239263803680983e-06, "loss": 2.0644, "step": 110 }, { "epoch": 1.997732426303855, "grad_norm": 0.703125, "learning_rate": 7.116564417177915e-06, "loss": 2.0589, "step": 111 }, { "epoch": 2.0, "grad_norm": 1.6953125, "learning_rate": 6.993865030674847e-06, "loss": 2.0806, "step": 112 }, { "epoch": 2.0, "eval_loss": 2.1338422298431396, "eval_model_preparation_time": 0.0224, "eval_runtime": 2.7565, "eval_samples_per_second": 35.552, "eval_steps_per_second": 17.776, "step": 112 }, { "epoch": 2.018140589569161, "grad_norm": 0.6875, "learning_rate": 6.87116564417178e-06, "loss": 2.0871, "step": 113 }, { "epoch": 2.036281179138322, "grad_norm": 0.78125, "learning_rate": 6.748466257668712e-06, "loss": 2.0733, "step": 114 }, { "epoch": 2.054421768707483, "grad_norm": 0.7421875, "learning_rate": 6.625766871165644e-06, "loss": 2.0882, "step": 115 }, { "epoch": 2.072562358276644, "grad_norm": 0.70703125, "learning_rate": 6.503067484662578e-06, "loss": 2.028, "step": 116 }, { "epoch": 2.090702947845805, "grad_norm": 0.8125, "learning_rate": 6.38036809815951e-06, "loss": 2.0844, "step": 117 }, { "epoch": 2.108843537414966, "grad_norm": 0.7421875, "learning_rate": 6.257668711656443e-06, "loss": 2.1208, "step": 118 }, { "epoch": 2.126984126984127, "grad_norm": 0.70703125, "learning_rate": 6.134969325153375e-06, "loss": 2.1136, "step": 119 }, { "epoch": 2.145124716553288, "grad_norm": 0.734375, "learning_rate": 6.012269938650307e-06, "loss": 2.091, "step": 120 }, { "epoch": 2.163265306122449, "grad_norm": 0.68359375, "learning_rate": 5.88957055214724e-06, "loss": 2.0499, "step": 121 }, { "epoch": 2.18140589569161, "grad_norm": 0.734375, "learning_rate": 5.766871165644172e-06, "loss": 2.0557, "step": 122 }, { "epoch": 2.199546485260771, "grad_norm": 0.640625, "learning_rate": 5.644171779141104e-06, "loss": 2.0273, "step": 123 }, { "epoch": 2.2176870748299318, "grad_norm": 0.76953125, "learning_rate": 5.521472392638038e-06, "loss": 2.0663, "step": 124 }, { "epoch": 2.235827664399093, "grad_norm": 0.67578125, "learning_rate": 5.39877300613497e-06, "loss": 2.0865, "step": 125 }, { "epoch": 2.253968253968254, "grad_norm": 0.70703125, "learning_rate": 5.276073619631902e-06, "loss": 2.0568, "step": 126 }, { "epoch": 2.272108843537415, "grad_norm": 0.70703125, "learning_rate": 5.153374233128835e-06, "loss": 2.0819, "step": 127 }, { "epoch": 2.290249433106576, "grad_norm": 0.7734375, "learning_rate": 5.030674846625767e-06, "loss": 2.0872, "step": 128 }, { "epoch": 2.308390022675737, "grad_norm": 0.7734375, "learning_rate": 4.9079754601227e-06, "loss": 2.1564, "step": 129 }, { "epoch": 2.326530612244898, "grad_norm": 0.73828125, "learning_rate": 4.785276073619632e-06, "loss": 2.0401, "step": 130 }, { "epoch": 2.3446712018140587, "grad_norm": 0.68359375, "learning_rate": 4.662576687116564e-06, "loss": 2.0781, "step": 131 }, { "epoch": 2.36281179138322, "grad_norm": 0.73828125, "learning_rate": 4.539877300613497e-06, "loss": 2.0845, "step": 132 }, { "epoch": 2.380952380952381, "grad_norm": 0.7578125, "learning_rate": 4.41717791411043e-06, "loss": 2.0638, "step": 133 }, { "epoch": 2.399092970521542, "grad_norm": 0.77734375, "learning_rate": 4.294478527607362e-06, "loss": 2.0817, "step": 134 }, { "epoch": 2.417233560090703, "grad_norm": 0.71875, "learning_rate": 4.171779141104294e-06, "loss": 2.056, "step": 135 }, { "epoch": 2.435374149659864, "grad_norm": 0.703125, "learning_rate": 4.049079754601227e-06, "loss": 2.109, "step": 136 }, { "epoch": 2.453514739229025, "grad_norm": 0.6484375, "learning_rate": 3.92638036809816e-06, "loss": 2.0948, "step": 137 }, { "epoch": 2.471655328798186, "grad_norm": 0.77734375, "learning_rate": 3.8036809815950928e-06, "loss": 2.1051, "step": 138 }, { "epoch": 2.489795918367347, "grad_norm": 0.7421875, "learning_rate": 3.680981595092025e-06, "loss": 2.0321, "step": 139 }, { "epoch": 2.507936507936508, "grad_norm": 0.81640625, "learning_rate": 3.5582822085889574e-06, "loss": 2.1067, "step": 140 }, { "epoch": 2.526077097505669, "grad_norm": 0.6796875, "learning_rate": 3.43558282208589e-06, "loss": 2.1105, "step": 141 }, { "epoch": 2.54421768707483, "grad_norm": 0.73046875, "learning_rate": 3.312883435582822e-06, "loss": 2.1413, "step": 142 }, { "epoch": 2.562358276643991, "grad_norm": 0.7265625, "learning_rate": 3.190184049079755e-06, "loss": 2.0716, "step": 143 }, { "epoch": 2.580498866213152, "grad_norm": 0.69140625, "learning_rate": 3.0674846625766875e-06, "loss": 2.0711, "step": 144 }, { "epoch": 2.5986394557823127, "grad_norm": 0.7578125, "learning_rate": 2.94478527607362e-06, "loss": 2.0836, "step": 145 }, { "epoch": 2.616780045351474, "grad_norm": 0.71484375, "learning_rate": 2.822085889570552e-06, "loss": 2.0586, "step": 146 }, { "epoch": 2.634920634920635, "grad_norm": 0.70703125, "learning_rate": 2.699386503067485e-06, "loss": 2.0593, "step": 147 }, { "epoch": 2.6530612244897958, "grad_norm": 0.6484375, "learning_rate": 2.5766871165644175e-06, "loss": 2.0547, "step": 148 }, { "epoch": 2.671201814058957, "grad_norm": 0.671875, "learning_rate": 2.45398773006135e-06, "loss": 2.0451, "step": 149 }, { "epoch": 2.689342403628118, "grad_norm": 0.6640625, "learning_rate": 2.331288343558282e-06, "loss": 2.0731, "step": 150 }, { "epoch": 2.707482993197279, "grad_norm": 0.6796875, "learning_rate": 2.208588957055215e-06, "loss": 2.0026, "step": 151 }, { "epoch": 2.72562358276644, "grad_norm": 0.765625, "learning_rate": 2.085889570552147e-06, "loss": 2.1035, "step": 152 }, { "epoch": 2.743764172335601, "grad_norm": 0.7109375, "learning_rate": 1.96319018404908e-06, "loss": 2.0727, "step": 153 }, { "epoch": 2.761904761904762, "grad_norm": 0.69921875, "learning_rate": 1.8404907975460124e-06, "loss": 2.0177, "step": 154 }, { "epoch": 2.780045351473923, "grad_norm": 0.71484375, "learning_rate": 1.717791411042945e-06, "loss": 2.0351, "step": 155 }, { "epoch": 2.798185941043084, "grad_norm": 0.69921875, "learning_rate": 1.5950920245398775e-06, "loss": 2.0597, "step": 156 }, { "epoch": 2.816326530612245, "grad_norm": 0.70703125, "learning_rate": 1.47239263803681e-06, "loss": 2.048, "step": 157 }, { "epoch": 2.834467120181406, "grad_norm": 0.6953125, "learning_rate": 1.3496932515337425e-06, "loss": 2.0717, "step": 158 }, { "epoch": 2.8526077097505667, "grad_norm": 0.69921875, "learning_rate": 1.226993865030675e-06, "loss": 2.1007, "step": 159 }, { "epoch": 2.870748299319728, "grad_norm": 0.71875, "learning_rate": 1.1042944785276075e-06, "loss": 2.0228, "step": 160 }, { "epoch": 2.888888888888889, "grad_norm": 0.69140625, "learning_rate": 9.8159509202454e-07, "loss": 2.0413, "step": 161 }, { "epoch": 2.9070294784580497, "grad_norm": 0.6875, "learning_rate": 8.588957055214725e-07, "loss": 2.0616, "step": 162 }, { "epoch": 2.925170068027211, "grad_norm": 0.71875, "learning_rate": 7.36196319018405e-07, "loss": 2.0833, "step": 163 }, { "epoch": 2.943310657596372, "grad_norm": 0.75, "learning_rate": 6.134969325153375e-07, "loss": 2.1265, "step": 164 }, { "epoch": 2.9614512471655328, "grad_norm": 0.71875, "learning_rate": 4.9079754601227e-07, "loss": 2.0442, "step": 165 }, { "epoch": 2.979591836734694, "grad_norm": 0.6875, "learning_rate": 3.680981595092025e-07, "loss": 2.0522, "step": 166 }, { "epoch": 2.997732426303855, "grad_norm": 0.70703125, "learning_rate": 2.45398773006135e-07, "loss": 2.0616, "step": 167 }, { "epoch": 3.0, "grad_norm": 2.078125, "learning_rate": 1.226993865030675e-07, "loss": 2.0104, "step": 168 } ], "logging_steps": 1, "max_steps": 168, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.058921741456589e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }