{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018140589569160998, "grad_norm": 1.1640625, "learning_rate": 0.0, "loss": 2.822, "step": 1 }, { "epoch": 0.036281179138321996, "grad_norm": 1.1953125, "learning_rate": 4.000000000000001e-06, "loss": 2.8203, "step": 2 }, { "epoch": 0.05442176870748299, "grad_norm": 1.1484375, "learning_rate": 8.000000000000001e-06, "loss": 2.8642, "step": 3 }, { "epoch": 0.07256235827664399, "grad_norm": 1.1953125, "learning_rate": 1.2e-05, "loss": 2.8628, "step": 4 }, { "epoch": 0.09070294784580499, "grad_norm": 1.2421875, "learning_rate": 1.6000000000000003e-05, "loss": 2.9157, "step": 5 }, { "epoch": 0.10884353741496598, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.8873, "step": 6 }, { "epoch": 0.12698412698412698, "grad_norm": 1.109375, "learning_rate": 1.9877300613496935e-05, "loss": 2.8306, "step": 7 }, { "epoch": 0.14512471655328799, "grad_norm": 1.0703125, "learning_rate": 1.9754601226993868e-05, "loss": 2.8072, "step": 8 }, { "epoch": 0.16326530612244897, "grad_norm": 1.03125, "learning_rate": 1.96319018404908e-05, "loss": 2.7269, "step": 9 }, { "epoch": 0.18140589569160998, "grad_norm": 0.953125, "learning_rate": 1.9509202453987733e-05, "loss": 2.7068, "step": 10 }, { "epoch": 0.19954648526077098, "grad_norm": 0.953125, "learning_rate": 1.9386503067484663e-05, "loss": 2.7302, "step": 11 }, { "epoch": 0.21768707482993196, "grad_norm": 0.95703125, "learning_rate": 1.9263803680981596e-05, "loss": 2.7229, "step": 12 }, { "epoch": 0.23582766439909297, "grad_norm": 0.85546875, "learning_rate": 1.914110429447853e-05, "loss": 2.6181, "step": 13 }, { "epoch": 0.25396825396825395, "grad_norm": 0.8984375, "learning_rate": 1.9018404907975462e-05, "loss": 2.6338, "step": 14 }, { "epoch": 0.272108843537415, "grad_norm": 0.8046875, "learning_rate": 1.8895705521472395e-05, "loss": 2.6268, "step": 15 }, { "epoch": 0.29024943310657597, "grad_norm": 0.87109375, "learning_rate": 1.8773006134969328e-05, "loss": 2.5858, "step": 16 }, { "epoch": 0.30839002267573695, "grad_norm": 0.875, "learning_rate": 1.8650306748466257e-05, "loss": 2.5589, "step": 17 }, { "epoch": 0.32653061224489793, "grad_norm": 0.79296875, "learning_rate": 1.852760736196319e-05, "loss": 2.5118, "step": 18 }, { "epoch": 0.34467120181405897, "grad_norm": 0.84375, "learning_rate": 1.8404907975460123e-05, "loss": 2.5572, "step": 19 }, { "epoch": 0.36281179138321995, "grad_norm": 0.8046875, "learning_rate": 1.828220858895706e-05, "loss": 2.4944, "step": 20 }, { "epoch": 0.38095238095238093, "grad_norm": 0.83203125, "learning_rate": 1.815950920245399e-05, "loss": 2.4983, "step": 21 }, { "epoch": 0.39909297052154197, "grad_norm": 0.76953125, "learning_rate": 1.8036809815950922e-05, "loss": 2.4173, "step": 22 }, { "epoch": 0.41723356009070295, "grad_norm": 0.77734375, "learning_rate": 1.7914110429447855e-05, "loss": 2.4191, "step": 23 }, { "epoch": 0.43537414965986393, "grad_norm": 0.78125, "learning_rate": 1.7791411042944788e-05, "loss": 2.4321, "step": 24 }, { "epoch": 0.45351473922902497, "grad_norm": 0.7734375, "learning_rate": 1.766871165644172e-05, "loss": 2.4134, "step": 25 }, { "epoch": 0.47165532879818595, "grad_norm": 0.76171875, "learning_rate": 1.7546012269938654e-05, "loss": 2.3852, "step": 26 }, { "epoch": 0.4897959183673469, "grad_norm": 0.79296875, "learning_rate": 1.7423312883435583e-05, "loss": 2.4246, "step": 27 }, { "epoch": 0.5079365079365079, "grad_norm": 0.7734375, "learning_rate": 1.7300613496932516e-05, "loss": 2.3398, "step": 28 }, { "epoch": 0.5260770975056689, "grad_norm": 0.78515625, "learning_rate": 1.717791411042945e-05, "loss": 2.421, "step": 29 }, { "epoch": 0.54421768707483, "grad_norm": 0.703125, "learning_rate": 1.7055214723926382e-05, "loss": 2.3035, "step": 30 }, { "epoch": 0.562358276643991, "grad_norm": 0.74609375, "learning_rate": 1.6932515337423315e-05, "loss": 2.3796, "step": 31 }, { "epoch": 0.5804988662131519, "grad_norm": 0.76171875, "learning_rate": 1.6809815950920248e-05, "loss": 2.3895, "step": 32 }, { "epoch": 0.5986394557823129, "grad_norm": 0.71875, "learning_rate": 1.6687116564417178e-05, "loss": 2.3415, "step": 33 }, { "epoch": 0.6167800453514739, "grad_norm": 0.734375, "learning_rate": 1.656441717791411e-05, "loss": 2.3352, "step": 34 }, { "epoch": 0.6349206349206349, "grad_norm": 0.7734375, "learning_rate": 1.6441717791411043e-05, "loss": 2.3103, "step": 35 }, { "epoch": 0.6530612244897959, "grad_norm": 0.70703125, "learning_rate": 1.6319018404907976e-05, "loss": 2.2984, "step": 36 }, { "epoch": 0.671201814058957, "grad_norm": 0.71875, "learning_rate": 1.619631901840491e-05, "loss": 2.2928, "step": 37 }, { "epoch": 0.6893424036281179, "grad_norm": 0.71484375, "learning_rate": 1.6073619631901842e-05, "loss": 2.3743, "step": 38 }, { "epoch": 0.7074829931972789, "grad_norm": 0.8984375, "learning_rate": 1.5950920245398772e-05, "loss": 2.353, "step": 39 }, { "epoch": 0.7256235827664399, "grad_norm": 0.69921875, "learning_rate": 1.5828220858895708e-05, "loss": 2.3164, "step": 40 }, { "epoch": 0.7437641723356009, "grad_norm": 0.79296875, "learning_rate": 1.570552147239264e-05, "loss": 2.3112, "step": 41 }, { "epoch": 0.7619047619047619, "grad_norm": 0.7578125, "learning_rate": 1.5582822085889574e-05, "loss": 2.344, "step": 42 }, { "epoch": 0.780045351473923, "grad_norm": 0.69140625, "learning_rate": 1.5460122699386504e-05, "loss": 2.2518, "step": 43 }, { "epoch": 0.7981859410430839, "grad_norm": 0.7265625, "learning_rate": 1.5337423312883436e-05, "loss": 2.3238, "step": 44 }, { "epoch": 0.8163265306122449, "grad_norm": 0.65234375, "learning_rate": 1.5214723926380371e-05, "loss": 2.2546, "step": 45 }, { "epoch": 0.8344671201814059, "grad_norm": 0.734375, "learning_rate": 1.50920245398773e-05, "loss": 2.2407, "step": 46 }, { "epoch": 0.8526077097505669, "grad_norm": 0.74609375, "learning_rate": 1.4969325153374235e-05, "loss": 2.3146, "step": 47 }, { "epoch": 0.8707482993197279, "grad_norm": 0.796875, "learning_rate": 1.4846625766871168e-05, "loss": 2.3157, "step": 48 }, { "epoch": 0.8888888888888888, "grad_norm": 0.76171875, "learning_rate": 1.47239263803681e-05, "loss": 2.2666, "step": 49 }, { "epoch": 0.9070294784580499, "grad_norm": 0.7890625, "learning_rate": 1.4601226993865032e-05, "loss": 2.2766, "step": 50 }, { "epoch": 0.9251700680272109, "grad_norm": 0.79296875, "learning_rate": 1.4478527607361965e-05, "loss": 2.2274, "step": 51 }, { "epoch": 0.9433106575963719, "grad_norm": 0.74609375, "learning_rate": 1.4355828220858897e-05, "loss": 2.2769, "step": 52 }, { "epoch": 0.9614512471655329, "grad_norm": 0.72265625, "learning_rate": 1.423312883435583e-05, "loss": 2.2824, "step": 53 }, { "epoch": 0.9795918367346939, "grad_norm": 0.734375, "learning_rate": 1.4110429447852763e-05, "loss": 2.2354, "step": 54 }, { "epoch": 0.9977324263038548, "grad_norm": 0.75390625, "learning_rate": 1.3987730061349694e-05, "loss": 2.1867, "step": 55 }, { "epoch": 1.0, "grad_norm": 2.109375, "learning_rate": 1.3865030674846627e-05, "loss": 2.1015, "step": 56 }, { "epoch": 1.0, "eval_loss": 2.264234781265259, "eval_model_preparation_time": 0.0231, "eval_runtime": 4.5732, "eval_samples_per_second": 21.429, "eval_steps_per_second": 10.715, "step": 56 }, { "epoch": 1.018140589569161, "grad_norm": 0.71875, "learning_rate": 1.374233128834356e-05, "loss": 2.2474, "step": 57 }, { "epoch": 1.036281179138322, "grad_norm": 0.75390625, "learning_rate": 1.3619631901840491e-05, "loss": 2.1569, "step": 58 }, { "epoch": 1.054421768707483, "grad_norm": 0.7265625, "learning_rate": 1.3496932515337424e-05, "loss": 2.214, "step": 59 }, { "epoch": 1.072562358276644, "grad_norm": 0.7734375, "learning_rate": 1.3374233128834357e-05, "loss": 2.1724, "step": 60 }, { "epoch": 1.090702947845805, "grad_norm": 0.78125, "learning_rate": 1.3251533742331288e-05, "loss": 2.2226, "step": 61 }, { "epoch": 1.1088435374149659, "grad_norm": 0.75, "learning_rate": 1.3128834355828221e-05, "loss": 2.1916, "step": 62 }, { "epoch": 1.126984126984127, "grad_norm": 0.77734375, "learning_rate": 1.3006134969325156e-05, "loss": 2.1849, "step": 63 }, { "epoch": 1.145124716553288, "grad_norm": 0.73828125, "learning_rate": 1.2883435582822085e-05, "loss": 2.235, "step": 64 }, { "epoch": 1.163265306122449, "grad_norm": 0.7265625, "learning_rate": 1.276073619631902e-05, "loss": 2.1792, "step": 65 }, { "epoch": 1.18140589569161, "grad_norm": 0.7890625, "learning_rate": 1.2638036809815953e-05, "loss": 2.2478, "step": 66 }, { "epoch": 1.199546485260771, "grad_norm": 0.71875, "learning_rate": 1.2515337423312886e-05, "loss": 2.2172, "step": 67 }, { "epoch": 1.217687074829932, "grad_norm": 0.75390625, "learning_rate": 1.2392638036809817e-05, "loss": 2.1998, "step": 68 }, { "epoch": 1.235827664399093, "grad_norm": 0.74609375, "learning_rate": 1.226993865030675e-05, "loss": 2.2202, "step": 69 }, { "epoch": 1.253968253968254, "grad_norm": 0.82421875, "learning_rate": 1.2147239263803683e-05, "loss": 2.1275, "step": 70 }, { "epoch": 1.272108843537415, "grad_norm": 0.73828125, "learning_rate": 1.2024539877300614e-05, "loss": 2.1966, "step": 71 }, { "epoch": 1.290249433106576, "grad_norm": 0.91796875, "learning_rate": 1.1901840490797547e-05, "loss": 2.1833, "step": 72 }, { "epoch": 1.308390022675737, "grad_norm": 0.7265625, "learning_rate": 1.177914110429448e-05, "loss": 2.2302, "step": 73 }, { "epoch": 1.3265306122448979, "grad_norm": 0.73828125, "learning_rate": 1.1656441717791411e-05, "loss": 2.1988, "step": 74 }, { "epoch": 1.344671201814059, "grad_norm": 0.80078125, "learning_rate": 1.1533742331288344e-05, "loss": 2.1477, "step": 75 }, { "epoch": 1.36281179138322, "grad_norm": 0.78515625, "learning_rate": 1.1411042944785277e-05, "loss": 2.1234, "step": 76 }, { "epoch": 1.380952380952381, "grad_norm": 0.77734375, "learning_rate": 1.1288343558282208e-05, "loss": 2.2154, "step": 77 }, { "epoch": 1.399092970521542, "grad_norm": 0.87109375, "learning_rate": 1.1165644171779141e-05, "loss": 2.1708, "step": 78 }, { "epoch": 1.417233560090703, "grad_norm": 0.8828125, "learning_rate": 1.1042944785276076e-05, "loss": 2.2043, "step": 79 }, { "epoch": 1.435374149659864, "grad_norm": 0.81640625, "learning_rate": 1.0920245398773005e-05, "loss": 2.1758, "step": 80 }, { "epoch": 1.4535147392290249, "grad_norm": 0.76953125, "learning_rate": 1.079754601226994e-05, "loss": 2.1694, "step": 81 }, { "epoch": 1.471655328798186, "grad_norm": 0.88671875, "learning_rate": 1.0674846625766873e-05, "loss": 2.2036, "step": 82 }, { "epoch": 1.489795918367347, "grad_norm": 0.78125, "learning_rate": 1.0552147239263804e-05, "loss": 2.1908, "step": 83 }, { "epoch": 1.507936507936508, "grad_norm": 0.765625, "learning_rate": 1.0429447852760737e-05, "loss": 2.1322, "step": 84 }, { "epoch": 1.5260770975056688, "grad_norm": 0.81640625, "learning_rate": 1.030674846625767e-05, "loss": 2.1388, "step": 85 }, { "epoch": 1.54421768707483, "grad_norm": 0.80859375, "learning_rate": 1.0184049079754601e-05, "loss": 2.1776, "step": 86 }, { "epoch": 1.562358276643991, "grad_norm": 0.78125, "learning_rate": 1.0061349693251534e-05, "loss": 2.1833, "step": 87 }, { "epoch": 1.5804988662131518, "grad_norm": 0.734375, "learning_rate": 9.938650306748467e-06, "loss": 2.1012, "step": 88 }, { "epoch": 1.598639455782313, "grad_norm": 0.7734375, "learning_rate": 9.8159509202454e-06, "loss": 2.1698, "step": 89 }, { "epoch": 1.616780045351474, "grad_norm": 0.76953125, "learning_rate": 9.693251533742331e-06, "loss": 2.1517, "step": 90 }, { "epoch": 1.6349206349206349, "grad_norm": 0.7734375, "learning_rate": 9.570552147239264e-06, "loss": 2.1725, "step": 91 }, { "epoch": 1.6530612244897958, "grad_norm": 0.78125, "learning_rate": 9.447852760736197e-06, "loss": 2.1007, "step": 92 }, { "epoch": 1.671201814058957, "grad_norm": 0.83984375, "learning_rate": 9.325153374233129e-06, "loss": 2.2395, "step": 93 }, { "epoch": 1.689342403628118, "grad_norm": 0.76953125, "learning_rate": 9.202453987730062e-06, "loss": 2.1661, "step": 94 }, { "epoch": 1.7074829931972788, "grad_norm": 0.83984375, "learning_rate": 9.079754601226994e-06, "loss": 2.2016, "step": 95 }, { "epoch": 1.72562358276644, "grad_norm": 0.78125, "learning_rate": 8.957055214723927e-06, "loss": 2.1966, "step": 96 }, { "epoch": 1.743764172335601, "grad_norm": 0.83203125, "learning_rate": 8.83435582822086e-06, "loss": 2.1595, "step": 97 }, { "epoch": 1.7619047619047619, "grad_norm": 0.7890625, "learning_rate": 8.711656441717792e-06, "loss": 2.1531, "step": 98 }, { "epoch": 1.780045351473923, "grad_norm": 0.76953125, "learning_rate": 8.588957055214725e-06, "loss": 2.1517, "step": 99 }, { "epoch": 1.798185941043084, "grad_norm": 0.82421875, "learning_rate": 8.466257668711658e-06, "loss": 2.1511, "step": 100 }, { "epoch": 1.816326530612245, "grad_norm": 0.81640625, "learning_rate": 8.343558282208589e-06, "loss": 2.2195, "step": 101 }, { "epoch": 1.8344671201814058, "grad_norm": 0.796875, "learning_rate": 8.220858895705522e-06, "loss": 2.1237, "step": 102 }, { "epoch": 1.8526077097505669, "grad_norm": 0.76953125, "learning_rate": 8.098159509202455e-06, "loss": 2.111, "step": 103 }, { "epoch": 1.870748299319728, "grad_norm": 0.83984375, "learning_rate": 7.975460122699386e-06, "loss": 2.1262, "step": 104 }, { "epoch": 1.8888888888888888, "grad_norm": 0.96875, "learning_rate": 7.85276073619632e-06, "loss": 2.1277, "step": 105 }, { "epoch": 1.90702947845805, "grad_norm": 0.75, "learning_rate": 7.730061349693252e-06, "loss": 2.0992, "step": 106 }, { "epoch": 1.925170068027211, "grad_norm": 0.765625, "learning_rate": 7.6073619631901856e-06, "loss": 2.1456, "step": 107 }, { "epoch": 1.943310657596372, "grad_norm": 0.87890625, "learning_rate": 7.484662576687118e-06, "loss": 2.1794, "step": 108 }, { "epoch": 1.9614512471655328, "grad_norm": 0.82421875, "learning_rate": 7.36196319018405e-06, "loss": 2.1473, "step": 109 }, { "epoch": 1.9795918367346939, "grad_norm": 0.78125, "learning_rate": 7.239263803680983e-06, "loss": 2.0759, "step": 110 }, { "epoch": 1.997732426303855, "grad_norm": 0.8359375, "learning_rate": 7.116564417177915e-06, "loss": 2.1376, "step": 111 }, { "epoch": 2.0, "grad_norm": 4.375, "learning_rate": 6.993865030674847e-06, "loss": 2.3915, "step": 112 }, { "epoch": 2.0, "eval_loss": 2.181279420852661, "eval_model_preparation_time": 0.0231, "eval_runtime": 3.9648, "eval_samples_per_second": 24.718, "eval_steps_per_second": 12.359, "step": 112 }, { "epoch": 2.018140589569161, "grad_norm": 0.83203125, "learning_rate": 6.87116564417178e-06, "loss": 2.1222, "step": 113 }, { "epoch": 2.036281179138322, "grad_norm": 0.8125, "learning_rate": 6.748466257668712e-06, "loss": 2.0568, "step": 114 }, { "epoch": 2.054421768707483, "grad_norm": 0.7890625, "learning_rate": 6.625766871165644e-06, "loss": 2.0761, "step": 115 }, { "epoch": 2.072562358276644, "grad_norm": 0.859375, "learning_rate": 6.503067484662578e-06, "loss": 2.1027, "step": 116 }, { "epoch": 2.090702947845805, "grad_norm": 0.79296875, "learning_rate": 6.38036809815951e-06, "loss": 2.0915, "step": 117 }, { "epoch": 2.108843537414966, "grad_norm": 0.80078125, "learning_rate": 6.257668711656443e-06, "loss": 2.0946, "step": 118 }, { "epoch": 2.126984126984127, "grad_norm": 0.78515625, "learning_rate": 6.134969325153375e-06, "loss": 2.1037, "step": 119 }, { "epoch": 2.145124716553288, "grad_norm": 0.81640625, "learning_rate": 6.012269938650307e-06, "loss": 2.1129, "step": 120 }, { "epoch": 2.163265306122449, "grad_norm": 0.8515625, "learning_rate": 5.88957055214724e-06, "loss": 2.1101, "step": 121 }, { "epoch": 2.18140589569161, "grad_norm": 0.76953125, "learning_rate": 5.766871165644172e-06, "loss": 2.0923, "step": 122 }, { "epoch": 2.199546485260771, "grad_norm": 0.81640625, "learning_rate": 5.644171779141104e-06, "loss": 2.1136, "step": 123 }, { "epoch": 2.2176870748299318, "grad_norm": 0.90234375, "learning_rate": 5.521472392638038e-06, "loss": 2.1365, "step": 124 }, { "epoch": 2.235827664399093, "grad_norm": 0.8515625, "learning_rate": 5.39877300613497e-06, "loss": 2.1602, "step": 125 }, { "epoch": 2.253968253968254, "grad_norm": 1.1015625, "learning_rate": 5.276073619631902e-06, "loss": 2.1267, "step": 126 }, { "epoch": 2.272108843537415, "grad_norm": 0.7578125, "learning_rate": 5.153374233128835e-06, "loss": 2.1402, "step": 127 }, { "epoch": 2.290249433106576, "grad_norm": 0.78125, "learning_rate": 5.030674846625767e-06, "loss": 2.143, "step": 128 }, { "epoch": 2.308390022675737, "grad_norm": 0.81640625, "learning_rate": 4.9079754601227e-06, "loss": 2.1097, "step": 129 }, { "epoch": 2.326530612244898, "grad_norm": 0.80078125, "learning_rate": 4.785276073619632e-06, "loss": 2.1121, "step": 130 }, { "epoch": 2.3446712018140587, "grad_norm": 0.76171875, "learning_rate": 4.662576687116564e-06, "loss": 2.061, "step": 131 }, { "epoch": 2.36281179138322, "grad_norm": 0.82421875, "learning_rate": 4.539877300613497e-06, "loss": 2.108, "step": 132 }, { "epoch": 2.380952380952381, "grad_norm": 0.8671875, "learning_rate": 4.41717791411043e-06, "loss": 2.1038, "step": 133 }, { "epoch": 2.399092970521542, "grad_norm": 0.82421875, "learning_rate": 4.294478527607362e-06, "loss": 2.1283, "step": 134 }, { "epoch": 2.417233560090703, "grad_norm": 0.80078125, "learning_rate": 4.171779141104294e-06, "loss": 2.089, "step": 135 }, { "epoch": 2.435374149659864, "grad_norm": 0.78515625, "learning_rate": 4.049079754601227e-06, "loss": 2.0953, "step": 136 }, { "epoch": 2.453514739229025, "grad_norm": 0.8671875, "learning_rate": 3.92638036809816e-06, "loss": 2.0882, "step": 137 }, { "epoch": 2.471655328798186, "grad_norm": 0.85546875, "learning_rate": 3.8036809815950928e-06, "loss": 2.1179, "step": 138 }, { "epoch": 2.489795918367347, "grad_norm": 0.8046875, "learning_rate": 3.680981595092025e-06, "loss": 2.0365, "step": 139 }, { "epoch": 2.507936507936508, "grad_norm": 0.875, "learning_rate": 3.5582822085889574e-06, "loss": 2.08, "step": 140 }, { "epoch": 2.526077097505669, "grad_norm": 0.77734375, "learning_rate": 3.43558282208589e-06, "loss": 2.1329, "step": 141 }, { "epoch": 2.54421768707483, "grad_norm": 0.765625, "learning_rate": 3.312883435582822e-06, "loss": 2.1195, "step": 142 }, { "epoch": 2.562358276643991, "grad_norm": 0.765625, "learning_rate": 3.190184049079755e-06, "loss": 2.0559, "step": 143 }, { "epoch": 2.580498866213152, "grad_norm": 0.80859375, "learning_rate": 3.0674846625766875e-06, "loss": 2.0892, "step": 144 }, { "epoch": 2.5986394557823127, "grad_norm": 0.80078125, "learning_rate": 2.94478527607362e-06, "loss": 2.1426, "step": 145 }, { "epoch": 2.616780045351474, "grad_norm": 0.9609375, "learning_rate": 2.822085889570552e-06, "loss": 2.0978, "step": 146 }, { "epoch": 2.634920634920635, "grad_norm": 0.83984375, "learning_rate": 2.699386503067485e-06, "loss": 2.1416, "step": 147 }, { "epoch": 2.6530612244897958, "grad_norm": 0.87890625, "learning_rate": 2.5766871165644175e-06, "loss": 2.0845, "step": 148 }, { "epoch": 2.671201814058957, "grad_norm": 0.828125, "learning_rate": 2.45398773006135e-06, "loss": 2.0741, "step": 149 }, { "epoch": 2.689342403628118, "grad_norm": 0.8984375, "learning_rate": 2.331288343558282e-06, "loss": 2.1163, "step": 150 }, { "epoch": 2.707482993197279, "grad_norm": 0.78515625, "learning_rate": 2.208588957055215e-06, "loss": 2.075, "step": 151 }, { "epoch": 2.72562358276644, "grad_norm": 0.80859375, "learning_rate": 2.085889570552147e-06, "loss": 2.1189, "step": 152 }, { "epoch": 2.743764172335601, "grad_norm": 0.8828125, "learning_rate": 1.96319018404908e-06, "loss": 2.1323, "step": 153 }, { "epoch": 2.761904761904762, "grad_norm": 0.8671875, "learning_rate": 1.8404907975460124e-06, "loss": 2.1536, "step": 154 }, { "epoch": 2.780045351473923, "grad_norm": 0.875, "learning_rate": 1.717791411042945e-06, "loss": 2.114, "step": 155 }, { "epoch": 2.798185941043084, "grad_norm": 0.80078125, "learning_rate": 1.5950920245398775e-06, "loss": 2.0959, "step": 156 }, { "epoch": 2.816326530612245, "grad_norm": 0.85546875, "learning_rate": 1.47239263803681e-06, "loss": 2.1214, "step": 157 }, { "epoch": 2.834467120181406, "grad_norm": 0.8125, "learning_rate": 1.3496932515337425e-06, "loss": 2.1182, "step": 158 }, { "epoch": 2.8526077097505667, "grad_norm": 0.84375, "learning_rate": 1.226993865030675e-06, "loss": 2.1335, "step": 159 }, { "epoch": 2.870748299319728, "grad_norm": 0.79296875, "learning_rate": 1.1042944785276075e-06, "loss": 2.133, "step": 160 }, { "epoch": 2.888888888888889, "grad_norm": 0.84375, "learning_rate": 9.8159509202454e-07, "loss": 2.0833, "step": 161 }, { "epoch": 2.9070294784580497, "grad_norm": 0.796875, "learning_rate": 8.588957055214725e-07, "loss": 2.0795, "step": 162 }, { "epoch": 2.925170068027211, "grad_norm": 0.88671875, "learning_rate": 7.36196319018405e-07, "loss": 2.1272, "step": 163 }, { "epoch": 2.943310657596372, "grad_norm": 0.8515625, "learning_rate": 6.134969325153375e-07, "loss": 2.1501, "step": 164 }, { "epoch": 2.9614512471655328, "grad_norm": 0.95703125, "learning_rate": 4.9079754601227e-07, "loss": 2.049, "step": 165 }, { "epoch": 2.979591836734694, "grad_norm": 0.890625, "learning_rate": 3.680981595092025e-07, "loss": 2.1184, "step": 166 }, { "epoch": 2.997732426303855, "grad_norm": 0.84375, "learning_rate": 2.45398773006135e-07, "loss": 2.1397, "step": 167 }, { "epoch": 3.0, "grad_norm": 2.5625, "learning_rate": 1.226993865030675e-07, "loss": 1.9771, "step": 168 } ], "logging_steps": 1, "max_steps": 168, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7557758337642496e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }