{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 333, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009070294784580499, "grad_norm": 0.9609375, "learning_rate": 0.0, "loss": 2.7821, "step": 1 }, { "epoch": 0.018140589569160998, "grad_norm": 0.99609375, "learning_rate": 4.000000000000001e-06, "loss": 2.7555, "step": 2 }, { "epoch": 0.027210884353741496, "grad_norm": 1.03125, "learning_rate": 8.000000000000001e-06, "loss": 2.7964, "step": 3 }, { "epoch": 0.036281179138321996, "grad_norm": 1.0, "learning_rate": 1.2e-05, "loss": 2.8019, "step": 4 }, { "epoch": 0.045351473922902494, "grad_norm": 0.94140625, "learning_rate": 1.6000000000000003e-05, "loss": 2.6892, "step": 5 }, { "epoch": 0.05442176870748299, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.7791, "step": 6 }, { "epoch": 0.06349206349206349, "grad_norm": 0.9375, "learning_rate": 1.9939024390243904e-05, "loss": 2.7525, "step": 7 }, { "epoch": 0.07256235827664399, "grad_norm": 0.93359375, "learning_rate": 1.9878048780487806e-05, "loss": 2.6766, "step": 8 }, { "epoch": 0.08163265306122448, "grad_norm": 0.8828125, "learning_rate": 1.9817073170731708e-05, "loss": 2.74, "step": 9 }, { "epoch": 0.09070294784580499, "grad_norm": 0.90234375, "learning_rate": 1.975609756097561e-05, "loss": 2.677, "step": 10 }, { "epoch": 0.09977324263038549, "grad_norm": 0.83203125, "learning_rate": 1.9695121951219512e-05, "loss": 2.6407, "step": 11 }, { "epoch": 0.10884353741496598, "grad_norm": 0.8671875, "learning_rate": 1.9634146341463414e-05, "loss": 2.6783, "step": 12 }, { "epoch": 0.11791383219954649, "grad_norm": 0.85546875, "learning_rate": 1.957317073170732e-05, "loss": 2.6221, "step": 13 }, { "epoch": 0.12698412698412698, "grad_norm": 0.80078125, "learning_rate": 1.9512195121951222e-05, "loss": 2.6387, "step": 14 }, { "epoch": 0.1360544217687075, "grad_norm": 0.7734375, "learning_rate": 1.9451219512195124e-05, "loss": 2.6071, "step": 15 }, { "epoch": 0.14512471655328799, "grad_norm": 0.7109375, "learning_rate": 1.9390243902439026e-05, "loss": 2.5977, "step": 16 }, { "epoch": 0.15419501133786848, "grad_norm": 0.71484375, "learning_rate": 1.9329268292682928e-05, "loss": 2.5523, "step": 17 }, { "epoch": 0.16326530612244897, "grad_norm": 0.73828125, "learning_rate": 1.926829268292683e-05, "loss": 2.5804, "step": 18 }, { "epoch": 0.17233560090702948, "grad_norm": 0.703125, "learning_rate": 1.9207317073170733e-05, "loss": 2.4848, "step": 19 }, { "epoch": 0.18140589569160998, "grad_norm": 0.7421875, "learning_rate": 1.9146341463414635e-05, "loss": 2.5069, "step": 20 }, { "epoch": 0.19047619047619047, "grad_norm": 0.71484375, "learning_rate": 1.9085365853658537e-05, "loss": 2.4904, "step": 21 }, { "epoch": 0.19954648526077098, "grad_norm": 0.734375, "learning_rate": 1.902439024390244e-05, "loss": 2.4733, "step": 22 }, { "epoch": 0.20861678004535147, "grad_norm": 0.6875, "learning_rate": 1.896341463414634e-05, "loss": 2.4548, "step": 23 }, { "epoch": 0.21768707482993196, "grad_norm": 0.71484375, "learning_rate": 1.8902439024390243e-05, "loss": 2.414, "step": 24 }, { "epoch": 0.22675736961451248, "grad_norm": 0.734375, "learning_rate": 1.8841463414634145e-05, "loss": 2.4784, "step": 25 }, { "epoch": 0.23582766439909297, "grad_norm": 0.6875, "learning_rate": 1.878048780487805e-05, "loss": 2.4775, "step": 26 }, { "epoch": 0.24489795918367346, "grad_norm": 0.6875, "learning_rate": 1.8719512195121953e-05, "loss": 2.4024, "step": 27 }, { "epoch": 0.25396825396825395, "grad_norm": 0.75, "learning_rate": 1.8658536585365855e-05, "loss": 2.4668, "step": 28 }, { "epoch": 0.26303854875283444, "grad_norm": 0.7265625, "learning_rate": 1.8597560975609757e-05, "loss": 2.4542, "step": 29 }, { "epoch": 0.272108843537415, "grad_norm": 0.7890625, "learning_rate": 1.8536585365853663e-05, "loss": 2.4045, "step": 30 }, { "epoch": 0.2811791383219955, "grad_norm": 0.6796875, "learning_rate": 1.8475609756097565e-05, "loss": 2.413, "step": 31 }, { "epoch": 0.29024943310657597, "grad_norm": 0.6953125, "learning_rate": 1.8414634146341467e-05, "loss": 2.4288, "step": 32 }, { "epoch": 0.29931972789115646, "grad_norm": 0.73046875, "learning_rate": 1.835365853658537e-05, "loss": 2.4127, "step": 33 }, { "epoch": 0.30839002267573695, "grad_norm": 0.73046875, "learning_rate": 1.829268292682927e-05, "loss": 2.4347, "step": 34 }, { "epoch": 0.31746031746031744, "grad_norm": 0.73828125, "learning_rate": 1.8231707317073173e-05, "loss": 2.3892, "step": 35 }, { "epoch": 0.32653061224489793, "grad_norm": 0.671875, "learning_rate": 1.8170731707317075e-05, "loss": 2.3851, "step": 36 }, { "epoch": 0.3356009070294785, "grad_norm": 0.70703125, "learning_rate": 1.8109756097560977e-05, "loss": 2.3523, "step": 37 }, { "epoch": 0.34467120181405897, "grad_norm": 0.65234375, "learning_rate": 1.804878048780488e-05, "loss": 2.3436, "step": 38 }, { "epoch": 0.35374149659863946, "grad_norm": 0.671875, "learning_rate": 1.798780487804878e-05, "loss": 2.3701, "step": 39 }, { "epoch": 0.36281179138321995, "grad_norm": 0.75, "learning_rate": 1.7926829268292684e-05, "loss": 2.3614, "step": 40 }, { "epoch": 0.37188208616780044, "grad_norm": 0.71875, "learning_rate": 1.7865853658536586e-05, "loss": 2.3828, "step": 41 }, { "epoch": 0.38095238095238093, "grad_norm": 0.8359375, "learning_rate": 1.7804878048780488e-05, "loss": 2.3818, "step": 42 }, { "epoch": 0.3900226757369615, "grad_norm": 0.7109375, "learning_rate": 1.7743902439024393e-05, "loss": 2.2896, "step": 43 }, { "epoch": 0.39909297052154197, "grad_norm": 0.66015625, "learning_rate": 1.7682926829268296e-05, "loss": 2.2879, "step": 44 }, { "epoch": 0.40816326530612246, "grad_norm": 0.6953125, "learning_rate": 1.7621951219512198e-05, "loss": 2.341, "step": 45 }, { "epoch": 0.41723356009070295, "grad_norm": 0.75, "learning_rate": 1.75609756097561e-05, "loss": 2.2897, "step": 46 }, { "epoch": 0.42630385487528344, "grad_norm": 0.71875, "learning_rate": 1.7500000000000002e-05, "loss": 2.3409, "step": 47 }, { "epoch": 0.43537414965986393, "grad_norm": 0.66015625, "learning_rate": 1.7439024390243904e-05, "loss": 2.2946, "step": 48 }, { "epoch": 0.4444444444444444, "grad_norm": 0.65625, "learning_rate": 1.7378048780487806e-05, "loss": 2.2962, "step": 49 }, { "epoch": 0.45351473922902497, "grad_norm": 0.73828125, "learning_rate": 1.7317073170731708e-05, "loss": 2.3276, "step": 50 }, { "epoch": 0.46258503401360546, "grad_norm": 0.7421875, "learning_rate": 1.725609756097561e-05, "loss": 2.331, "step": 51 }, { "epoch": 0.47165532879818595, "grad_norm": 0.7421875, "learning_rate": 1.7195121951219512e-05, "loss": 2.3291, "step": 52 }, { "epoch": 0.48072562358276644, "grad_norm": 0.70703125, "learning_rate": 1.7134146341463415e-05, "loss": 2.3285, "step": 53 }, { "epoch": 0.4897959183673469, "grad_norm": 0.6484375, "learning_rate": 1.7073170731707317e-05, "loss": 2.2596, "step": 54 }, { "epoch": 0.4988662131519274, "grad_norm": 0.703125, "learning_rate": 1.7012195121951222e-05, "loss": 2.2769, "step": 55 }, { "epoch": 0.5079365079365079, "grad_norm": 0.71875, "learning_rate": 1.6951219512195124e-05, "loss": 2.2414, "step": 56 }, { "epoch": 0.5170068027210885, "grad_norm": 0.6796875, "learning_rate": 1.6890243902439026e-05, "loss": 2.276, "step": 57 }, { "epoch": 0.5260770975056689, "grad_norm": 0.73828125, "learning_rate": 1.682926829268293e-05, "loss": 2.2745, "step": 58 }, { "epoch": 0.5351473922902494, "grad_norm": 0.7265625, "learning_rate": 1.676829268292683e-05, "loss": 2.1905, "step": 59 }, { "epoch": 0.54421768707483, "grad_norm": 1.6171875, "learning_rate": 1.6707317073170733e-05, "loss": 2.2355, "step": 60 }, { "epoch": 0.5532879818594104, "grad_norm": 0.87109375, "learning_rate": 1.6646341463414635e-05, "loss": 2.209, "step": 61 }, { "epoch": 0.562358276643991, "grad_norm": 0.75, "learning_rate": 1.6585365853658537e-05, "loss": 2.3244, "step": 62 }, { "epoch": 0.5714285714285714, "grad_norm": 0.6875, "learning_rate": 1.652439024390244e-05, "loss": 2.2522, "step": 63 }, { "epoch": 0.5804988662131519, "grad_norm": 0.71484375, "learning_rate": 1.646341463414634e-05, "loss": 2.2726, "step": 64 }, { "epoch": 0.5895691609977324, "grad_norm": 0.87109375, "learning_rate": 1.6402439024390243e-05, "loss": 2.1989, "step": 65 }, { "epoch": 0.5986394557823129, "grad_norm": 0.75390625, "learning_rate": 1.6341463414634145e-05, "loss": 2.3039, "step": 66 }, { "epoch": 0.6077097505668935, "grad_norm": 0.83203125, "learning_rate": 1.6280487804878048e-05, "loss": 2.2368, "step": 67 }, { "epoch": 0.6167800453514739, "grad_norm": 0.75, "learning_rate": 1.6219512195121953e-05, "loss": 2.3243, "step": 68 }, { "epoch": 0.6258503401360545, "grad_norm": 0.8203125, "learning_rate": 1.6158536585365855e-05, "loss": 2.2983, "step": 69 }, { "epoch": 0.6349206349206349, "grad_norm": 0.70703125, "learning_rate": 1.6097560975609757e-05, "loss": 2.2208, "step": 70 }, { "epoch": 0.6439909297052154, "grad_norm": 0.7734375, "learning_rate": 1.603658536585366e-05, "loss": 2.2676, "step": 71 }, { "epoch": 0.6530612244897959, "grad_norm": 0.796875, "learning_rate": 1.597560975609756e-05, "loss": 2.2764, "step": 72 }, { "epoch": 0.6621315192743764, "grad_norm": 0.78515625, "learning_rate": 1.5914634146341467e-05, "loss": 2.2431, "step": 73 }, { "epoch": 0.671201814058957, "grad_norm": 0.70703125, "learning_rate": 1.585365853658537e-05, "loss": 2.2251, "step": 74 }, { "epoch": 0.6802721088435374, "grad_norm": 0.76171875, "learning_rate": 1.579268292682927e-05, "loss": 2.265, "step": 75 }, { "epoch": 0.6893424036281179, "grad_norm": 0.80859375, "learning_rate": 1.5731707317073173e-05, "loss": 2.2207, "step": 76 }, { "epoch": 0.6984126984126984, "grad_norm": 0.78125, "learning_rate": 1.5670731707317075e-05, "loss": 2.2398, "step": 77 }, { "epoch": 0.7074829931972789, "grad_norm": 0.6875, "learning_rate": 1.5609756097560978e-05, "loss": 2.2134, "step": 78 }, { "epoch": 0.7165532879818595, "grad_norm": 0.796875, "learning_rate": 1.554878048780488e-05, "loss": 2.2297, "step": 79 }, { "epoch": 0.7256235827664399, "grad_norm": 0.83203125, "learning_rate": 1.5487804878048782e-05, "loss": 2.1822, "step": 80 }, { "epoch": 0.7346938775510204, "grad_norm": 0.76953125, "learning_rate": 1.5426829268292684e-05, "loss": 2.2522, "step": 81 }, { "epoch": 0.7437641723356009, "grad_norm": 0.77734375, "learning_rate": 1.5365853658536586e-05, "loss": 2.2363, "step": 82 }, { "epoch": 0.7528344671201814, "grad_norm": 0.6796875, "learning_rate": 1.5304878048780488e-05, "loss": 2.2289, "step": 83 }, { "epoch": 0.7619047619047619, "grad_norm": 0.7578125, "learning_rate": 1.5243902439024392e-05, "loss": 2.2229, "step": 84 }, { "epoch": 0.7709750566893424, "grad_norm": 1.0390625, "learning_rate": 1.5182926829268294e-05, "loss": 2.2818, "step": 85 }, { "epoch": 0.780045351473923, "grad_norm": 0.77734375, "learning_rate": 1.5121951219512196e-05, "loss": 2.2268, "step": 86 }, { "epoch": 0.7891156462585034, "grad_norm": 0.7734375, "learning_rate": 1.5060975609756098e-05, "loss": 2.2327, "step": 87 }, { "epoch": 0.7981859410430839, "grad_norm": 0.75390625, "learning_rate": 1.5000000000000002e-05, "loss": 2.131, "step": 88 }, { "epoch": 0.8072562358276644, "grad_norm": 0.71484375, "learning_rate": 1.4939024390243904e-05, "loss": 2.1873, "step": 89 }, { "epoch": 0.8163265306122449, "grad_norm": 0.69921875, "learning_rate": 1.4878048780487806e-05, "loss": 2.2199, "step": 90 }, { "epoch": 0.8253968253968254, "grad_norm": 0.70703125, "learning_rate": 1.4817073170731708e-05, "loss": 2.2014, "step": 91 }, { "epoch": 0.8344671201814059, "grad_norm": 0.6953125, "learning_rate": 1.475609756097561e-05, "loss": 2.1782, "step": 92 }, { "epoch": 0.8435374149659864, "grad_norm": 0.8125, "learning_rate": 1.4695121951219513e-05, "loss": 2.2409, "step": 93 }, { "epoch": 0.8526077097505669, "grad_norm": 0.8828125, "learning_rate": 1.4634146341463415e-05, "loss": 2.1933, "step": 94 }, { "epoch": 0.8616780045351474, "grad_norm": 0.7578125, "learning_rate": 1.4573170731707319e-05, "loss": 2.1623, "step": 95 }, { "epoch": 0.8707482993197279, "grad_norm": 0.7890625, "learning_rate": 1.451219512195122e-05, "loss": 2.239, "step": 96 }, { "epoch": 0.8798185941043084, "grad_norm": 0.8125, "learning_rate": 1.4451219512195123e-05, "loss": 2.2343, "step": 97 }, { "epoch": 0.8888888888888888, "grad_norm": 0.84765625, "learning_rate": 1.4390243902439025e-05, "loss": 2.2198, "step": 98 }, { "epoch": 0.8979591836734694, "grad_norm": 0.78515625, "learning_rate": 1.4329268292682927e-05, "loss": 2.2047, "step": 99 }, { "epoch": 0.9070294784580499, "grad_norm": 0.78125, "learning_rate": 1.4268292682926829e-05, "loss": 2.2073, "step": 100 }, { "epoch": 0.9160997732426304, "grad_norm": 0.83203125, "learning_rate": 1.4207317073170733e-05, "loss": 2.1697, "step": 101 }, { "epoch": 0.9251700680272109, "grad_norm": 0.72265625, "learning_rate": 1.4146341463414635e-05, "loss": 2.213, "step": 102 }, { "epoch": 0.9342403628117913, "grad_norm": 0.73828125, "learning_rate": 1.4085365853658537e-05, "loss": 2.2039, "step": 103 }, { "epoch": 0.9433106575963719, "grad_norm": 0.82421875, "learning_rate": 1.402439024390244e-05, "loss": 2.178, "step": 104 }, { "epoch": 0.9523809523809523, "grad_norm": 0.765625, "learning_rate": 1.3963414634146341e-05, "loss": 2.118, "step": 105 }, { "epoch": 0.9614512471655329, "grad_norm": 0.7265625, "learning_rate": 1.3902439024390244e-05, "loss": 2.2182, "step": 106 }, { "epoch": 0.9705215419501134, "grad_norm": 0.75, "learning_rate": 1.3841463414634146e-05, "loss": 2.1893, "step": 107 }, { "epoch": 0.9795918367346939, "grad_norm": 0.75390625, "learning_rate": 1.378048780487805e-05, "loss": 2.2261, "step": 108 }, { "epoch": 0.9886621315192744, "grad_norm": 0.8125, "learning_rate": 1.3719512195121953e-05, "loss": 2.1841, "step": 109 }, { "epoch": 0.9977324263038548, "grad_norm": 0.9140625, "learning_rate": 1.3658536585365855e-05, "loss": 2.2254, "step": 110 }, { "epoch": 1.0, "grad_norm": 1.5859375, "learning_rate": 1.3597560975609757e-05, "loss": 2.2042, "step": 111 }, { "epoch": 1.0, "eval_loss": 2.2069785594940186, "eval_model_preparation_time": 0.0172, "eval_runtime": 11.0732, "eval_samples_per_second": 17.7, "eval_steps_per_second": 8.85, "step": 111 }, { "epoch": 1.0090702947845804, "grad_norm": 0.79296875, "learning_rate": 1.3536585365853661e-05, "loss": 2.1678, "step": 112 }, { "epoch": 1.018140589569161, "grad_norm": 0.8125, "learning_rate": 1.3475609756097563e-05, "loss": 2.2223, "step": 113 }, { "epoch": 1.0272108843537415, "grad_norm": 0.71875, "learning_rate": 1.3414634146341466e-05, "loss": 2.1223, "step": 114 }, { "epoch": 1.036281179138322, "grad_norm": 0.921875, "learning_rate": 1.3353658536585368e-05, "loss": 2.1698, "step": 115 }, { "epoch": 1.0453514739229024, "grad_norm": 0.79296875, "learning_rate": 1.329268292682927e-05, "loss": 2.105, "step": 116 }, { "epoch": 1.054421768707483, "grad_norm": 0.75, "learning_rate": 1.3231707317073172e-05, "loss": 2.1412, "step": 117 }, { "epoch": 1.0634920634920635, "grad_norm": 0.78125, "learning_rate": 1.3170731707317076e-05, "loss": 2.1615, "step": 118 }, { "epoch": 1.072562358276644, "grad_norm": 0.765625, "learning_rate": 1.3109756097560978e-05, "loss": 2.154, "step": 119 }, { "epoch": 1.0816326530612246, "grad_norm": 0.84765625, "learning_rate": 1.304878048780488e-05, "loss": 2.1844, "step": 120 }, { "epoch": 1.090702947845805, "grad_norm": 0.82421875, "learning_rate": 1.2987804878048782e-05, "loss": 2.1283, "step": 121 }, { "epoch": 1.0997732426303855, "grad_norm": 0.75, "learning_rate": 1.2926829268292684e-05, "loss": 2.0554, "step": 122 }, { "epoch": 1.1088435374149659, "grad_norm": 0.81640625, "learning_rate": 1.2865853658536586e-05, "loss": 2.151, "step": 123 }, { "epoch": 1.1179138321995465, "grad_norm": 0.75390625, "learning_rate": 1.2804878048780488e-05, "loss": 2.1076, "step": 124 }, { "epoch": 1.126984126984127, "grad_norm": 0.79296875, "learning_rate": 1.2743902439024392e-05, "loss": 2.1403, "step": 125 }, { "epoch": 1.1360544217687074, "grad_norm": 0.94921875, "learning_rate": 1.2682926829268294e-05, "loss": 2.1241, "step": 126 }, { "epoch": 1.145124716553288, "grad_norm": 0.94921875, "learning_rate": 1.2621951219512196e-05, "loss": 2.1536, "step": 127 }, { "epoch": 1.1541950113378685, "grad_norm": 0.953125, "learning_rate": 1.2560975609756098e-05, "loss": 2.1969, "step": 128 }, { "epoch": 1.163265306122449, "grad_norm": 0.828125, "learning_rate": 1.25e-05, "loss": 2.15, "step": 129 }, { "epoch": 1.1723356009070294, "grad_norm": 0.80078125, "learning_rate": 1.2439024390243903e-05, "loss": 2.1331, "step": 130 }, { "epoch": 1.18140589569161, "grad_norm": 0.93359375, "learning_rate": 1.2378048780487807e-05, "loss": 2.1181, "step": 131 }, { "epoch": 1.1904761904761905, "grad_norm": 0.77734375, "learning_rate": 1.2317073170731709e-05, "loss": 2.1217, "step": 132 }, { "epoch": 1.199546485260771, "grad_norm": 0.7578125, "learning_rate": 1.225609756097561e-05, "loss": 2.1555, "step": 133 }, { "epoch": 1.2086167800453516, "grad_norm": 0.7265625, "learning_rate": 1.2195121951219513e-05, "loss": 2.0599, "step": 134 }, { "epoch": 1.217687074829932, "grad_norm": 0.84375, "learning_rate": 1.2134146341463415e-05, "loss": 2.1798, "step": 135 }, { "epoch": 1.2267573696145124, "grad_norm": 0.8125, "learning_rate": 1.2073170731707317e-05, "loss": 2.156, "step": 136 }, { "epoch": 1.235827664399093, "grad_norm": 0.84765625, "learning_rate": 1.2012195121951221e-05, "loss": 2.1407, "step": 137 }, { "epoch": 1.2448979591836735, "grad_norm": 0.79296875, "learning_rate": 1.1951219512195123e-05, "loss": 2.1838, "step": 138 }, { "epoch": 1.253968253968254, "grad_norm": 0.81640625, "learning_rate": 1.1890243902439025e-05, "loss": 2.1762, "step": 139 }, { "epoch": 1.2630385487528344, "grad_norm": 0.796875, "learning_rate": 1.1829268292682927e-05, "loss": 2.0928, "step": 140 }, { "epoch": 1.272108843537415, "grad_norm": 0.82421875, "learning_rate": 1.176829268292683e-05, "loss": 2.1282, "step": 141 }, { "epoch": 1.2811791383219955, "grad_norm": 0.91015625, "learning_rate": 1.1707317073170731e-05, "loss": 2.1926, "step": 142 }, { "epoch": 1.290249433106576, "grad_norm": 0.82421875, "learning_rate": 1.1646341463414634e-05, "loss": 2.1752, "step": 143 }, { "epoch": 1.2993197278911564, "grad_norm": 0.82421875, "learning_rate": 1.1585365853658537e-05, "loss": 2.1733, "step": 144 }, { "epoch": 1.308390022675737, "grad_norm": 0.83203125, "learning_rate": 1.152439024390244e-05, "loss": 2.1226, "step": 145 }, { "epoch": 1.3174603174603174, "grad_norm": 0.87109375, "learning_rate": 1.1463414634146342e-05, "loss": 2.0981, "step": 146 }, { "epoch": 1.3265306122448979, "grad_norm": 0.890625, "learning_rate": 1.1402439024390244e-05, "loss": 2.2364, "step": 147 }, { "epoch": 1.3356009070294785, "grad_norm": 0.7890625, "learning_rate": 1.1341463414634146e-05, "loss": 2.157, "step": 148 }, { "epoch": 1.344671201814059, "grad_norm": 0.8828125, "learning_rate": 1.1280487804878048e-05, "loss": 2.1118, "step": 149 }, { "epoch": 1.3537414965986394, "grad_norm": 0.828125, "learning_rate": 1.1219512195121953e-05, "loss": 2.0814, "step": 150 }, { "epoch": 1.36281179138322, "grad_norm": 0.8984375, "learning_rate": 1.1158536585365856e-05, "loss": 2.1626, "step": 151 }, { "epoch": 1.3718820861678005, "grad_norm": 0.90234375, "learning_rate": 1.1097560975609758e-05, "loss": 2.1095, "step": 152 }, { "epoch": 1.380952380952381, "grad_norm": 0.83984375, "learning_rate": 1.103658536585366e-05, "loss": 2.1176, "step": 153 }, { "epoch": 1.3900226757369616, "grad_norm": 0.83984375, "learning_rate": 1.0975609756097562e-05, "loss": 2.1273, "step": 154 }, { "epoch": 1.399092970521542, "grad_norm": 0.82421875, "learning_rate": 1.0914634146341466e-05, "loss": 2.1309, "step": 155 }, { "epoch": 1.4081632653061225, "grad_norm": 0.83203125, "learning_rate": 1.0853658536585368e-05, "loss": 2.1036, "step": 156 }, { "epoch": 1.417233560090703, "grad_norm": 0.80859375, "learning_rate": 1.079268292682927e-05, "loss": 2.1404, "step": 157 }, { "epoch": 1.4263038548752833, "grad_norm": 0.8046875, "learning_rate": 1.0731707317073172e-05, "loss": 2.1466, "step": 158 }, { "epoch": 1.435374149659864, "grad_norm": 0.80859375, "learning_rate": 1.0670731707317074e-05, "loss": 2.1523, "step": 159 }, { "epoch": 1.4444444444444444, "grad_norm": 0.8515625, "learning_rate": 1.0609756097560976e-05, "loss": 2.1296, "step": 160 }, { "epoch": 1.4535147392290249, "grad_norm": 0.90234375, "learning_rate": 1.054878048780488e-05, "loss": 2.1349, "step": 161 }, { "epoch": 1.4625850340136055, "grad_norm": 0.83984375, "learning_rate": 1.0487804878048782e-05, "loss": 2.1676, "step": 162 }, { "epoch": 1.471655328798186, "grad_norm": 1.171875, "learning_rate": 1.0426829268292684e-05, "loss": 2.1887, "step": 163 }, { "epoch": 1.4807256235827664, "grad_norm": 0.83984375, "learning_rate": 1.0365853658536586e-05, "loss": 2.099, "step": 164 }, { "epoch": 1.489795918367347, "grad_norm": 1.0078125, "learning_rate": 1.0304878048780489e-05, "loss": 2.1318, "step": 165 }, { "epoch": 1.4988662131519275, "grad_norm": 0.84375, "learning_rate": 1.024390243902439e-05, "loss": 2.08, "step": 166 }, { "epoch": 1.507936507936508, "grad_norm": 0.8515625, "learning_rate": 1.0182926829268294e-05, "loss": 2.1447, "step": 167 }, { "epoch": 1.5170068027210886, "grad_norm": 0.8984375, "learning_rate": 1.0121951219512197e-05, "loss": 2.1101, "step": 168 }, { "epoch": 1.5260770975056688, "grad_norm": 0.875, "learning_rate": 1.0060975609756099e-05, "loss": 2.1164, "step": 169 }, { "epoch": 1.5351473922902494, "grad_norm": 0.90234375, "learning_rate": 1e-05, "loss": 2.0865, "step": 170 }, { "epoch": 1.54421768707483, "grad_norm": 0.87890625, "learning_rate": 9.939024390243903e-06, "loss": 2.0662, "step": 171 }, { "epoch": 1.5532879818594103, "grad_norm": 0.83203125, "learning_rate": 9.878048780487805e-06, "loss": 2.0951, "step": 172 }, { "epoch": 1.562358276643991, "grad_norm": 0.81640625, "learning_rate": 9.817073170731707e-06, "loss": 2.1146, "step": 173 }, { "epoch": 1.5714285714285714, "grad_norm": 0.78515625, "learning_rate": 9.756097560975611e-06, "loss": 2.0241, "step": 174 }, { "epoch": 1.5804988662131518, "grad_norm": 0.80859375, "learning_rate": 9.695121951219513e-06, "loss": 2.0803, "step": 175 }, { "epoch": 1.5895691609977325, "grad_norm": 0.83984375, "learning_rate": 9.634146341463415e-06, "loss": 2.0791, "step": 176 }, { "epoch": 1.598639455782313, "grad_norm": 0.8984375, "learning_rate": 9.573170731707317e-06, "loss": 2.1377, "step": 177 }, { "epoch": 1.6077097505668934, "grad_norm": 0.84375, "learning_rate": 9.51219512195122e-06, "loss": 2.131, "step": 178 }, { "epoch": 1.616780045351474, "grad_norm": 0.83203125, "learning_rate": 9.451219512195122e-06, "loss": 2.1134, "step": 179 }, { "epoch": 1.6258503401360545, "grad_norm": 0.83984375, "learning_rate": 9.390243902439025e-06, "loss": 2.1256, "step": 180 }, { "epoch": 1.6349206349206349, "grad_norm": 0.8125, "learning_rate": 9.329268292682927e-06, "loss": 2.0204, "step": 181 }, { "epoch": 1.6439909297052155, "grad_norm": 0.82421875, "learning_rate": 9.268292682926831e-06, "loss": 2.0775, "step": 182 }, { "epoch": 1.6530612244897958, "grad_norm": 0.88671875, "learning_rate": 9.207317073170733e-06, "loss": 2.1455, "step": 183 }, { "epoch": 1.6621315192743764, "grad_norm": 0.91015625, "learning_rate": 9.146341463414635e-06, "loss": 2.084, "step": 184 }, { "epoch": 1.671201814058957, "grad_norm": 0.76953125, "learning_rate": 9.085365853658538e-06, "loss": 2.0833, "step": 185 }, { "epoch": 1.6802721088435373, "grad_norm": 0.81640625, "learning_rate": 9.02439024390244e-06, "loss": 2.1452, "step": 186 }, { "epoch": 1.689342403628118, "grad_norm": 0.9375, "learning_rate": 8.963414634146342e-06, "loss": 2.1227, "step": 187 }, { "epoch": 1.6984126984126984, "grad_norm": 0.9140625, "learning_rate": 8.902439024390244e-06, "loss": 2.0747, "step": 188 }, { "epoch": 1.7074829931972788, "grad_norm": 0.8203125, "learning_rate": 8.841463414634148e-06, "loss": 2.0927, "step": 189 }, { "epoch": 1.7165532879818595, "grad_norm": 0.77734375, "learning_rate": 8.78048780487805e-06, "loss": 2.1195, "step": 190 }, { "epoch": 1.72562358276644, "grad_norm": 0.89453125, "learning_rate": 8.719512195121952e-06, "loss": 2.0527, "step": 191 }, { "epoch": 1.7346938775510203, "grad_norm": 0.9296875, "learning_rate": 8.658536585365854e-06, "loss": 2.1034, "step": 192 }, { "epoch": 1.743764172335601, "grad_norm": 0.90234375, "learning_rate": 8.597560975609756e-06, "loss": 2.1175, "step": 193 }, { "epoch": 1.7528344671201814, "grad_norm": 0.8828125, "learning_rate": 8.536585365853658e-06, "loss": 2.0906, "step": 194 }, { "epoch": 1.7619047619047619, "grad_norm": 0.890625, "learning_rate": 8.475609756097562e-06, "loss": 2.1299, "step": 195 }, { "epoch": 1.7709750566893425, "grad_norm": 0.9609375, "learning_rate": 8.414634146341464e-06, "loss": 2.1131, "step": 196 }, { "epoch": 1.780045351473923, "grad_norm": 0.890625, "learning_rate": 8.353658536585366e-06, "loss": 2.1026, "step": 197 }, { "epoch": 1.7891156462585034, "grad_norm": 0.8671875, "learning_rate": 8.292682926829268e-06, "loss": 2.0937, "step": 198 }, { "epoch": 1.798185941043084, "grad_norm": 0.8046875, "learning_rate": 8.23170731707317e-06, "loss": 2.161, "step": 199 }, { "epoch": 1.8072562358276643, "grad_norm": 0.86328125, "learning_rate": 8.170731707317073e-06, "loss": 2.0945, "step": 200 }, { "epoch": 1.816326530612245, "grad_norm": 0.83984375, "learning_rate": 8.109756097560977e-06, "loss": 2.1093, "step": 201 }, { "epoch": 1.8253968253968254, "grad_norm": 0.98828125, "learning_rate": 8.048780487804879e-06, "loss": 2.1666, "step": 202 }, { "epoch": 1.8344671201814058, "grad_norm": 0.8671875, "learning_rate": 7.98780487804878e-06, "loss": 2.123, "step": 203 }, { "epoch": 1.8435374149659864, "grad_norm": 0.87109375, "learning_rate": 7.926829268292685e-06, "loss": 2.0771, "step": 204 }, { "epoch": 1.8526077097505669, "grad_norm": 0.8984375, "learning_rate": 7.865853658536587e-06, "loss": 2.1275, "step": 205 }, { "epoch": 1.8616780045351473, "grad_norm": 0.8046875, "learning_rate": 7.804878048780489e-06, "loss": 2.0512, "step": 206 }, { "epoch": 1.870748299319728, "grad_norm": 0.88671875, "learning_rate": 7.743902439024391e-06, "loss": 2.0877, "step": 207 }, { "epoch": 1.8798185941043084, "grad_norm": 0.85546875, "learning_rate": 7.682926829268293e-06, "loss": 2.1309, "step": 208 }, { "epoch": 1.8888888888888888, "grad_norm": 0.84375, "learning_rate": 7.621951219512196e-06, "loss": 2.038, "step": 209 }, { "epoch": 1.8979591836734695, "grad_norm": 0.83203125, "learning_rate": 7.560975609756098e-06, "loss": 2.1102, "step": 210 }, { "epoch": 1.90702947845805, "grad_norm": 0.875, "learning_rate": 7.500000000000001e-06, "loss": 2.0963, "step": 211 }, { "epoch": 1.9160997732426304, "grad_norm": 0.890625, "learning_rate": 7.439024390243903e-06, "loss": 2.0508, "step": 212 }, { "epoch": 1.925170068027211, "grad_norm": 0.875, "learning_rate": 7.378048780487805e-06, "loss": 2.1489, "step": 213 }, { "epoch": 1.9342403628117912, "grad_norm": 0.80859375, "learning_rate": 7.317073170731707e-06, "loss": 2.0664, "step": 214 }, { "epoch": 1.943310657596372, "grad_norm": 0.90625, "learning_rate": 7.25609756097561e-06, "loss": 2.0465, "step": 215 }, { "epoch": 1.9523809523809523, "grad_norm": 0.83984375, "learning_rate": 7.1951219512195125e-06, "loss": 2.0658, "step": 216 }, { "epoch": 1.9614512471655328, "grad_norm": 0.8359375, "learning_rate": 7.1341463414634146e-06, "loss": 2.1214, "step": 217 }, { "epoch": 1.9705215419501134, "grad_norm": 0.875, "learning_rate": 7.0731707317073175e-06, "loss": 2.1, "step": 218 }, { "epoch": 1.9795918367346939, "grad_norm": 0.96484375, "learning_rate": 7.01219512195122e-06, "loss": 2.1089, "step": 219 }, { "epoch": 1.9886621315192743, "grad_norm": 0.890625, "learning_rate": 6.951219512195122e-06, "loss": 2.1077, "step": 220 }, { "epoch": 1.997732426303855, "grad_norm": 0.859375, "learning_rate": 6.890243902439025e-06, "loss": 2.0788, "step": 221 }, { "epoch": 2.0, "grad_norm": 2.265625, "learning_rate": 6.829268292682928e-06, "loss": 2.2393, "step": 222 }, { "epoch": 2.0, "eval_loss": 2.139087677001953, "eval_model_preparation_time": 0.0172, "eval_runtime": 11.0668, "eval_samples_per_second": 17.711, "eval_steps_per_second": 8.855, "step": 222 }, { "epoch": 2.0090702947845807, "grad_norm": 0.82421875, "learning_rate": 6.768292682926831e-06, "loss": 2.0251, "step": 223 }, { "epoch": 2.018140589569161, "grad_norm": 0.828125, "learning_rate": 6.707317073170733e-06, "loss": 2.0424, "step": 224 }, { "epoch": 2.0272108843537415, "grad_norm": 0.8203125, "learning_rate": 6.646341463414635e-06, "loss": 2.0415, "step": 225 }, { "epoch": 2.036281179138322, "grad_norm": 0.84765625, "learning_rate": 6.585365853658538e-06, "loss": 2.0551, "step": 226 }, { "epoch": 2.0453514739229024, "grad_norm": 0.9296875, "learning_rate": 6.52439024390244e-06, "loss": 2.1507, "step": 227 }, { "epoch": 2.054421768707483, "grad_norm": 0.85546875, "learning_rate": 6.463414634146342e-06, "loss": 2.0214, "step": 228 }, { "epoch": 2.0634920634920633, "grad_norm": 0.80078125, "learning_rate": 6.402439024390244e-06, "loss": 2.064, "step": 229 }, { "epoch": 2.072562358276644, "grad_norm": 0.97265625, "learning_rate": 6.341463414634147e-06, "loss": 2.0992, "step": 230 }, { "epoch": 2.0816326530612246, "grad_norm": 0.87109375, "learning_rate": 6.280487804878049e-06, "loss": 2.0473, "step": 231 }, { "epoch": 2.090702947845805, "grad_norm": 0.8984375, "learning_rate": 6.219512195121951e-06, "loss": 2.1006, "step": 232 }, { "epoch": 2.0997732426303855, "grad_norm": 0.91796875, "learning_rate": 6.158536585365854e-06, "loss": 2.1157, "step": 233 }, { "epoch": 2.108843537414966, "grad_norm": 0.90625, "learning_rate": 6.0975609756097564e-06, "loss": 2.084, "step": 234 }, { "epoch": 2.1179138321995463, "grad_norm": 1.0234375, "learning_rate": 6.0365853658536585e-06, "loss": 2.0984, "step": 235 }, { "epoch": 2.126984126984127, "grad_norm": 0.85546875, "learning_rate": 5.9756097560975615e-06, "loss": 2.056, "step": 236 }, { "epoch": 2.1360544217687076, "grad_norm": 0.88671875, "learning_rate": 5.914634146341464e-06, "loss": 2.0533, "step": 237 }, { "epoch": 2.145124716553288, "grad_norm": 0.875, "learning_rate": 5.853658536585366e-06, "loss": 2.0605, "step": 238 }, { "epoch": 2.1541950113378685, "grad_norm": 0.92578125, "learning_rate": 5.792682926829269e-06, "loss": 2.0865, "step": 239 }, { "epoch": 2.163265306122449, "grad_norm": 0.8671875, "learning_rate": 5.731707317073171e-06, "loss": 2.0988, "step": 240 }, { "epoch": 2.1723356009070294, "grad_norm": 0.921875, "learning_rate": 5.670731707317073e-06, "loss": 2.1342, "step": 241 }, { "epoch": 2.18140589569161, "grad_norm": 0.84375, "learning_rate": 5.609756097560977e-06, "loss": 2.0634, "step": 242 }, { "epoch": 2.1904761904761907, "grad_norm": 0.99609375, "learning_rate": 5.548780487804879e-06, "loss": 2.0547, "step": 243 }, { "epoch": 2.199546485260771, "grad_norm": 0.86328125, "learning_rate": 5.487804878048781e-06, "loss": 2.1118, "step": 244 }, { "epoch": 2.2086167800453516, "grad_norm": 0.8671875, "learning_rate": 5.426829268292684e-06, "loss": 2.0422, "step": 245 }, { "epoch": 2.2176870748299318, "grad_norm": 0.87109375, "learning_rate": 5.365853658536586e-06, "loss": 2.0844, "step": 246 }, { "epoch": 2.2267573696145124, "grad_norm": 0.85546875, "learning_rate": 5.304878048780488e-06, "loss": 2.0697, "step": 247 }, { "epoch": 2.235827664399093, "grad_norm": 0.859375, "learning_rate": 5.243902439024391e-06, "loss": 2.0595, "step": 248 }, { "epoch": 2.2448979591836733, "grad_norm": 0.86328125, "learning_rate": 5.182926829268293e-06, "loss": 2.0494, "step": 249 }, { "epoch": 2.253968253968254, "grad_norm": 0.81640625, "learning_rate": 5.121951219512195e-06, "loss": 2.052, "step": 250 }, { "epoch": 2.2630385487528346, "grad_norm": 0.94140625, "learning_rate": 5.060975609756098e-06, "loss": 2.0576, "step": 251 }, { "epoch": 2.272108843537415, "grad_norm": 0.9765625, "learning_rate": 5e-06, "loss": 2.078, "step": 252 }, { "epoch": 2.2811791383219955, "grad_norm": 0.84765625, "learning_rate": 4.9390243902439025e-06, "loss": 2.0474, "step": 253 }, { "epoch": 2.290249433106576, "grad_norm": 0.921875, "learning_rate": 4.8780487804878055e-06, "loss": 2.0628, "step": 254 }, { "epoch": 2.2993197278911564, "grad_norm": 0.8828125, "learning_rate": 4.817073170731708e-06, "loss": 2.0422, "step": 255 }, { "epoch": 2.308390022675737, "grad_norm": 0.99609375, "learning_rate": 4.75609756097561e-06, "loss": 2.0495, "step": 256 }, { "epoch": 2.317460317460317, "grad_norm": 0.84765625, "learning_rate": 4.695121951219513e-06, "loss": 2.1065, "step": 257 }, { "epoch": 2.326530612244898, "grad_norm": 0.84375, "learning_rate": 4.634146341463416e-06, "loss": 2.007, "step": 258 }, { "epoch": 2.3356009070294785, "grad_norm": 0.9375, "learning_rate": 4.573170731707318e-06, "loss": 2.0885, "step": 259 }, { "epoch": 2.3446712018140587, "grad_norm": 0.90234375, "learning_rate": 4.51219512195122e-06, "loss": 2.035, "step": 260 }, { "epoch": 2.3537414965986394, "grad_norm": 0.9140625, "learning_rate": 4.451219512195122e-06, "loss": 2.0861, "step": 261 }, { "epoch": 2.36281179138322, "grad_norm": 0.875, "learning_rate": 4.390243902439025e-06, "loss": 2.0555, "step": 262 }, { "epoch": 2.3718820861678003, "grad_norm": 0.89453125, "learning_rate": 4.329268292682927e-06, "loss": 2.0892, "step": 263 }, { "epoch": 2.380952380952381, "grad_norm": 0.87890625, "learning_rate": 4.268292682926829e-06, "loss": 2.0651, "step": 264 }, { "epoch": 2.3900226757369616, "grad_norm": 0.91015625, "learning_rate": 4.207317073170732e-06, "loss": 2.0115, "step": 265 }, { "epoch": 2.399092970521542, "grad_norm": 0.921875, "learning_rate": 4.146341463414634e-06, "loss": 2.0983, "step": 266 }, { "epoch": 2.4081632653061225, "grad_norm": 0.8828125, "learning_rate": 4.085365853658536e-06, "loss": 2.0977, "step": 267 }, { "epoch": 2.417233560090703, "grad_norm": 0.88671875, "learning_rate": 4.024390243902439e-06, "loss": 1.9982, "step": 268 }, { "epoch": 2.4263038548752833, "grad_norm": 0.86328125, "learning_rate": 3.963414634146342e-06, "loss": 2.0723, "step": 269 }, { "epoch": 2.435374149659864, "grad_norm": 0.9296875, "learning_rate": 3.902439024390244e-06, "loss": 2.0516, "step": 270 }, { "epoch": 2.4444444444444446, "grad_norm": 0.84765625, "learning_rate": 3.8414634146341465e-06, "loss": 2.0384, "step": 271 }, { "epoch": 2.453514739229025, "grad_norm": 0.921875, "learning_rate": 3.780487804878049e-06, "loss": 2.0291, "step": 272 }, { "epoch": 2.4625850340136055, "grad_norm": 0.859375, "learning_rate": 3.7195121951219516e-06, "loss": 2.0624, "step": 273 }, { "epoch": 2.471655328798186, "grad_norm": 0.875, "learning_rate": 3.6585365853658537e-06, "loss": 2.0481, "step": 274 }, { "epoch": 2.4807256235827664, "grad_norm": 0.84375, "learning_rate": 3.5975609756097562e-06, "loss": 2.0663, "step": 275 }, { "epoch": 2.489795918367347, "grad_norm": 0.88671875, "learning_rate": 3.5365853658536588e-06, "loss": 2.0932, "step": 276 }, { "epoch": 2.4988662131519273, "grad_norm": 0.91796875, "learning_rate": 3.475609756097561e-06, "loss": 2.0711, "step": 277 }, { "epoch": 2.507936507936508, "grad_norm": 0.8828125, "learning_rate": 3.414634146341464e-06, "loss": 2.08, "step": 278 }, { "epoch": 2.5170068027210886, "grad_norm": 0.984375, "learning_rate": 3.3536585365853664e-06, "loss": 2.0578, "step": 279 }, { "epoch": 2.526077097505669, "grad_norm": 0.85546875, "learning_rate": 3.292682926829269e-06, "loss": 2.0657, "step": 280 }, { "epoch": 2.5351473922902494, "grad_norm": 0.83203125, "learning_rate": 3.231707317073171e-06, "loss": 2.0596, "step": 281 }, { "epoch": 2.54421768707483, "grad_norm": 0.87890625, "learning_rate": 3.1707317073170736e-06, "loss": 2.0006, "step": 282 }, { "epoch": 2.5532879818594103, "grad_norm": 0.87890625, "learning_rate": 3.1097560975609757e-06, "loss": 2.1218, "step": 283 }, { "epoch": 2.562358276643991, "grad_norm": 0.8828125, "learning_rate": 3.0487804878048782e-06, "loss": 1.9871, "step": 284 }, { "epoch": 2.571428571428571, "grad_norm": 0.8515625, "learning_rate": 2.9878048780487808e-06, "loss": 2.0153, "step": 285 }, { "epoch": 2.580498866213152, "grad_norm": 0.828125, "learning_rate": 2.926829268292683e-06, "loss": 2.0725, "step": 286 }, { "epoch": 2.5895691609977325, "grad_norm": 0.875, "learning_rate": 2.8658536585365854e-06, "loss": 2.1162, "step": 287 }, { "epoch": 2.5986394557823127, "grad_norm": 0.8671875, "learning_rate": 2.8048780487804884e-06, "loss": 2.0886, "step": 288 }, { "epoch": 2.6077097505668934, "grad_norm": 0.90625, "learning_rate": 2.7439024390243905e-06, "loss": 2.0458, "step": 289 }, { "epoch": 2.616780045351474, "grad_norm": 0.8671875, "learning_rate": 2.682926829268293e-06, "loss": 2.0301, "step": 290 }, { "epoch": 2.6258503401360542, "grad_norm": 0.8671875, "learning_rate": 2.6219512195121956e-06, "loss": 2.0675, "step": 291 }, { "epoch": 2.634920634920635, "grad_norm": 0.90234375, "learning_rate": 2.5609756097560977e-06, "loss": 2.0651, "step": 292 }, { "epoch": 2.6439909297052155, "grad_norm": 0.84375, "learning_rate": 2.5e-06, "loss": 2.0602, "step": 293 }, { "epoch": 2.6530612244897958, "grad_norm": 0.859375, "learning_rate": 2.4390243902439027e-06, "loss": 2.061, "step": 294 }, { "epoch": 2.6621315192743764, "grad_norm": 0.82421875, "learning_rate": 2.378048780487805e-06, "loss": 2.0801, "step": 295 }, { "epoch": 2.671201814058957, "grad_norm": 0.92578125, "learning_rate": 2.317073170731708e-06, "loss": 2.0709, "step": 296 }, { "epoch": 2.6802721088435373, "grad_norm": 0.87890625, "learning_rate": 2.25609756097561e-06, "loss": 2.0765, "step": 297 }, { "epoch": 2.689342403628118, "grad_norm": 0.859375, "learning_rate": 2.1951219512195125e-06, "loss": 2.0208, "step": 298 }, { "epoch": 2.6984126984126986, "grad_norm": 0.87109375, "learning_rate": 2.1341463414634146e-06, "loss": 2.0137, "step": 299 }, { "epoch": 2.707482993197279, "grad_norm": 0.859375, "learning_rate": 2.073170731707317e-06, "loss": 2.0745, "step": 300 }, { "epoch": 2.7165532879818595, "grad_norm": 0.9453125, "learning_rate": 2.0121951219512197e-06, "loss": 2.0442, "step": 301 }, { "epoch": 2.72562358276644, "grad_norm": 0.93359375, "learning_rate": 1.951219512195122e-06, "loss": 2.0374, "step": 302 }, { "epoch": 2.7346938775510203, "grad_norm": 0.90625, "learning_rate": 1.8902439024390245e-06, "loss": 2.1019, "step": 303 }, { "epoch": 2.743764172335601, "grad_norm": 0.92578125, "learning_rate": 1.8292682926829268e-06, "loss": 2.1039, "step": 304 }, { "epoch": 2.7528344671201816, "grad_norm": 0.83984375, "learning_rate": 1.7682926829268294e-06, "loss": 2.0586, "step": 305 }, { "epoch": 2.761904761904762, "grad_norm": 0.85546875, "learning_rate": 1.707317073170732e-06, "loss": 2.0117, "step": 306 }, { "epoch": 2.7709750566893425, "grad_norm": 0.875, "learning_rate": 1.6463414634146345e-06, "loss": 2.0583, "step": 307 }, { "epoch": 2.780045351473923, "grad_norm": 0.97265625, "learning_rate": 1.5853658536585368e-06, "loss": 2.0683, "step": 308 }, { "epoch": 2.7891156462585034, "grad_norm": 0.83984375, "learning_rate": 1.5243902439024391e-06, "loss": 2.0711, "step": 309 }, { "epoch": 2.798185941043084, "grad_norm": 0.83984375, "learning_rate": 1.4634146341463414e-06, "loss": 2.0822, "step": 310 }, { "epoch": 2.8072562358276643, "grad_norm": 0.96875, "learning_rate": 1.4024390243902442e-06, "loss": 2.0757, "step": 311 }, { "epoch": 2.816326530612245, "grad_norm": 0.8515625, "learning_rate": 1.3414634146341465e-06, "loss": 2.1005, "step": 312 }, { "epoch": 2.825396825396825, "grad_norm": 0.875, "learning_rate": 1.2804878048780488e-06, "loss": 2.0856, "step": 313 }, { "epoch": 2.834467120181406, "grad_norm": 0.9375, "learning_rate": 1.2195121951219514e-06, "loss": 2.0978, "step": 314 }, { "epoch": 2.8435374149659864, "grad_norm": 0.83203125, "learning_rate": 1.158536585365854e-06, "loss": 2.05, "step": 315 }, { "epoch": 2.8526077097505667, "grad_norm": 0.86328125, "learning_rate": 1.0975609756097562e-06, "loss": 2.064, "step": 316 }, { "epoch": 2.8616780045351473, "grad_norm": 0.890625, "learning_rate": 1.0365853658536586e-06, "loss": 2.0545, "step": 317 }, { "epoch": 2.870748299319728, "grad_norm": 0.98828125, "learning_rate": 9.75609756097561e-07, "loss": 2.108, "step": 318 }, { "epoch": 2.879818594104308, "grad_norm": 0.84375, "learning_rate": 9.146341463414634e-07, "loss": 2.0705, "step": 319 }, { "epoch": 2.888888888888889, "grad_norm": 0.87109375, "learning_rate": 8.53658536585366e-07, "loss": 2.1163, "step": 320 }, { "epoch": 2.8979591836734695, "grad_norm": 0.8671875, "learning_rate": 7.926829268292684e-07, "loss": 2.051, "step": 321 }, { "epoch": 2.9070294784580497, "grad_norm": 0.94140625, "learning_rate": 7.317073170731707e-07, "loss": 2.0958, "step": 322 }, { "epoch": 2.9160997732426304, "grad_norm": 0.875, "learning_rate": 6.707317073170733e-07, "loss": 2.1143, "step": 323 }, { "epoch": 2.925170068027211, "grad_norm": 0.83984375, "learning_rate": 6.097560975609757e-07, "loss": 2.0107, "step": 324 }, { "epoch": 2.9342403628117912, "grad_norm": 0.80859375, "learning_rate": 5.487804878048781e-07, "loss": 2.0394, "step": 325 }, { "epoch": 2.943310657596372, "grad_norm": 0.83984375, "learning_rate": 4.878048780487805e-07, "loss": 2.0891, "step": 326 }, { "epoch": 2.9523809523809526, "grad_norm": 0.80859375, "learning_rate": 4.26829268292683e-07, "loss": 2.007, "step": 327 }, { "epoch": 2.9614512471655328, "grad_norm": 0.859375, "learning_rate": 3.6585365853658536e-07, "loss": 2.0731, "step": 328 }, { "epoch": 2.9705215419501134, "grad_norm": 0.87109375, "learning_rate": 3.0487804878048784e-07, "loss": 2.0552, "step": 329 }, { "epoch": 2.979591836734694, "grad_norm": 0.9453125, "learning_rate": 2.439024390243903e-07, "loss": 2.0833, "step": 330 }, { "epoch": 2.9886621315192743, "grad_norm": 0.87890625, "learning_rate": 1.8292682926829268e-07, "loss": 2.0287, "step": 331 }, { "epoch": 2.997732426303855, "grad_norm": 0.8359375, "learning_rate": 1.2195121951219514e-07, "loss": 2.0579, "step": 332 }, { "epoch": 3.0, "grad_norm": 2.234375, "learning_rate": 6.097560975609757e-08, "loss": 2.0824, "step": 333 } ], "logging_steps": 1, "max_steps": 333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.870273289084211e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }