{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 299, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033444816053511705, "grad_norm": 25.75, "learning_rate": 0.0, "loss": 3.0181, "step": 1 }, { "epoch": 0.006688963210702341, "grad_norm": 27.75, "learning_rate": 2.0000000000000003e-06, "loss": 3.1574, "step": 2 }, { "epoch": 0.010033444816053512, "grad_norm": 29.0, "learning_rate": 4.000000000000001e-06, "loss": 2.9793, "step": 3 }, { "epoch": 0.013377926421404682, "grad_norm": 26.625, "learning_rate": 6e-06, "loss": 2.8721, "step": 4 }, { "epoch": 0.016722408026755852, "grad_norm": 18.5, "learning_rate": 8.000000000000001e-06, "loss": 2.6137, "step": 5 }, { "epoch": 0.020066889632107024, "grad_norm": 10.8125, "learning_rate": 1e-05, "loss": 2.4216, "step": 6 }, { "epoch": 0.023411371237458192, "grad_norm": 9.625, "learning_rate": 9.965986394557824e-06, "loss": 2.4293, "step": 7 }, { "epoch": 0.026755852842809364, "grad_norm": 7.8125, "learning_rate": 9.931972789115647e-06, "loss": 2.215, "step": 8 }, { "epoch": 0.030100334448160536, "grad_norm": 7.84375, "learning_rate": 9.89795918367347e-06, "loss": 2.1725, "step": 9 }, { "epoch": 0.033444816053511704, "grad_norm": 8.0625, "learning_rate": 9.863945578231294e-06, "loss": 2.1844, "step": 10 }, { "epoch": 0.033444816053511704, "eval_loss": 2.0784454345703125, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.1202, "eval_samples_per_second": 47.611, "eval_steps_per_second": 23.806, "step": 10 }, { "epoch": 0.03678929765886288, "grad_norm": 7.71875, "learning_rate": 9.829931972789115e-06, "loss": 2.1589, "step": 11 }, { "epoch": 0.04013377926421405, "grad_norm": 7.125, "learning_rate": 9.795918367346939e-06, "loss": 2.0039, "step": 12 }, { "epoch": 0.043478260869565216, "grad_norm": 11.375, "learning_rate": 9.761904761904762e-06, "loss": 1.9245, "step": 13 }, { "epoch": 0.046822742474916385, "grad_norm": 7.6875, "learning_rate": 9.727891156462585e-06, "loss": 2.0036, "step": 14 }, { "epoch": 0.05016722408026756, "grad_norm": 6.5625, "learning_rate": 9.693877551020408e-06, "loss": 1.8739, "step": 15 }, { "epoch": 0.05351170568561873, "grad_norm": 7.625, "learning_rate": 9.659863945578232e-06, "loss": 1.7739, "step": 16 }, { "epoch": 0.056856187290969896, "grad_norm": 6.59375, "learning_rate": 9.625850340136055e-06, "loss": 1.8338, "step": 17 }, { "epoch": 0.06020066889632107, "grad_norm": 6.8125, "learning_rate": 9.591836734693878e-06, "loss": 1.8526, "step": 18 }, { "epoch": 0.06354515050167224, "grad_norm": 6.59375, "learning_rate": 9.557823129251701e-06, "loss": 1.864, "step": 19 }, { "epoch": 0.06688963210702341, "grad_norm": 7.28125, "learning_rate": 9.523809523809525e-06, "loss": 1.8338, "step": 20 }, { "epoch": 0.06688963210702341, "eval_loss": 1.762088418006897, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.9271, "eval_samples_per_second": 46.129, "eval_steps_per_second": 23.065, "step": 20 }, { "epoch": 0.07023411371237458, "grad_norm": 7.1875, "learning_rate": 9.489795918367348e-06, "loss": 1.8749, "step": 21 }, { "epoch": 0.07357859531772576, "grad_norm": 7.9375, "learning_rate": 9.455782312925171e-06, "loss": 1.8266, "step": 22 }, { "epoch": 0.07692307692307693, "grad_norm": 9.75, "learning_rate": 9.421768707482995e-06, "loss": 1.7542, "step": 23 }, { "epoch": 0.0802675585284281, "grad_norm": 6.84375, "learning_rate": 9.387755102040818e-06, "loss": 1.8181, "step": 24 }, { "epoch": 0.08361204013377926, "grad_norm": 6.625, "learning_rate": 9.353741496598641e-06, "loss": 1.6258, "step": 25 }, { "epoch": 0.08695652173913043, "grad_norm": 6.53125, "learning_rate": 9.319727891156464e-06, "loss": 1.9209, "step": 26 }, { "epoch": 0.0903010033444816, "grad_norm": 6.3125, "learning_rate": 9.285714285714288e-06, "loss": 1.5183, "step": 27 }, { "epoch": 0.09364548494983277, "grad_norm": 7.125, "learning_rate": 9.251700680272109e-06, "loss": 1.6856, "step": 28 }, { "epoch": 0.09698996655518395, "grad_norm": 5.75, "learning_rate": 9.217687074829932e-06, "loss": 1.5802, "step": 29 }, { "epoch": 0.10033444816053512, "grad_norm": 7.0625, "learning_rate": 9.183673469387756e-06, "loss": 1.8024, "step": 30 }, { "epoch": 0.10033444816053512, "eval_loss": 1.654666543006897, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.0658, "eval_samples_per_second": 47.714, "eval_steps_per_second": 23.857, "step": 30 }, { "epoch": 0.10367892976588629, "grad_norm": 6.40625, "learning_rate": 9.149659863945579e-06, "loss": 1.6294, "step": 31 }, { "epoch": 0.10702341137123746, "grad_norm": 6.8125, "learning_rate": 9.115646258503402e-06, "loss": 1.481, "step": 32 }, { "epoch": 0.11036789297658862, "grad_norm": 5.6875, "learning_rate": 9.081632653061225e-06, "loss": 1.6586, "step": 33 }, { "epoch": 0.11371237458193979, "grad_norm": 6.21875, "learning_rate": 9.047619047619049e-06, "loss": 1.5246, "step": 34 }, { "epoch": 0.11705685618729098, "grad_norm": 7.71875, "learning_rate": 9.013605442176872e-06, "loss": 1.6757, "step": 35 }, { "epoch": 0.12040133779264214, "grad_norm": 6.5625, "learning_rate": 8.979591836734695e-06, "loss": 1.6568, "step": 36 }, { "epoch": 0.12374581939799331, "grad_norm": 6.3125, "learning_rate": 8.945578231292518e-06, "loss": 1.6195, "step": 37 }, { "epoch": 0.12709030100334448, "grad_norm": 7.3125, "learning_rate": 8.91156462585034e-06, "loss": 1.6203, "step": 38 }, { "epoch": 0.13043478260869565, "grad_norm": 6.71875, "learning_rate": 8.877551020408163e-06, "loss": 1.5925, "step": 39 }, { "epoch": 0.13377926421404682, "grad_norm": 5.96875, "learning_rate": 8.843537414965987e-06, "loss": 1.6298, "step": 40 }, { "epoch": 0.13377926421404682, "eval_loss": 1.6065762042999268, "eval_model_preparation_time": 0.0182, "eval_runtime": 26.7591, "eval_samples_per_second": 44.695, "eval_steps_per_second": 22.348, "step": 40 }, { "epoch": 0.13712374581939799, "grad_norm": 6.3125, "learning_rate": 8.80952380952381e-06, "loss": 1.5707, "step": 41 }, { "epoch": 0.14046822742474915, "grad_norm": 6.65625, "learning_rate": 8.775510204081633e-06, "loss": 1.5502, "step": 42 }, { "epoch": 0.14381270903010032, "grad_norm": 5.78125, "learning_rate": 8.741496598639456e-06, "loss": 1.5849, "step": 43 }, { "epoch": 0.14715719063545152, "grad_norm": 6.21875, "learning_rate": 8.70748299319728e-06, "loss": 1.595, "step": 44 }, { "epoch": 0.1505016722408027, "grad_norm": 7.09375, "learning_rate": 8.673469387755103e-06, "loss": 1.6047, "step": 45 }, { "epoch": 0.15384615384615385, "grad_norm": 5.625, "learning_rate": 8.639455782312926e-06, "loss": 1.4065, "step": 46 }, { "epoch": 0.15719063545150502, "grad_norm": 6.90625, "learning_rate": 8.60544217687075e-06, "loss": 1.6029, "step": 47 }, { "epoch": 0.1605351170568562, "grad_norm": 6.03125, "learning_rate": 8.571428571428571e-06, "loss": 1.5176, "step": 48 }, { "epoch": 0.16387959866220736, "grad_norm": 6.1875, "learning_rate": 8.537414965986394e-06, "loss": 1.5355, "step": 49 }, { "epoch": 0.16722408026755853, "grad_norm": 6.28125, "learning_rate": 8.503401360544217e-06, "loss": 1.5509, "step": 50 }, { "epoch": 0.16722408026755853, "eval_loss": 1.5824450254440308, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.6606, "eval_samples_per_second": 48.498, "eval_steps_per_second": 24.249, "step": 50 }, { "epoch": 0.1705685618729097, "grad_norm": 7.0, "learning_rate": 8.469387755102042e-06, "loss": 1.6003, "step": 51 }, { "epoch": 0.17391304347826086, "grad_norm": 6.0, "learning_rate": 8.435374149659866e-06, "loss": 1.4908, "step": 52 }, { "epoch": 0.17725752508361203, "grad_norm": 6.84375, "learning_rate": 8.401360544217689e-06, "loss": 1.6432, "step": 53 }, { "epoch": 0.1806020066889632, "grad_norm": 5.78125, "learning_rate": 8.36734693877551e-06, "loss": 1.4956, "step": 54 }, { "epoch": 0.18394648829431437, "grad_norm": 5.53125, "learning_rate": 8.333333333333334e-06, "loss": 1.459, "step": 55 }, { "epoch": 0.18729096989966554, "grad_norm": 7.21875, "learning_rate": 8.299319727891157e-06, "loss": 1.6542, "step": 56 }, { "epoch": 0.19063545150501673, "grad_norm": 6.75, "learning_rate": 8.26530612244898e-06, "loss": 1.5814, "step": 57 }, { "epoch": 0.1939799331103679, "grad_norm": 7.59375, "learning_rate": 8.231292517006804e-06, "loss": 1.5807, "step": 58 }, { "epoch": 0.19732441471571907, "grad_norm": 6.03125, "learning_rate": 8.197278911564627e-06, "loss": 1.5321, "step": 59 }, { "epoch": 0.20066889632107024, "grad_norm": 6.46875, "learning_rate": 8.16326530612245e-06, "loss": 1.6237, "step": 60 }, { "epoch": 0.20066889632107024, "eval_loss": 1.5487959384918213, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.9431, "eval_samples_per_second": 47.949, "eval_steps_per_second": 23.975, "step": 60 }, { "epoch": 0.2040133779264214, "grad_norm": 6.0625, "learning_rate": 8.129251700680273e-06, "loss": 1.6258, "step": 61 }, { "epoch": 0.20735785953177258, "grad_norm": 6.0, "learning_rate": 8.095238095238097e-06, "loss": 1.5074, "step": 62 }, { "epoch": 0.21070234113712374, "grad_norm": 5.46875, "learning_rate": 8.06122448979592e-06, "loss": 1.4395, "step": 63 }, { "epoch": 0.2140468227424749, "grad_norm": 6.125, "learning_rate": 8.027210884353741e-06, "loss": 1.5095, "step": 64 }, { "epoch": 0.21739130434782608, "grad_norm": 5.6875, "learning_rate": 7.993197278911565e-06, "loss": 1.477, "step": 65 }, { "epoch": 0.22073578595317725, "grad_norm": 6.0625, "learning_rate": 7.959183673469388e-06, "loss": 1.3592, "step": 66 }, { "epoch": 0.22408026755852842, "grad_norm": 5.6875, "learning_rate": 7.925170068027211e-06, "loss": 1.535, "step": 67 }, { "epoch": 0.22742474916387959, "grad_norm": 6.4375, "learning_rate": 7.891156462585034e-06, "loss": 1.5033, "step": 68 }, { "epoch": 0.23076923076923078, "grad_norm": 5.875, "learning_rate": 7.857142857142858e-06, "loss": 1.4712, "step": 69 }, { "epoch": 0.23411371237458195, "grad_norm": 6.71875, "learning_rate": 7.823129251700681e-06, "loss": 1.543, "step": 70 }, { "epoch": 0.23411371237458195, "eval_loss": 1.5336378812789917, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.7209, "eval_samples_per_second": 48.38, "eval_steps_per_second": 24.19, "step": 70 }, { "epoch": 0.23745819397993312, "grad_norm": 6.03125, "learning_rate": 7.789115646258504e-06, "loss": 1.4759, "step": 71 }, { "epoch": 0.2408026755852843, "grad_norm": 6.59375, "learning_rate": 7.755102040816327e-06, "loss": 1.7235, "step": 72 }, { "epoch": 0.24414715719063546, "grad_norm": 6.09375, "learning_rate": 7.72108843537415e-06, "loss": 1.5825, "step": 73 }, { "epoch": 0.24749163879598662, "grad_norm": 5.75, "learning_rate": 7.687074829931972e-06, "loss": 1.5194, "step": 74 }, { "epoch": 0.2508361204013378, "grad_norm": 5.875, "learning_rate": 7.653061224489796e-06, "loss": 1.5582, "step": 75 }, { "epoch": 0.25418060200668896, "grad_norm": 6.21875, "learning_rate": 7.61904761904762e-06, "loss": 1.5348, "step": 76 }, { "epoch": 0.25752508361204013, "grad_norm": 5.875, "learning_rate": 7.585034013605442e-06, "loss": 1.6259, "step": 77 }, { "epoch": 0.2608695652173913, "grad_norm": 6.3125, "learning_rate": 7.551020408163265e-06, "loss": 1.5334, "step": 78 }, { "epoch": 0.26421404682274247, "grad_norm": 6.78125, "learning_rate": 7.5170068027210886e-06, "loss": 1.6789, "step": 79 }, { "epoch": 0.26755852842809363, "grad_norm": 5.5, "learning_rate": 7.482993197278913e-06, "loss": 1.4375, "step": 80 }, { "epoch": 0.26755852842809363, "eval_loss": 1.5223881006240845, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.6231, "eval_samples_per_second": 48.572, "eval_steps_per_second": 24.286, "step": 80 }, { "epoch": 0.2709030100334448, "grad_norm": 5.4375, "learning_rate": 7.448979591836736e-06, "loss": 1.4604, "step": 81 }, { "epoch": 0.27424749163879597, "grad_norm": 5.46875, "learning_rate": 7.414965986394559e-06, "loss": 1.4198, "step": 82 }, { "epoch": 0.27759197324414714, "grad_norm": 5.4375, "learning_rate": 7.380952380952382e-06, "loss": 1.5217, "step": 83 }, { "epoch": 0.2809364548494983, "grad_norm": 6.09375, "learning_rate": 7.346938775510205e-06, "loss": 1.5475, "step": 84 }, { "epoch": 0.2842809364548495, "grad_norm": 5.78125, "learning_rate": 7.312925170068028e-06, "loss": 1.4847, "step": 85 }, { "epoch": 0.28762541806020064, "grad_norm": 5.65625, "learning_rate": 7.278911564625851e-06, "loss": 1.4862, "step": 86 }, { "epoch": 0.2909698996655518, "grad_norm": 5.59375, "learning_rate": 7.244897959183675e-06, "loss": 1.6432, "step": 87 }, { "epoch": 0.29431438127090304, "grad_norm": 5.53125, "learning_rate": 7.210884353741497e-06, "loss": 1.315, "step": 88 }, { "epoch": 0.2976588628762542, "grad_norm": 5.9375, "learning_rate": 7.17687074829932e-06, "loss": 1.5436, "step": 89 }, { "epoch": 0.3010033444816054, "grad_norm": 6.21875, "learning_rate": 7.1428571428571436e-06, "loss": 1.5598, "step": 90 }, { "epoch": 0.3010033444816054, "eval_loss": 1.5097979307174683, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.8836, "eval_samples_per_second": 48.064, "eval_steps_per_second": 24.032, "step": 90 }, { "epoch": 0.30434782608695654, "grad_norm": 5.78125, "learning_rate": 7.108843537414967e-06, "loss": 1.3633, "step": 91 }, { "epoch": 0.3076923076923077, "grad_norm": 5.8125, "learning_rate": 7.07482993197279e-06, "loss": 1.5061, "step": 92 }, { "epoch": 0.3110367892976589, "grad_norm": 5.4375, "learning_rate": 7.0408163265306125e-06, "loss": 1.4504, "step": 93 }, { "epoch": 0.31438127090301005, "grad_norm": 5.375, "learning_rate": 7.006802721088436e-06, "loss": 1.379, "step": 94 }, { "epoch": 0.3177257525083612, "grad_norm": 5.5625, "learning_rate": 6.972789115646259e-06, "loss": 1.4414, "step": 95 }, { "epoch": 0.3210702341137124, "grad_norm": 6.0, "learning_rate": 6.938775510204082e-06, "loss": 1.4212, "step": 96 }, { "epoch": 0.32441471571906355, "grad_norm": 5.4375, "learning_rate": 6.9047619047619055e-06, "loss": 1.4361, "step": 97 }, { "epoch": 0.3277591973244147, "grad_norm": 5.78125, "learning_rate": 6.870748299319728e-06, "loss": 1.4397, "step": 98 }, { "epoch": 0.3311036789297659, "grad_norm": 5.5, "learning_rate": 6.836734693877551e-06, "loss": 1.3729, "step": 99 }, { "epoch": 0.33444816053511706, "grad_norm": 6.1875, "learning_rate": 6.8027210884353745e-06, "loss": 1.5183, "step": 100 }, { "epoch": 0.33444816053511706, "eval_loss": 1.501234769821167, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.85, "eval_samples_per_second": 48.129, "eval_steps_per_second": 24.064, "step": 100 }, { "epoch": 0.3377926421404682, "grad_norm": 5.71875, "learning_rate": 6.768707482993198e-06, "loss": 1.4924, "step": 101 }, { "epoch": 0.3411371237458194, "grad_norm": 6.1875, "learning_rate": 6.734693877551021e-06, "loss": 1.4877, "step": 102 }, { "epoch": 0.34448160535117056, "grad_norm": 5.84375, "learning_rate": 6.700680272108843e-06, "loss": 1.461, "step": 103 }, { "epoch": 0.34782608695652173, "grad_norm": 6.03125, "learning_rate": 6.666666666666667e-06, "loss": 1.4635, "step": 104 }, { "epoch": 0.3511705685618729, "grad_norm": 5.8125, "learning_rate": 6.63265306122449e-06, "loss": 1.4172, "step": 105 }, { "epoch": 0.35451505016722407, "grad_norm": 5.90625, "learning_rate": 6.598639455782313e-06, "loss": 1.4521, "step": 106 }, { "epoch": 0.35785953177257523, "grad_norm": 6.4375, "learning_rate": 6.5646258503401364e-06, "loss": 1.6308, "step": 107 }, { "epoch": 0.3612040133779264, "grad_norm": 5.8125, "learning_rate": 6.530612244897959e-06, "loss": 1.5095, "step": 108 }, { "epoch": 0.36454849498327757, "grad_norm": 6.0625, "learning_rate": 6.496598639455784e-06, "loss": 1.5521, "step": 109 }, { "epoch": 0.36789297658862874, "grad_norm": 5.59375, "learning_rate": 6.462585034013606e-06, "loss": 1.5551, "step": 110 }, { "epoch": 0.36789297658862874, "eval_loss": 1.4953867197036743, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.8035, "eval_samples_per_second": 48.219, "eval_steps_per_second": 24.109, "step": 110 }, { "epoch": 0.3712374581939799, "grad_norm": 6.59375, "learning_rate": 6.4285714285714295e-06, "loss": 1.5105, "step": 111 }, { "epoch": 0.3745819397993311, "grad_norm": 5.6875, "learning_rate": 6.394557823129253e-06, "loss": 1.4385, "step": 112 }, { "epoch": 0.3779264214046823, "grad_norm": 5.90625, "learning_rate": 6.360544217687076e-06, "loss": 1.591, "step": 113 }, { "epoch": 0.38127090301003347, "grad_norm": 5.8125, "learning_rate": 6.326530612244899e-06, "loss": 1.4995, "step": 114 }, { "epoch": 0.38461538461538464, "grad_norm": 5.28125, "learning_rate": 6.292517006802722e-06, "loss": 1.5145, "step": 115 }, { "epoch": 0.3879598662207358, "grad_norm": 5.375, "learning_rate": 6.258503401360545e-06, "loss": 1.4962, "step": 116 }, { "epoch": 0.391304347826087, "grad_norm": 6.09375, "learning_rate": 6.224489795918368e-06, "loss": 1.4014, "step": 117 }, { "epoch": 0.39464882943143814, "grad_norm": 6.59375, "learning_rate": 6.1904761904761914e-06, "loss": 1.4566, "step": 118 }, { "epoch": 0.3979933110367893, "grad_norm": 5.75, "learning_rate": 6.156462585034015e-06, "loss": 1.6276, "step": 119 }, { "epoch": 0.4013377926421405, "grad_norm": 5.65625, "learning_rate": 6.122448979591837e-06, "loss": 1.4153, "step": 120 }, { "epoch": 0.4013377926421405, "eval_loss": 1.485946774482727, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.7693, "eval_samples_per_second": 48.286, "eval_steps_per_second": 24.143, "step": 120 }, { "epoch": 0.40468227424749165, "grad_norm": 6.28125, "learning_rate": 6.08843537414966e-06, "loss": 1.4437, "step": 121 }, { "epoch": 0.4080267558528428, "grad_norm": 5.84375, "learning_rate": 6.054421768707484e-06, "loss": 1.5335, "step": 122 }, { "epoch": 0.411371237458194, "grad_norm": 5.625, "learning_rate": 6.020408163265307e-06, "loss": 1.4071, "step": 123 }, { "epoch": 0.41471571906354515, "grad_norm": 5.78125, "learning_rate": 5.98639455782313e-06, "loss": 1.5001, "step": 124 }, { "epoch": 0.4180602006688963, "grad_norm": 5.46875, "learning_rate": 5.9523809523809525e-06, "loss": 1.4856, "step": 125 }, { "epoch": 0.4214046822742475, "grad_norm": 6.125, "learning_rate": 5.918367346938776e-06, "loss": 1.4836, "step": 126 }, { "epoch": 0.42474916387959866, "grad_norm": 6.3125, "learning_rate": 5.884353741496599e-06, "loss": 1.5135, "step": 127 }, { "epoch": 0.4280936454849498, "grad_norm": 5.6875, "learning_rate": 5.850340136054422e-06, "loss": 1.3751, "step": 128 }, { "epoch": 0.431438127090301, "grad_norm": 5.625, "learning_rate": 5.816326530612246e-06, "loss": 1.3937, "step": 129 }, { "epoch": 0.43478260869565216, "grad_norm": 5.96875, "learning_rate": 5.782312925170068e-06, "loss": 1.5853, "step": 130 }, { "epoch": 0.43478260869565216, "eval_loss": 1.4821772575378418, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.8834, "eval_samples_per_second": 48.064, "eval_steps_per_second": 24.032, "step": 130 }, { "epoch": 0.43812709030100333, "grad_norm": 5.46875, "learning_rate": 5.748299319727891e-06, "loss": 1.38, "step": 131 }, { "epoch": 0.4414715719063545, "grad_norm": 6.0, "learning_rate": 5.7142857142857145e-06, "loss": 1.5402, "step": 132 }, { "epoch": 0.44481605351170567, "grad_norm": 6.25, "learning_rate": 5.680272108843538e-06, "loss": 1.4661, "step": 133 }, { "epoch": 0.44816053511705684, "grad_norm": 6.65625, "learning_rate": 5.646258503401361e-06, "loss": 1.4167, "step": 134 }, { "epoch": 0.451505016722408, "grad_norm": 5.375, "learning_rate": 5.6122448979591834e-06, "loss": 1.4334, "step": 135 }, { "epoch": 0.45484949832775917, "grad_norm": 5.71875, "learning_rate": 5.578231292517007e-06, "loss": 1.5172, "step": 136 }, { "epoch": 0.45819397993311034, "grad_norm": 5.625, "learning_rate": 5.54421768707483e-06, "loss": 1.3814, "step": 137 }, { "epoch": 0.46153846153846156, "grad_norm": 6.03125, "learning_rate": 5.510204081632653e-06, "loss": 1.3929, "step": 138 }, { "epoch": 0.46488294314381273, "grad_norm": 6.1875, "learning_rate": 5.476190476190477e-06, "loss": 1.4784, "step": 139 }, { "epoch": 0.4682274247491639, "grad_norm": 5.59375, "learning_rate": 5.442176870748301e-06, "loss": 1.3256, "step": 140 }, { "epoch": 0.4682274247491639, "eval_loss": 1.4767065048217773, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.5849, "eval_samples_per_second": 48.648, "eval_steps_per_second": 24.324, "step": 140 }, { "epoch": 0.47157190635451507, "grad_norm": 5.53125, "learning_rate": 5.408163265306123e-06, "loss": 1.4447, "step": 141 }, { "epoch": 0.47491638795986624, "grad_norm": 5.96875, "learning_rate": 5.374149659863946e-06, "loss": 1.495, "step": 142 }, { "epoch": 0.4782608695652174, "grad_norm": 6.78125, "learning_rate": 5.3401360544217695e-06, "loss": 1.415, "step": 143 }, { "epoch": 0.4816053511705686, "grad_norm": 7.21875, "learning_rate": 5.306122448979593e-06, "loss": 1.5169, "step": 144 }, { "epoch": 0.48494983277591974, "grad_norm": 6.34375, "learning_rate": 5.272108843537416e-06, "loss": 1.4935, "step": 145 }, { "epoch": 0.4882943143812709, "grad_norm": 5.625, "learning_rate": 5.2380952380952384e-06, "loss": 1.3814, "step": 146 }, { "epoch": 0.4916387959866221, "grad_norm": 6.0625, "learning_rate": 5.204081632653062e-06, "loss": 1.3691, "step": 147 }, { "epoch": 0.49498327759197325, "grad_norm": 5.6875, "learning_rate": 5.170068027210885e-06, "loss": 1.441, "step": 148 }, { "epoch": 0.4983277591973244, "grad_norm": 6.5625, "learning_rate": 5.136054421768708e-06, "loss": 1.5852, "step": 149 }, { "epoch": 0.5016722408026756, "grad_norm": 6.15625, "learning_rate": 5.1020408163265315e-06, "loss": 1.501, "step": 150 }, { "epoch": 0.5016722408026756, "eval_loss": 1.4691280126571655, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.5494, "eval_samples_per_second": 48.718, "eval_steps_per_second": 24.359, "step": 150 }, { "epoch": 0.5050167224080268, "grad_norm": 5.59375, "learning_rate": 5.068027210884354e-06, "loss": 1.5432, "step": 151 }, { "epoch": 0.5083612040133779, "grad_norm": 5.625, "learning_rate": 5.034013605442177e-06, "loss": 1.3688, "step": 152 }, { "epoch": 0.5117056856187291, "grad_norm": 5.6875, "learning_rate": 5e-06, "loss": 1.554, "step": 153 }, { "epoch": 0.5150501672240803, "grad_norm": 5.375, "learning_rate": 4.965986394557824e-06, "loss": 1.4896, "step": 154 }, { "epoch": 0.5183946488294314, "grad_norm": 5.875, "learning_rate": 4.931972789115647e-06, "loss": 1.6149, "step": 155 }, { "epoch": 0.5217391304347826, "grad_norm": 5.78125, "learning_rate": 4.897959183673469e-06, "loss": 1.401, "step": 156 }, { "epoch": 0.5250836120401338, "grad_norm": 6.71875, "learning_rate": 4.863945578231293e-06, "loss": 1.4292, "step": 157 }, { "epoch": 0.5284280936454849, "grad_norm": 5.59375, "learning_rate": 4.829931972789116e-06, "loss": 1.5169, "step": 158 }, { "epoch": 0.5317725752508361, "grad_norm": 5.9375, "learning_rate": 4.795918367346939e-06, "loss": 1.5154, "step": 159 }, { "epoch": 0.5351170568561873, "grad_norm": 5.65625, "learning_rate": 4.761904761904762e-06, "loss": 1.378, "step": 160 }, { "epoch": 0.5351170568561873, "eval_loss": 1.4665558338165283, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.6412, "eval_samples_per_second": 48.537, "eval_steps_per_second": 24.268, "step": 160 }, { "epoch": 0.5384615384615384, "grad_norm": 5.59375, "learning_rate": 4.727891156462586e-06, "loss": 1.4773, "step": 161 }, { "epoch": 0.5418060200668896, "grad_norm": 5.0625, "learning_rate": 4.693877551020409e-06, "loss": 1.3758, "step": 162 }, { "epoch": 0.5451505016722408, "grad_norm": 5.25, "learning_rate": 4.659863945578232e-06, "loss": 1.4291, "step": 163 }, { "epoch": 0.5484949832775919, "grad_norm": 5.5, "learning_rate": 4.6258503401360546e-06, "loss": 1.5342, "step": 164 }, { "epoch": 0.5518394648829431, "grad_norm": 5.75, "learning_rate": 4.591836734693878e-06, "loss": 1.3828, "step": 165 }, { "epoch": 0.5551839464882943, "grad_norm": 5.5625, "learning_rate": 4.557823129251701e-06, "loss": 1.5141, "step": 166 }, { "epoch": 0.5585284280936454, "grad_norm": 5.625, "learning_rate": 4.523809523809524e-06, "loss": 1.4488, "step": 167 }, { "epoch": 0.5618729096989966, "grad_norm": 5.71875, "learning_rate": 4.489795918367348e-06, "loss": 1.4004, "step": 168 }, { "epoch": 0.5652173913043478, "grad_norm": 5.9375, "learning_rate": 4.45578231292517e-06, "loss": 1.5899, "step": 169 }, { "epoch": 0.568561872909699, "grad_norm": 6.125, "learning_rate": 4.421768707482993e-06, "loss": 1.4491, "step": 170 }, { "epoch": 0.568561872909699, "eval_loss": 1.4608893394470215, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.9425, "eval_samples_per_second": 47.95, "eval_steps_per_second": 23.975, "step": 170 }, { "epoch": 0.5719063545150501, "grad_norm": 5.65625, "learning_rate": 4.3877551020408165e-06, "loss": 1.3881, "step": 171 }, { "epoch": 0.5752508361204013, "grad_norm": 5.6875, "learning_rate": 4.35374149659864e-06, "loss": 1.4935, "step": 172 }, { "epoch": 0.5785953177257525, "grad_norm": 5.53125, "learning_rate": 4.319727891156463e-06, "loss": 1.5165, "step": 173 }, { "epoch": 0.5819397993311036, "grad_norm": 5.03125, "learning_rate": 4.2857142857142855e-06, "loss": 1.4202, "step": 174 }, { "epoch": 0.5852842809364549, "grad_norm": 5.78125, "learning_rate": 4.251700680272109e-06, "loss": 1.347, "step": 175 }, { "epoch": 0.5886287625418061, "grad_norm": 5.96875, "learning_rate": 4.217687074829933e-06, "loss": 1.579, "step": 176 }, { "epoch": 0.5919732441471572, "grad_norm": 5.59375, "learning_rate": 4.183673469387755e-06, "loss": 1.5073, "step": 177 }, { "epoch": 0.5953177257525084, "grad_norm": 5.5625, "learning_rate": 4.1496598639455785e-06, "loss": 1.3991, "step": 178 }, { "epoch": 0.5986622073578596, "grad_norm": 5.8125, "learning_rate": 4.115646258503402e-06, "loss": 1.3898, "step": 179 }, { "epoch": 0.6020066889632107, "grad_norm": 5.84375, "learning_rate": 4.081632653061225e-06, "loss": 1.4873, "step": 180 }, { "epoch": 0.6020066889632107, "eval_loss": 1.4584482908248901, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.9123, "eval_samples_per_second": 48.008, "eval_steps_per_second": 24.004, "step": 180 }, { "epoch": 0.6053511705685619, "grad_norm": 5.46875, "learning_rate": 4.047619047619048e-06, "loss": 1.332, "step": 181 }, { "epoch": 0.6086956521739131, "grad_norm": 5.8125, "learning_rate": 4.013605442176871e-06, "loss": 1.5779, "step": 182 }, { "epoch": 0.6120401337792643, "grad_norm": 5.1875, "learning_rate": 3.979591836734694e-06, "loss": 1.405, "step": 183 }, { "epoch": 0.6153846153846154, "grad_norm": 6.65625, "learning_rate": 3.945578231292517e-06, "loss": 1.5077, "step": 184 }, { "epoch": 0.6187290969899666, "grad_norm": 6.03125, "learning_rate": 3.9115646258503405e-06, "loss": 1.4785, "step": 185 }, { "epoch": 0.6220735785953178, "grad_norm": 5.5625, "learning_rate": 3.877551020408164e-06, "loss": 1.4061, "step": 186 }, { "epoch": 0.6254180602006689, "grad_norm": 5.25, "learning_rate": 3.843537414965986e-06, "loss": 1.3562, "step": 187 }, { "epoch": 0.6287625418060201, "grad_norm": 5.5625, "learning_rate": 3.80952380952381e-06, "loss": 1.4015, "step": 188 }, { "epoch": 0.6321070234113713, "grad_norm": 5.65625, "learning_rate": 3.7755102040816327e-06, "loss": 1.5079, "step": 189 }, { "epoch": 0.6354515050167224, "grad_norm": 5.375, "learning_rate": 3.7414965986394563e-06, "loss": 1.4518, "step": 190 }, { "epoch": 0.6354515050167224, "eval_loss": 1.4557547569274902, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.4224, "eval_samples_per_second": 48.971, "eval_steps_per_second": 24.486, "step": 190 }, { "epoch": 0.6387959866220736, "grad_norm": 6.0625, "learning_rate": 3.7074829931972796e-06, "loss": 1.4845, "step": 191 }, { "epoch": 0.6421404682274248, "grad_norm": 5.375, "learning_rate": 3.6734693877551024e-06, "loss": 1.4978, "step": 192 }, { "epoch": 0.6454849498327759, "grad_norm": 5.53125, "learning_rate": 3.6394557823129257e-06, "loss": 1.2786, "step": 193 }, { "epoch": 0.6488294314381271, "grad_norm": 5.9375, "learning_rate": 3.6054421768707485e-06, "loss": 1.3896, "step": 194 }, { "epoch": 0.6521739130434783, "grad_norm": 6.03125, "learning_rate": 3.5714285714285718e-06, "loss": 1.5092, "step": 195 }, { "epoch": 0.6555183946488294, "grad_norm": 5.5625, "learning_rate": 3.537414965986395e-06, "loss": 1.4844, "step": 196 }, { "epoch": 0.6588628762541806, "grad_norm": 5.71875, "learning_rate": 3.503401360544218e-06, "loss": 1.3297, "step": 197 }, { "epoch": 0.6622073578595318, "grad_norm": 5.96875, "learning_rate": 3.469387755102041e-06, "loss": 1.3805, "step": 198 }, { "epoch": 0.6655518394648829, "grad_norm": 5.9375, "learning_rate": 3.435374149659864e-06, "loss": 1.4935, "step": 199 }, { "epoch": 0.6688963210702341, "grad_norm": 5.84375, "learning_rate": 3.4013605442176872e-06, "loss": 1.3566, "step": 200 }, { "epoch": 0.6688963210702341, "eval_loss": 1.454195261001587, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.8656, "eval_samples_per_second": 48.099, "eval_steps_per_second": 24.049, "step": 200 }, { "epoch": 0.6722408026755853, "grad_norm": 5.96875, "learning_rate": 3.3673469387755105e-06, "loss": 1.4493, "step": 201 }, { "epoch": 0.6755852842809364, "grad_norm": 6.65625, "learning_rate": 3.3333333333333333e-06, "loss": 1.6351, "step": 202 }, { "epoch": 0.6789297658862876, "grad_norm": 6.40625, "learning_rate": 3.2993197278911566e-06, "loss": 1.4902, "step": 203 }, { "epoch": 0.6822742474916388, "grad_norm": 5.6875, "learning_rate": 3.2653061224489794e-06, "loss": 1.5224, "step": 204 }, { "epoch": 0.68561872909699, "grad_norm": 6.4375, "learning_rate": 3.231292517006803e-06, "loss": 1.4376, "step": 205 }, { "epoch": 0.6889632107023411, "grad_norm": 5.34375, "learning_rate": 3.1972789115646264e-06, "loss": 1.3701, "step": 206 }, { "epoch": 0.6923076923076923, "grad_norm": 6.3125, "learning_rate": 3.1632653061224496e-06, "loss": 1.5269, "step": 207 }, { "epoch": 0.6956521739130435, "grad_norm": 5.90625, "learning_rate": 3.1292517006802725e-06, "loss": 1.3714, "step": 208 }, { "epoch": 0.6989966555183946, "grad_norm": 5.46875, "learning_rate": 3.0952380952380957e-06, "loss": 1.3528, "step": 209 }, { "epoch": 0.7023411371237458, "grad_norm": 5.375, "learning_rate": 3.0612244897959185e-06, "loss": 1.3975, "step": 210 }, { "epoch": 0.7023411371237458, "eval_loss": 1.4497511386871338, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.0547, "eval_samples_per_second": 47.736, "eval_steps_per_second": 23.868, "step": 210 }, { "epoch": 0.705685618729097, "grad_norm": 6.21875, "learning_rate": 3.027210884353742e-06, "loss": 1.4095, "step": 211 }, { "epoch": 0.7090301003344481, "grad_norm": 5.4375, "learning_rate": 2.993197278911565e-06, "loss": 1.4812, "step": 212 }, { "epoch": 0.7123745819397993, "grad_norm": 5.46875, "learning_rate": 2.959183673469388e-06, "loss": 1.4957, "step": 213 }, { "epoch": 0.7157190635451505, "grad_norm": 5.5625, "learning_rate": 2.925170068027211e-06, "loss": 1.4469, "step": 214 }, { "epoch": 0.7190635451505016, "grad_norm": 6.09375, "learning_rate": 2.891156462585034e-06, "loss": 1.5594, "step": 215 }, { "epoch": 0.7224080267558528, "grad_norm": 6.09375, "learning_rate": 2.8571428571428573e-06, "loss": 1.5192, "step": 216 }, { "epoch": 0.725752508361204, "grad_norm": 5.5, "learning_rate": 2.8231292517006805e-06, "loss": 1.5233, "step": 217 }, { "epoch": 0.7290969899665551, "grad_norm": 5.84375, "learning_rate": 2.7891156462585034e-06, "loss": 1.3785, "step": 218 }, { "epoch": 0.7324414715719063, "grad_norm": 5.75, "learning_rate": 2.7551020408163266e-06, "loss": 1.5832, "step": 219 }, { "epoch": 0.7357859531772575, "grad_norm": 5.4375, "learning_rate": 2.7210884353741503e-06, "loss": 1.4804, "step": 220 }, { "epoch": 0.7357859531772575, "eval_loss": 1.449277639389038, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.0109, "eval_samples_per_second": 47.819, "eval_steps_per_second": 23.91, "step": 220 }, { "epoch": 0.7391304347826086, "grad_norm": 5.59375, "learning_rate": 2.687074829931973e-06, "loss": 1.4036, "step": 221 }, { "epoch": 0.7424749163879598, "grad_norm": 5.15625, "learning_rate": 2.6530612244897964e-06, "loss": 1.3211, "step": 222 }, { "epoch": 0.745819397993311, "grad_norm": 5.125, "learning_rate": 2.6190476190476192e-06, "loss": 1.3913, "step": 223 }, { "epoch": 0.7491638795986622, "grad_norm": 5.875, "learning_rate": 2.5850340136054425e-06, "loss": 1.604, "step": 224 }, { "epoch": 0.7525083612040134, "grad_norm": 5.46875, "learning_rate": 2.5510204081632657e-06, "loss": 1.4159, "step": 225 }, { "epoch": 0.7558528428093646, "grad_norm": 5.375, "learning_rate": 2.5170068027210886e-06, "loss": 1.4109, "step": 226 }, { "epoch": 0.7591973244147158, "grad_norm": 4.96875, "learning_rate": 2.482993197278912e-06, "loss": 1.347, "step": 227 }, { "epoch": 0.7625418060200669, "grad_norm": 6.15625, "learning_rate": 2.4489795918367347e-06, "loss": 1.3658, "step": 228 }, { "epoch": 0.7658862876254181, "grad_norm": 5.625, "learning_rate": 2.414965986394558e-06, "loss": 1.4617, "step": 229 }, { "epoch": 0.7692307692307693, "grad_norm": 4.875, "learning_rate": 2.380952380952381e-06, "loss": 1.3388, "step": 230 }, { "epoch": 0.7692307692307693, "eval_loss": 1.446601152420044, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.9763, "eval_samples_per_second": 47.885, "eval_steps_per_second": 23.943, "step": 230 }, { "epoch": 0.7725752508361204, "grad_norm": 5.28125, "learning_rate": 2.3469387755102044e-06, "loss": 1.3752, "step": 231 }, { "epoch": 0.7759197324414716, "grad_norm": 5.84375, "learning_rate": 2.3129251700680273e-06, "loss": 1.4201, "step": 232 }, { "epoch": 0.7792642140468228, "grad_norm": 5.8125, "learning_rate": 2.2789115646258505e-06, "loss": 1.4162, "step": 233 }, { "epoch": 0.782608695652174, "grad_norm": 5.4375, "learning_rate": 2.244897959183674e-06, "loss": 1.486, "step": 234 }, { "epoch": 0.7859531772575251, "grad_norm": 5.5625, "learning_rate": 2.2108843537414966e-06, "loss": 1.456, "step": 235 }, { "epoch": 0.7892976588628763, "grad_norm": 5.46875, "learning_rate": 2.17687074829932e-06, "loss": 1.4028, "step": 236 }, { "epoch": 0.7926421404682275, "grad_norm": 6.125, "learning_rate": 2.1428571428571427e-06, "loss": 1.4319, "step": 237 }, { "epoch": 0.7959866220735786, "grad_norm": 5.9375, "learning_rate": 2.1088435374149664e-06, "loss": 1.3655, "step": 238 }, { "epoch": 0.7993311036789298, "grad_norm": 6.15625, "learning_rate": 2.0748299319727892e-06, "loss": 1.4736, "step": 239 }, { "epoch": 0.802675585284281, "grad_norm": 5.8125, "learning_rate": 2.0408163265306125e-06, "loss": 1.5061, "step": 240 }, { "epoch": 0.802675585284281, "eval_loss": 1.4440027475357056, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.5962, "eval_samples_per_second": 46.726, "eval_steps_per_second": 23.363, "step": 240 }, { "epoch": 0.8060200668896321, "grad_norm": 5.90625, "learning_rate": 2.0068027210884353e-06, "loss": 1.4407, "step": 241 }, { "epoch": 0.8093645484949833, "grad_norm": 5.625, "learning_rate": 1.9727891156462586e-06, "loss": 1.4002, "step": 242 }, { "epoch": 0.8127090301003345, "grad_norm": 5.1875, "learning_rate": 1.938775510204082e-06, "loss": 1.4503, "step": 243 }, { "epoch": 0.8160535117056856, "grad_norm": 5.625, "learning_rate": 1.904761904761905e-06, "loss": 1.5007, "step": 244 }, { "epoch": 0.8193979933110368, "grad_norm": 5.78125, "learning_rate": 1.8707482993197282e-06, "loss": 1.6362, "step": 245 }, { "epoch": 0.822742474916388, "grad_norm": 5.375, "learning_rate": 1.8367346938775512e-06, "loss": 1.4041, "step": 246 }, { "epoch": 0.8260869565217391, "grad_norm": 5.875, "learning_rate": 1.8027210884353743e-06, "loss": 1.4981, "step": 247 }, { "epoch": 0.8294314381270903, "grad_norm": 6.0, "learning_rate": 1.7687074829931975e-06, "loss": 1.3748, "step": 248 }, { "epoch": 0.8327759197324415, "grad_norm": 6.0, "learning_rate": 1.7346938775510206e-06, "loss": 1.5447, "step": 249 }, { "epoch": 0.8361204013377926, "grad_norm": 5.6875, "learning_rate": 1.7006802721088436e-06, "loss": 1.4622, "step": 250 }, { "epoch": 0.8361204013377926, "eval_loss": 1.4428349733352661, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.5345, "eval_samples_per_second": 46.839, "eval_steps_per_second": 23.419, "step": 250 }, { "epoch": 0.8394648829431438, "grad_norm": 5.59375, "learning_rate": 1.6666666666666667e-06, "loss": 1.6372, "step": 251 }, { "epoch": 0.842809364548495, "grad_norm": 5.84375, "learning_rate": 1.6326530612244897e-06, "loss": 1.5288, "step": 252 }, { "epoch": 0.8461538461538461, "grad_norm": 5.78125, "learning_rate": 1.5986394557823132e-06, "loss": 1.5163, "step": 253 }, { "epoch": 0.8494983277591973, "grad_norm": 5.71875, "learning_rate": 1.5646258503401362e-06, "loss": 1.4535, "step": 254 }, { "epoch": 0.8528428093645485, "grad_norm": 5.5625, "learning_rate": 1.5306122448979593e-06, "loss": 1.3558, "step": 255 }, { "epoch": 0.8561872909698997, "grad_norm": 5.78125, "learning_rate": 1.4965986394557825e-06, "loss": 1.489, "step": 256 }, { "epoch": 0.8595317725752508, "grad_norm": 5.875, "learning_rate": 1.4625850340136056e-06, "loss": 1.4066, "step": 257 }, { "epoch": 0.862876254180602, "grad_norm": 5.3125, "learning_rate": 1.4285714285714286e-06, "loss": 1.4846, "step": 258 }, { "epoch": 0.8662207357859532, "grad_norm": 5.40625, "learning_rate": 1.3945578231292517e-06, "loss": 1.4276, "step": 259 }, { "epoch": 0.8695652173913043, "grad_norm": 5.65625, "learning_rate": 1.3605442176870751e-06, "loss": 1.409, "step": 260 }, { "epoch": 0.8695652173913043, "eval_loss": 1.4424058198928833, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.3885, "eval_samples_per_second": 47.108, "eval_steps_per_second": 23.554, "step": 260 }, { "epoch": 0.8729096989966555, "grad_norm": 5.375, "learning_rate": 1.3265306122448982e-06, "loss": 1.4611, "step": 261 }, { "epoch": 0.8762541806020067, "grad_norm": 5.84375, "learning_rate": 1.2925170068027212e-06, "loss": 1.5491, "step": 262 }, { "epoch": 0.8795986622073578, "grad_norm": 5.40625, "learning_rate": 1.2585034013605443e-06, "loss": 1.3317, "step": 263 }, { "epoch": 0.882943143812709, "grad_norm": 5.71875, "learning_rate": 1.2244897959183673e-06, "loss": 1.5078, "step": 264 }, { "epoch": 0.8862876254180602, "grad_norm": 5.78125, "learning_rate": 1.1904761904761906e-06, "loss": 1.3606, "step": 265 }, { "epoch": 0.8896321070234113, "grad_norm": 5.65625, "learning_rate": 1.1564625850340136e-06, "loss": 1.3629, "step": 266 }, { "epoch": 0.8929765886287625, "grad_norm": 5.6875, "learning_rate": 1.122448979591837e-06, "loss": 1.5987, "step": 267 }, { "epoch": 0.8963210702341137, "grad_norm": 5.21875, "learning_rate": 1.08843537414966e-06, "loss": 1.474, "step": 268 }, { "epoch": 0.8996655518394648, "grad_norm": 5.875, "learning_rate": 1.0544217687074832e-06, "loss": 1.4959, "step": 269 }, { "epoch": 0.903010033444816, "grad_norm": 5.34375, "learning_rate": 1.0204081632653063e-06, "loss": 1.4856, "step": 270 }, { "epoch": 0.903010033444816, "eval_loss": 1.440917730331421, "eval_model_preparation_time": 0.0182, "eval_runtime": 25.1531, "eval_samples_per_second": 47.549, "eval_steps_per_second": 23.774, "step": 270 }, { "epoch": 0.9063545150501672, "grad_norm": 5.375, "learning_rate": 9.863945578231293e-07, "loss": 1.3753, "step": 271 }, { "epoch": 0.9096989966555183, "grad_norm": 5.4375, "learning_rate": 9.523809523809525e-07, "loss": 1.4056, "step": 272 }, { "epoch": 0.9130434782608695, "grad_norm": 5.46875, "learning_rate": 9.183673469387756e-07, "loss": 1.4596, "step": 273 }, { "epoch": 0.9163879598662207, "grad_norm": 5.03125, "learning_rate": 8.843537414965988e-07, "loss": 1.3337, "step": 274 }, { "epoch": 0.919732441471572, "grad_norm": 5.28125, "learning_rate": 8.503401360544218e-07, "loss": 1.4107, "step": 275 }, { "epoch": 0.9230769230769231, "grad_norm": 5.21875, "learning_rate": 8.163265306122449e-07, "loss": 1.4098, "step": 276 }, { "epoch": 0.9264214046822743, "grad_norm": 5.4375, "learning_rate": 7.823129251700681e-07, "loss": 1.4803, "step": 277 }, { "epoch": 0.9297658862876255, "grad_norm": 5.3125, "learning_rate": 7.482993197278913e-07, "loss": 1.4627, "step": 278 }, { "epoch": 0.9331103678929766, "grad_norm": 5.21875, "learning_rate": 7.142857142857143e-07, "loss": 1.4215, "step": 279 }, { "epoch": 0.9364548494983278, "grad_norm": 6.03125, "learning_rate": 6.802721088435376e-07, "loss": 1.3524, "step": 280 }, { "epoch": 0.9364548494983278, "eval_loss": 1.439655065536499, "eval_model_preparation_time": 0.0182, "eval_runtime": 26.5568, "eval_samples_per_second": 45.036, "eval_steps_per_second": 22.518, "step": 280 }, { "epoch": 0.939799331103679, "grad_norm": 5.625, "learning_rate": 6.462585034013606e-07, "loss": 1.5138, "step": 281 }, { "epoch": 0.9431438127090301, "grad_norm": 5.625, "learning_rate": 6.122448979591837e-07, "loss": 1.4535, "step": 282 }, { "epoch": 0.9464882943143813, "grad_norm": 5.53125, "learning_rate": 5.782312925170068e-07, "loss": 1.6195, "step": 283 }, { "epoch": 0.9498327759197325, "grad_norm": 5.34375, "learning_rate": 5.4421768707483e-07, "loss": 1.4418, "step": 284 }, { "epoch": 0.9531772575250836, "grad_norm": 5.25, "learning_rate": 5.102040816326531e-07, "loss": 1.3069, "step": 285 }, { "epoch": 0.9565217391304348, "grad_norm": 5.4375, "learning_rate": 4.7619047619047623e-07, "loss": 1.2933, "step": 286 }, { "epoch": 0.959866220735786, "grad_norm": 5.53125, "learning_rate": 4.421768707482994e-07, "loss": 1.4555, "step": 287 }, { "epoch": 0.9632107023411371, "grad_norm": 5.25, "learning_rate": 4.0816326530612243e-07, "loss": 1.3118, "step": 288 }, { "epoch": 0.9665551839464883, "grad_norm": 5.8125, "learning_rate": 3.7414965986394563e-07, "loss": 1.4598, "step": 289 }, { "epoch": 0.9698996655518395, "grad_norm": 5.4375, "learning_rate": 3.401360544217688e-07, "loss": 1.4002, "step": 290 }, { "epoch": 0.9698996655518395, "eval_loss": 1.4394288063049316, "eval_model_preparation_time": 0.0182, "eval_runtime": 24.6446, "eval_samples_per_second": 48.53, "eval_steps_per_second": 24.265, "step": 290 }, { "epoch": 0.9732441471571907, "grad_norm": 5.0625, "learning_rate": 3.0612244897959183e-07, "loss": 1.3755, "step": 291 }, { "epoch": 0.9765886287625418, "grad_norm": 5.28125, "learning_rate": 2.72108843537415e-07, "loss": 1.4194, "step": 292 }, { "epoch": 0.979933110367893, "grad_norm": 6.0625, "learning_rate": 2.3809523809523811e-07, "loss": 1.4592, "step": 293 }, { "epoch": 0.9832775919732442, "grad_norm": 5.46875, "learning_rate": 2.0408163265306121e-07, "loss": 1.5226, "step": 294 }, { "epoch": 0.9866220735785953, "grad_norm": 5.46875, "learning_rate": 1.700680272108844e-07, "loss": 1.3764, "step": 295 }, { "epoch": 0.9899665551839465, "grad_norm": 5.46875, "learning_rate": 1.360544217687075e-07, "loss": 1.3846, "step": 296 }, { "epoch": 0.9933110367892977, "grad_norm": 5.375, "learning_rate": 1.0204081632653061e-07, "loss": 1.4552, "step": 297 }, { "epoch": 0.9966555183946488, "grad_norm": 5.75, "learning_rate": 6.802721088435375e-08, "loss": 1.3043, "step": 298 }, { "epoch": 1.0, "grad_norm": 5.96875, "learning_rate": 3.4013605442176873e-08, "loss": 1.4929, "step": 299 } ], "logging_steps": 1, "max_steps": 299, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3124411636793344e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }