2389 lines
55 KiB
JSON
2389 lines
55 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 10,
|
|
"global_step": 299,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0033444816053511705,
|
|
"grad_norm": 25.75,
|
|
"learning_rate": 0.0,
|
|
"loss": 3.0181,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.006688963210702341,
|
|
"grad_norm": 27.75,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 3.1574,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.010033444816053512,
|
|
"grad_norm": 29.0,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 2.9793,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.013377926421404682,
|
|
"grad_norm": 26.625,
|
|
"learning_rate": 6e-06,
|
|
"loss": 2.8721,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.016722408026755852,
|
|
"grad_norm": 18.5,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 2.6137,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.020066889632107024,
|
|
"grad_norm": 10.8125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 2.4216,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.023411371237458192,
|
|
"grad_norm": 9.625,
|
|
"learning_rate": 9.965986394557824e-06,
|
|
"loss": 2.4293,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.026755852842809364,
|
|
"grad_norm": 7.8125,
|
|
"learning_rate": 9.931972789115647e-06,
|
|
"loss": 2.215,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.030100334448160536,
|
|
"grad_norm": 7.84375,
|
|
"learning_rate": 9.89795918367347e-06,
|
|
"loss": 2.1725,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.033444816053511704,
|
|
"grad_norm": 8.0625,
|
|
"learning_rate": 9.863945578231294e-06,
|
|
"loss": 2.1844,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.033444816053511704,
|
|
"eval_loss": 2.0784454345703125,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.1202,
|
|
"eval_samples_per_second": 47.611,
|
|
"eval_steps_per_second": 23.806,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.03678929765886288,
|
|
"grad_norm": 7.71875,
|
|
"learning_rate": 9.829931972789115e-06,
|
|
"loss": 2.1589,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.04013377926421405,
|
|
"grad_norm": 7.125,
|
|
"learning_rate": 9.795918367346939e-06,
|
|
"loss": 2.0039,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.043478260869565216,
|
|
"grad_norm": 11.375,
|
|
"learning_rate": 9.761904761904762e-06,
|
|
"loss": 1.9245,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.046822742474916385,
|
|
"grad_norm": 7.6875,
|
|
"learning_rate": 9.727891156462585e-06,
|
|
"loss": 2.0036,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.05016722408026756,
|
|
"grad_norm": 6.5625,
|
|
"learning_rate": 9.693877551020408e-06,
|
|
"loss": 1.8739,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.05351170568561873,
|
|
"grad_norm": 7.625,
|
|
"learning_rate": 9.659863945578232e-06,
|
|
"loss": 1.7739,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.056856187290969896,
|
|
"grad_norm": 6.59375,
|
|
"learning_rate": 9.625850340136055e-06,
|
|
"loss": 1.8338,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.06020066889632107,
|
|
"grad_norm": 6.8125,
|
|
"learning_rate": 9.591836734693878e-06,
|
|
"loss": 1.8526,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.06354515050167224,
|
|
"grad_norm": 6.59375,
|
|
"learning_rate": 9.557823129251701e-06,
|
|
"loss": 1.864,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.06688963210702341,
|
|
"grad_norm": 7.28125,
|
|
"learning_rate": 9.523809523809525e-06,
|
|
"loss": 1.8338,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.06688963210702341,
|
|
"eval_loss": 1.762088418006897,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.9271,
|
|
"eval_samples_per_second": 46.129,
|
|
"eval_steps_per_second": 23.065,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.07023411371237458,
|
|
"grad_norm": 7.1875,
|
|
"learning_rate": 9.489795918367348e-06,
|
|
"loss": 1.8749,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.07357859531772576,
|
|
"grad_norm": 7.9375,
|
|
"learning_rate": 9.455782312925171e-06,
|
|
"loss": 1.8266,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.07692307692307693,
|
|
"grad_norm": 9.75,
|
|
"learning_rate": 9.421768707482995e-06,
|
|
"loss": 1.7542,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.0802675585284281,
|
|
"grad_norm": 6.84375,
|
|
"learning_rate": 9.387755102040818e-06,
|
|
"loss": 1.8181,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.08361204013377926,
|
|
"grad_norm": 6.625,
|
|
"learning_rate": 9.353741496598641e-06,
|
|
"loss": 1.6258,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.08695652173913043,
|
|
"grad_norm": 6.53125,
|
|
"learning_rate": 9.319727891156464e-06,
|
|
"loss": 1.9209,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.0903010033444816,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 9.285714285714288e-06,
|
|
"loss": 1.5183,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.09364548494983277,
|
|
"grad_norm": 7.125,
|
|
"learning_rate": 9.251700680272109e-06,
|
|
"loss": 1.6856,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.09698996655518395,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 9.217687074829932e-06,
|
|
"loss": 1.5802,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.10033444816053512,
|
|
"grad_norm": 7.0625,
|
|
"learning_rate": 9.183673469387756e-06,
|
|
"loss": 1.8024,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.10033444816053512,
|
|
"eval_loss": 1.654666543006897,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.0658,
|
|
"eval_samples_per_second": 47.714,
|
|
"eval_steps_per_second": 23.857,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.10367892976588629,
|
|
"grad_norm": 6.40625,
|
|
"learning_rate": 9.149659863945579e-06,
|
|
"loss": 1.6294,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.10702341137123746,
|
|
"grad_norm": 6.8125,
|
|
"learning_rate": 9.115646258503402e-06,
|
|
"loss": 1.481,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.11036789297658862,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 9.081632653061225e-06,
|
|
"loss": 1.6586,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.11371237458193979,
|
|
"grad_norm": 6.21875,
|
|
"learning_rate": 9.047619047619049e-06,
|
|
"loss": 1.5246,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.11705685618729098,
|
|
"grad_norm": 7.71875,
|
|
"learning_rate": 9.013605442176872e-06,
|
|
"loss": 1.6757,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.12040133779264214,
|
|
"grad_norm": 6.5625,
|
|
"learning_rate": 8.979591836734695e-06,
|
|
"loss": 1.6568,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.12374581939799331,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 8.945578231292518e-06,
|
|
"loss": 1.6195,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.12709030100334448,
|
|
"grad_norm": 7.3125,
|
|
"learning_rate": 8.91156462585034e-06,
|
|
"loss": 1.6203,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.13043478260869565,
|
|
"grad_norm": 6.71875,
|
|
"learning_rate": 8.877551020408163e-06,
|
|
"loss": 1.5925,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.13377926421404682,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 8.843537414965987e-06,
|
|
"loss": 1.6298,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.13377926421404682,
|
|
"eval_loss": 1.6065762042999268,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 26.7591,
|
|
"eval_samples_per_second": 44.695,
|
|
"eval_steps_per_second": 22.348,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.13712374581939799,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 8.80952380952381e-06,
|
|
"loss": 1.5707,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.14046822742474915,
|
|
"grad_norm": 6.65625,
|
|
"learning_rate": 8.775510204081633e-06,
|
|
"loss": 1.5502,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.14381270903010032,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 8.741496598639456e-06,
|
|
"loss": 1.5849,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.14715719063545152,
|
|
"grad_norm": 6.21875,
|
|
"learning_rate": 8.70748299319728e-06,
|
|
"loss": 1.595,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.1505016722408027,
|
|
"grad_norm": 7.09375,
|
|
"learning_rate": 8.673469387755103e-06,
|
|
"loss": 1.6047,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.15384615384615385,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 8.639455782312926e-06,
|
|
"loss": 1.4065,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.15719063545150502,
|
|
"grad_norm": 6.90625,
|
|
"learning_rate": 8.60544217687075e-06,
|
|
"loss": 1.6029,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.1605351170568562,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 8.571428571428571e-06,
|
|
"loss": 1.5176,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.16387959866220736,
|
|
"grad_norm": 6.1875,
|
|
"learning_rate": 8.537414965986394e-06,
|
|
"loss": 1.5355,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.16722408026755853,
|
|
"grad_norm": 6.28125,
|
|
"learning_rate": 8.503401360544217e-06,
|
|
"loss": 1.5509,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.16722408026755853,
|
|
"eval_loss": 1.5824450254440308,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.6606,
|
|
"eval_samples_per_second": 48.498,
|
|
"eval_steps_per_second": 24.249,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.1705685618729097,
|
|
"grad_norm": 7.0,
|
|
"learning_rate": 8.469387755102042e-06,
|
|
"loss": 1.6003,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.17391304347826086,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 8.435374149659866e-06,
|
|
"loss": 1.4908,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.17725752508361203,
|
|
"grad_norm": 6.84375,
|
|
"learning_rate": 8.401360544217689e-06,
|
|
"loss": 1.6432,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.1806020066889632,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 8.36734693877551e-06,
|
|
"loss": 1.4956,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.18394648829431437,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 8.333333333333334e-06,
|
|
"loss": 1.459,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.18729096989966554,
|
|
"grad_norm": 7.21875,
|
|
"learning_rate": 8.299319727891157e-06,
|
|
"loss": 1.6542,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.19063545150501673,
|
|
"grad_norm": 6.75,
|
|
"learning_rate": 8.26530612244898e-06,
|
|
"loss": 1.5814,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.1939799331103679,
|
|
"grad_norm": 7.59375,
|
|
"learning_rate": 8.231292517006804e-06,
|
|
"loss": 1.5807,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.19732441471571907,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 8.197278911564627e-06,
|
|
"loss": 1.5321,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.20066889632107024,
|
|
"grad_norm": 6.46875,
|
|
"learning_rate": 8.16326530612245e-06,
|
|
"loss": 1.6237,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.20066889632107024,
|
|
"eval_loss": 1.5487959384918213,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.9431,
|
|
"eval_samples_per_second": 47.949,
|
|
"eval_steps_per_second": 23.975,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.2040133779264214,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 8.129251700680273e-06,
|
|
"loss": 1.6258,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.20735785953177258,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 8.095238095238097e-06,
|
|
"loss": 1.5074,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.21070234113712374,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 8.06122448979592e-06,
|
|
"loss": 1.4395,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.2140468227424749,
|
|
"grad_norm": 6.125,
|
|
"learning_rate": 8.027210884353741e-06,
|
|
"loss": 1.5095,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.21739130434782608,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 7.993197278911565e-06,
|
|
"loss": 1.477,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.22073578595317725,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 7.959183673469388e-06,
|
|
"loss": 1.3592,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.22408026755852842,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 7.925170068027211e-06,
|
|
"loss": 1.535,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.22742474916387959,
|
|
"grad_norm": 6.4375,
|
|
"learning_rate": 7.891156462585034e-06,
|
|
"loss": 1.5033,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.23076923076923078,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 7.857142857142858e-06,
|
|
"loss": 1.4712,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.23411371237458195,
|
|
"grad_norm": 6.71875,
|
|
"learning_rate": 7.823129251700681e-06,
|
|
"loss": 1.543,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.23411371237458195,
|
|
"eval_loss": 1.5336378812789917,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.7209,
|
|
"eval_samples_per_second": 48.38,
|
|
"eval_steps_per_second": 24.19,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.23745819397993312,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 7.789115646258504e-06,
|
|
"loss": 1.4759,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.2408026755852843,
|
|
"grad_norm": 6.59375,
|
|
"learning_rate": 7.755102040816327e-06,
|
|
"loss": 1.7235,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.24414715719063546,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 7.72108843537415e-06,
|
|
"loss": 1.5825,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.24749163879598662,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 7.687074829931972e-06,
|
|
"loss": 1.5194,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.2508361204013378,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 7.653061224489796e-06,
|
|
"loss": 1.5582,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.25418060200668896,
|
|
"grad_norm": 6.21875,
|
|
"learning_rate": 7.61904761904762e-06,
|
|
"loss": 1.5348,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.25752508361204013,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 7.585034013605442e-06,
|
|
"loss": 1.6259,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.2608695652173913,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 7.551020408163265e-06,
|
|
"loss": 1.5334,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.26421404682274247,
|
|
"grad_norm": 6.78125,
|
|
"learning_rate": 7.5170068027210886e-06,
|
|
"loss": 1.6789,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.26755852842809363,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 7.482993197278913e-06,
|
|
"loss": 1.4375,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.26755852842809363,
|
|
"eval_loss": 1.5223881006240845,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.6231,
|
|
"eval_samples_per_second": 48.572,
|
|
"eval_steps_per_second": 24.286,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.2709030100334448,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 7.448979591836736e-06,
|
|
"loss": 1.4604,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.27424749163879597,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 7.414965986394559e-06,
|
|
"loss": 1.4198,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.27759197324414714,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 7.380952380952382e-06,
|
|
"loss": 1.5217,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.2809364548494983,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 7.346938775510205e-06,
|
|
"loss": 1.5475,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.2842809364548495,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 7.312925170068028e-06,
|
|
"loss": 1.4847,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.28762541806020064,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 7.278911564625851e-06,
|
|
"loss": 1.4862,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.2909698996655518,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 7.244897959183675e-06,
|
|
"loss": 1.6432,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.29431438127090304,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 7.210884353741497e-06,
|
|
"loss": 1.315,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.2976588628762542,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 7.17687074829932e-06,
|
|
"loss": 1.5436,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.3010033444816054,
|
|
"grad_norm": 6.21875,
|
|
"learning_rate": 7.1428571428571436e-06,
|
|
"loss": 1.5598,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.3010033444816054,
|
|
"eval_loss": 1.5097979307174683,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.8836,
|
|
"eval_samples_per_second": 48.064,
|
|
"eval_steps_per_second": 24.032,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.30434782608695654,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 7.108843537414967e-06,
|
|
"loss": 1.3633,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.3076923076923077,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 7.07482993197279e-06,
|
|
"loss": 1.5061,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.3110367892976589,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 7.0408163265306125e-06,
|
|
"loss": 1.4504,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.31438127090301005,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 7.006802721088436e-06,
|
|
"loss": 1.379,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.3177257525083612,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 6.972789115646259e-06,
|
|
"loss": 1.4414,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.3210702341137124,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 6.938775510204082e-06,
|
|
"loss": 1.4212,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.32441471571906355,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 6.9047619047619055e-06,
|
|
"loss": 1.4361,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.3277591973244147,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 6.870748299319728e-06,
|
|
"loss": 1.4397,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.3311036789297659,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 6.836734693877551e-06,
|
|
"loss": 1.3729,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.33444816053511706,
|
|
"grad_norm": 6.1875,
|
|
"learning_rate": 6.8027210884353745e-06,
|
|
"loss": 1.5183,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.33444816053511706,
|
|
"eval_loss": 1.501234769821167,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.85,
|
|
"eval_samples_per_second": 48.129,
|
|
"eval_steps_per_second": 24.064,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.3377926421404682,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 6.768707482993198e-06,
|
|
"loss": 1.4924,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.3411371237458194,
|
|
"grad_norm": 6.1875,
|
|
"learning_rate": 6.734693877551021e-06,
|
|
"loss": 1.4877,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.34448160535117056,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 6.700680272108843e-06,
|
|
"loss": 1.461,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.34782608695652173,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 1.4635,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.3511705685618729,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 6.63265306122449e-06,
|
|
"loss": 1.4172,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.35451505016722407,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 6.598639455782313e-06,
|
|
"loss": 1.4521,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.35785953177257523,
|
|
"grad_norm": 6.4375,
|
|
"learning_rate": 6.5646258503401364e-06,
|
|
"loss": 1.6308,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.3612040133779264,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 6.530612244897959e-06,
|
|
"loss": 1.5095,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.36454849498327757,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 6.496598639455784e-06,
|
|
"loss": 1.5521,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.36789297658862874,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 6.462585034013606e-06,
|
|
"loss": 1.5551,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.36789297658862874,
|
|
"eval_loss": 1.4953867197036743,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.8035,
|
|
"eval_samples_per_second": 48.219,
|
|
"eval_steps_per_second": 24.109,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.3712374581939799,
|
|
"grad_norm": 6.59375,
|
|
"learning_rate": 6.4285714285714295e-06,
|
|
"loss": 1.5105,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.3745819397993311,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 6.394557823129253e-06,
|
|
"loss": 1.4385,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.3779264214046823,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 6.360544217687076e-06,
|
|
"loss": 1.591,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.38127090301003347,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 6.326530612244899e-06,
|
|
"loss": 1.4995,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.38461538461538464,
|
|
"grad_norm": 5.28125,
|
|
"learning_rate": 6.292517006802722e-06,
|
|
"loss": 1.5145,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.3879598662207358,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 6.258503401360545e-06,
|
|
"loss": 1.4962,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.391304347826087,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 6.224489795918368e-06,
|
|
"loss": 1.4014,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.39464882943143814,
|
|
"grad_norm": 6.59375,
|
|
"learning_rate": 6.1904761904761914e-06,
|
|
"loss": 1.4566,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.3979933110367893,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 6.156462585034015e-06,
|
|
"loss": 1.6276,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.4013377926421405,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 6.122448979591837e-06,
|
|
"loss": 1.4153,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.4013377926421405,
|
|
"eval_loss": 1.485946774482727,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.7693,
|
|
"eval_samples_per_second": 48.286,
|
|
"eval_steps_per_second": 24.143,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.40468227424749165,
|
|
"grad_norm": 6.28125,
|
|
"learning_rate": 6.08843537414966e-06,
|
|
"loss": 1.4437,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.4080267558528428,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 6.054421768707484e-06,
|
|
"loss": 1.5335,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.411371237458194,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 6.020408163265307e-06,
|
|
"loss": 1.4071,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.41471571906354515,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 5.98639455782313e-06,
|
|
"loss": 1.5001,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.4180602006688963,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 5.9523809523809525e-06,
|
|
"loss": 1.4856,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.4214046822742475,
|
|
"grad_norm": 6.125,
|
|
"learning_rate": 5.918367346938776e-06,
|
|
"loss": 1.4836,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.42474916387959866,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 5.884353741496599e-06,
|
|
"loss": 1.5135,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.4280936454849498,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 5.850340136054422e-06,
|
|
"loss": 1.3751,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.431438127090301,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 5.816326530612246e-06,
|
|
"loss": 1.3937,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.43478260869565216,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 5.782312925170068e-06,
|
|
"loss": 1.5853,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.43478260869565216,
|
|
"eval_loss": 1.4821772575378418,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.8834,
|
|
"eval_samples_per_second": 48.064,
|
|
"eval_steps_per_second": 24.032,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.43812709030100333,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 5.748299319727891e-06,
|
|
"loss": 1.38,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.4414715719063545,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": 1.5402,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.44481605351170567,
|
|
"grad_norm": 6.25,
|
|
"learning_rate": 5.680272108843538e-06,
|
|
"loss": 1.4661,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.44816053511705684,
|
|
"grad_norm": 6.65625,
|
|
"learning_rate": 5.646258503401361e-06,
|
|
"loss": 1.4167,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.451505016722408,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 5.6122448979591834e-06,
|
|
"loss": 1.4334,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.45484949832775917,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 5.578231292517007e-06,
|
|
"loss": 1.5172,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.45819397993311034,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 5.54421768707483e-06,
|
|
"loss": 1.3814,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.46153846153846156,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 5.510204081632653e-06,
|
|
"loss": 1.3929,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.46488294314381273,
|
|
"grad_norm": 6.1875,
|
|
"learning_rate": 5.476190476190477e-06,
|
|
"loss": 1.4784,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.4682274247491639,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 5.442176870748301e-06,
|
|
"loss": 1.3256,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.4682274247491639,
|
|
"eval_loss": 1.4767065048217773,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.5849,
|
|
"eval_samples_per_second": 48.648,
|
|
"eval_steps_per_second": 24.324,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.47157190635451507,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 5.408163265306123e-06,
|
|
"loss": 1.4447,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.47491638795986624,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 5.374149659863946e-06,
|
|
"loss": 1.495,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.4782608695652174,
|
|
"grad_norm": 6.78125,
|
|
"learning_rate": 5.3401360544217695e-06,
|
|
"loss": 1.415,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.4816053511705686,
|
|
"grad_norm": 7.21875,
|
|
"learning_rate": 5.306122448979593e-06,
|
|
"loss": 1.5169,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.48494983277591974,
|
|
"grad_norm": 6.34375,
|
|
"learning_rate": 5.272108843537416e-06,
|
|
"loss": 1.4935,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.4882943143812709,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 5.2380952380952384e-06,
|
|
"loss": 1.3814,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.4916387959866221,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 5.204081632653062e-06,
|
|
"loss": 1.3691,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.49498327759197325,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 5.170068027210885e-06,
|
|
"loss": 1.441,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.4983277591973244,
|
|
"grad_norm": 6.5625,
|
|
"learning_rate": 5.136054421768708e-06,
|
|
"loss": 1.5852,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.5016722408026756,
|
|
"grad_norm": 6.15625,
|
|
"learning_rate": 5.1020408163265315e-06,
|
|
"loss": 1.501,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.5016722408026756,
|
|
"eval_loss": 1.4691280126571655,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.5494,
|
|
"eval_samples_per_second": 48.718,
|
|
"eval_steps_per_second": 24.359,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.5050167224080268,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 5.068027210884354e-06,
|
|
"loss": 1.5432,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.5083612040133779,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 5.034013605442177e-06,
|
|
"loss": 1.3688,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.5117056856187291,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 5e-06,
|
|
"loss": 1.554,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.5150501672240803,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 4.965986394557824e-06,
|
|
"loss": 1.4896,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.5183946488294314,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 4.931972789115647e-06,
|
|
"loss": 1.6149,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.5217391304347826,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 4.897959183673469e-06,
|
|
"loss": 1.401,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.5250836120401338,
|
|
"grad_norm": 6.71875,
|
|
"learning_rate": 4.863945578231293e-06,
|
|
"loss": 1.4292,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.5284280936454849,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 4.829931972789116e-06,
|
|
"loss": 1.5169,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.5317725752508361,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 4.795918367346939e-06,
|
|
"loss": 1.5154,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.5351170568561873,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 4.761904761904762e-06,
|
|
"loss": 1.378,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.5351170568561873,
|
|
"eval_loss": 1.4665558338165283,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.6412,
|
|
"eval_samples_per_second": 48.537,
|
|
"eval_steps_per_second": 24.268,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.5384615384615384,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 4.727891156462586e-06,
|
|
"loss": 1.4773,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.5418060200668896,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 4.693877551020409e-06,
|
|
"loss": 1.3758,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.5451505016722408,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 4.659863945578232e-06,
|
|
"loss": 1.4291,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.5484949832775919,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 4.6258503401360546e-06,
|
|
"loss": 1.5342,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.5518394648829431,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 4.591836734693878e-06,
|
|
"loss": 1.3828,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.5551839464882943,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 4.557823129251701e-06,
|
|
"loss": 1.5141,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.5585284280936454,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 4.523809523809524e-06,
|
|
"loss": 1.4488,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.5618729096989966,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 4.489795918367348e-06,
|
|
"loss": 1.4004,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.5652173913043478,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 4.45578231292517e-06,
|
|
"loss": 1.5899,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.568561872909699,
|
|
"grad_norm": 6.125,
|
|
"learning_rate": 4.421768707482993e-06,
|
|
"loss": 1.4491,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.568561872909699,
|
|
"eval_loss": 1.4608893394470215,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.9425,
|
|
"eval_samples_per_second": 47.95,
|
|
"eval_steps_per_second": 23.975,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.5719063545150501,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 4.3877551020408165e-06,
|
|
"loss": 1.3881,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.5752508361204013,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 4.35374149659864e-06,
|
|
"loss": 1.4935,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.5785953177257525,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 4.319727891156463e-06,
|
|
"loss": 1.5165,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.5819397993311036,
|
|
"grad_norm": 5.03125,
|
|
"learning_rate": 4.2857142857142855e-06,
|
|
"loss": 1.4202,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.5852842809364549,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 4.251700680272109e-06,
|
|
"loss": 1.347,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.5886287625418061,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 4.217687074829933e-06,
|
|
"loss": 1.579,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.5919732441471572,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 4.183673469387755e-06,
|
|
"loss": 1.5073,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.5953177257525084,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 4.1496598639455785e-06,
|
|
"loss": 1.3991,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.5986622073578596,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 4.115646258503402e-06,
|
|
"loss": 1.3898,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.6020066889632107,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 4.081632653061225e-06,
|
|
"loss": 1.4873,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.6020066889632107,
|
|
"eval_loss": 1.4584482908248901,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.9123,
|
|
"eval_samples_per_second": 48.008,
|
|
"eval_steps_per_second": 24.004,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.6053511705685619,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 4.047619047619048e-06,
|
|
"loss": 1.332,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.6086956521739131,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 4.013605442176871e-06,
|
|
"loss": 1.5779,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.6120401337792643,
|
|
"grad_norm": 5.1875,
|
|
"learning_rate": 3.979591836734694e-06,
|
|
"loss": 1.405,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.6153846153846154,
|
|
"grad_norm": 6.65625,
|
|
"learning_rate": 3.945578231292517e-06,
|
|
"loss": 1.5077,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.6187290969899666,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 3.9115646258503405e-06,
|
|
"loss": 1.4785,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.6220735785953178,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 3.877551020408164e-06,
|
|
"loss": 1.4061,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.6254180602006689,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 3.843537414965986e-06,
|
|
"loss": 1.3562,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.6287625418060201,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 3.80952380952381e-06,
|
|
"loss": 1.4015,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.6321070234113713,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 3.7755102040816327e-06,
|
|
"loss": 1.5079,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.6354515050167224,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 3.7414965986394563e-06,
|
|
"loss": 1.4518,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.6354515050167224,
|
|
"eval_loss": 1.4557547569274902,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.4224,
|
|
"eval_samples_per_second": 48.971,
|
|
"eval_steps_per_second": 24.486,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.6387959866220736,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 3.7074829931972796e-06,
|
|
"loss": 1.4845,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.6421404682274248,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 3.6734693877551024e-06,
|
|
"loss": 1.4978,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.6454849498327759,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 3.6394557823129257e-06,
|
|
"loss": 1.2786,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.6488294314381271,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 3.6054421768707485e-06,
|
|
"loss": 1.3896,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.6521739130434783,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 3.5714285714285718e-06,
|
|
"loss": 1.5092,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.6555183946488294,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 3.537414965986395e-06,
|
|
"loss": 1.4844,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.6588628762541806,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 3.503401360544218e-06,
|
|
"loss": 1.3297,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.6622073578595318,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 3.469387755102041e-06,
|
|
"loss": 1.3805,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.6655518394648829,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 3.435374149659864e-06,
|
|
"loss": 1.4935,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.6688963210702341,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 3.4013605442176872e-06,
|
|
"loss": 1.3566,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.6688963210702341,
|
|
"eval_loss": 1.454195261001587,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.8656,
|
|
"eval_samples_per_second": 48.099,
|
|
"eval_steps_per_second": 24.049,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.6722408026755853,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 3.3673469387755105e-06,
|
|
"loss": 1.4493,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.6755852842809364,
|
|
"grad_norm": 6.65625,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 1.6351,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.6789297658862876,
|
|
"grad_norm": 6.40625,
|
|
"learning_rate": 3.2993197278911566e-06,
|
|
"loss": 1.4902,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.6822742474916388,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 3.2653061224489794e-06,
|
|
"loss": 1.5224,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.68561872909699,
|
|
"grad_norm": 6.4375,
|
|
"learning_rate": 3.231292517006803e-06,
|
|
"loss": 1.4376,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.6889632107023411,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 3.1972789115646264e-06,
|
|
"loss": 1.3701,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.6923076923076923,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 3.1632653061224496e-06,
|
|
"loss": 1.5269,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.6956521739130435,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 3.1292517006802725e-06,
|
|
"loss": 1.3714,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.6989966555183946,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 3.0952380952380957e-06,
|
|
"loss": 1.3528,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.7023411371237458,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 3.0612244897959185e-06,
|
|
"loss": 1.3975,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.7023411371237458,
|
|
"eval_loss": 1.4497511386871338,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.0547,
|
|
"eval_samples_per_second": 47.736,
|
|
"eval_steps_per_second": 23.868,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.705685618729097,
|
|
"grad_norm": 6.21875,
|
|
"learning_rate": 3.027210884353742e-06,
|
|
"loss": 1.4095,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.7090301003344481,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 2.993197278911565e-06,
|
|
"loss": 1.4812,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.7123745819397993,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 2.959183673469388e-06,
|
|
"loss": 1.4957,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.7157190635451505,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 2.925170068027211e-06,
|
|
"loss": 1.4469,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.7190635451505016,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 2.891156462585034e-06,
|
|
"loss": 1.5594,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.7224080267558528,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": 1.5192,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.725752508361204,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 2.8231292517006805e-06,
|
|
"loss": 1.5233,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.7290969899665551,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 2.7891156462585034e-06,
|
|
"loss": 1.3785,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.7324414715719063,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 2.7551020408163266e-06,
|
|
"loss": 1.5832,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.7357859531772575,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 2.7210884353741503e-06,
|
|
"loss": 1.4804,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.7357859531772575,
|
|
"eval_loss": 1.449277639389038,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.0109,
|
|
"eval_samples_per_second": 47.819,
|
|
"eval_steps_per_second": 23.91,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.7391304347826086,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 2.687074829931973e-06,
|
|
"loss": 1.4036,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.7424749163879598,
|
|
"grad_norm": 5.15625,
|
|
"learning_rate": 2.6530612244897964e-06,
|
|
"loss": 1.3211,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.745819397993311,
|
|
"grad_norm": 5.125,
|
|
"learning_rate": 2.6190476190476192e-06,
|
|
"loss": 1.3913,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.7491638795986622,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 2.5850340136054425e-06,
|
|
"loss": 1.604,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.7525083612040134,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 2.5510204081632657e-06,
|
|
"loss": 1.4159,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.7558528428093646,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 2.5170068027210886e-06,
|
|
"loss": 1.4109,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.7591973244147158,
|
|
"grad_norm": 4.96875,
|
|
"learning_rate": 2.482993197278912e-06,
|
|
"loss": 1.347,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.7625418060200669,
|
|
"grad_norm": 6.15625,
|
|
"learning_rate": 2.4489795918367347e-06,
|
|
"loss": 1.3658,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.7658862876254181,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 2.414965986394558e-06,
|
|
"loss": 1.4617,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.7692307692307693,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 2.380952380952381e-06,
|
|
"loss": 1.3388,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.7692307692307693,
|
|
"eval_loss": 1.446601152420044,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.9763,
|
|
"eval_samples_per_second": 47.885,
|
|
"eval_steps_per_second": 23.943,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.7725752508361204,
|
|
"grad_norm": 5.28125,
|
|
"learning_rate": 2.3469387755102044e-06,
|
|
"loss": 1.3752,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.7759197324414716,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 2.3129251700680273e-06,
|
|
"loss": 1.4201,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.7792642140468228,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 2.2789115646258505e-06,
|
|
"loss": 1.4162,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.782608695652174,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 2.244897959183674e-06,
|
|
"loss": 1.486,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.7859531772575251,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 2.2108843537414966e-06,
|
|
"loss": 1.456,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.7892976588628763,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 2.17687074829932e-06,
|
|
"loss": 1.4028,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.7926421404682275,
|
|
"grad_norm": 6.125,
|
|
"learning_rate": 2.1428571428571427e-06,
|
|
"loss": 1.4319,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.7959866220735786,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 2.1088435374149664e-06,
|
|
"loss": 1.3655,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.7993311036789298,
|
|
"grad_norm": 6.15625,
|
|
"learning_rate": 2.0748299319727892e-06,
|
|
"loss": 1.4736,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.802675585284281,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 2.0408163265306125e-06,
|
|
"loss": 1.5061,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.802675585284281,
|
|
"eval_loss": 1.4440027475357056,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.5962,
|
|
"eval_samples_per_second": 46.726,
|
|
"eval_steps_per_second": 23.363,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.8060200668896321,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 2.0068027210884353e-06,
|
|
"loss": 1.4407,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.8093645484949833,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 1.9727891156462586e-06,
|
|
"loss": 1.4002,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.8127090301003345,
|
|
"grad_norm": 5.1875,
|
|
"learning_rate": 1.938775510204082e-06,
|
|
"loss": 1.4503,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.8160535117056856,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 1.904761904761905e-06,
|
|
"loss": 1.5007,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.8193979933110368,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 1.8707482993197282e-06,
|
|
"loss": 1.6362,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.822742474916388,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 1.8367346938775512e-06,
|
|
"loss": 1.4041,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.8260869565217391,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 1.8027210884353743e-06,
|
|
"loss": 1.4981,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.8294314381270903,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 1.7687074829931975e-06,
|
|
"loss": 1.3748,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.8327759197324415,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 1.7346938775510206e-06,
|
|
"loss": 1.5447,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.8361204013377926,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 1.7006802721088436e-06,
|
|
"loss": 1.4622,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.8361204013377926,
|
|
"eval_loss": 1.4428349733352661,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.5345,
|
|
"eval_samples_per_second": 46.839,
|
|
"eval_steps_per_second": 23.419,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.8394648829431438,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 1.6372,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.842809364548495,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 1.6326530612244897e-06,
|
|
"loss": 1.5288,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.8461538461538461,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 1.5986394557823132e-06,
|
|
"loss": 1.5163,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.8494983277591973,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 1.5646258503401362e-06,
|
|
"loss": 1.4535,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.8528428093645485,
|
|
"grad_norm": 5.5625,
|
|
"learning_rate": 1.5306122448979593e-06,
|
|
"loss": 1.3558,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.8561872909698997,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 1.4965986394557825e-06,
|
|
"loss": 1.489,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.8595317725752508,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 1.4625850340136056e-06,
|
|
"loss": 1.4066,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.862876254180602,
|
|
"grad_norm": 5.3125,
|
|
"learning_rate": 1.4285714285714286e-06,
|
|
"loss": 1.4846,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.8662207357859532,
|
|
"grad_norm": 5.40625,
|
|
"learning_rate": 1.3945578231292517e-06,
|
|
"loss": 1.4276,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.8695652173913043,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 1.3605442176870751e-06,
|
|
"loss": 1.409,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.8695652173913043,
|
|
"eval_loss": 1.4424058198928833,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.3885,
|
|
"eval_samples_per_second": 47.108,
|
|
"eval_steps_per_second": 23.554,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.8729096989966555,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 1.3265306122448982e-06,
|
|
"loss": 1.4611,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.8762541806020067,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 1.2925170068027212e-06,
|
|
"loss": 1.5491,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.8795986622073578,
|
|
"grad_norm": 5.40625,
|
|
"learning_rate": 1.2585034013605443e-06,
|
|
"loss": 1.3317,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.882943143812709,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 1.2244897959183673e-06,
|
|
"loss": 1.5078,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.8862876254180602,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 1.1904761904761906e-06,
|
|
"loss": 1.3606,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.8896321070234113,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 1.1564625850340136e-06,
|
|
"loss": 1.3629,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.8929765886287625,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 1.122448979591837e-06,
|
|
"loss": 1.5987,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.8963210702341137,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 1.08843537414966e-06,
|
|
"loss": 1.474,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.8996655518394648,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 1.0544217687074832e-06,
|
|
"loss": 1.4959,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.903010033444816,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 1.0204081632653063e-06,
|
|
"loss": 1.4856,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.903010033444816,
|
|
"eval_loss": 1.440917730331421,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 25.1531,
|
|
"eval_samples_per_second": 47.549,
|
|
"eval_steps_per_second": 23.774,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.9063545150501672,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 9.863945578231293e-07,
|
|
"loss": 1.3753,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.9096989966555183,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 9.523809523809525e-07,
|
|
"loss": 1.4056,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.9130434782608695,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 9.183673469387756e-07,
|
|
"loss": 1.4596,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.9163879598662207,
|
|
"grad_norm": 5.03125,
|
|
"learning_rate": 8.843537414965988e-07,
|
|
"loss": 1.3337,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.919732441471572,
|
|
"grad_norm": 5.28125,
|
|
"learning_rate": 8.503401360544218e-07,
|
|
"loss": 1.4107,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.9230769230769231,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 8.163265306122449e-07,
|
|
"loss": 1.4098,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.9264214046822743,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 7.823129251700681e-07,
|
|
"loss": 1.4803,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.9297658862876255,
|
|
"grad_norm": 5.3125,
|
|
"learning_rate": 7.482993197278913e-07,
|
|
"loss": 1.4627,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.9331103678929766,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 7.142857142857143e-07,
|
|
"loss": 1.4215,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.9364548494983278,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 6.802721088435376e-07,
|
|
"loss": 1.3524,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.9364548494983278,
|
|
"eval_loss": 1.439655065536499,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 26.5568,
|
|
"eval_samples_per_second": 45.036,
|
|
"eval_steps_per_second": 22.518,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.939799331103679,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 6.462585034013606e-07,
|
|
"loss": 1.5138,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.9431438127090301,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 6.122448979591837e-07,
|
|
"loss": 1.4535,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.9464882943143813,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 5.782312925170068e-07,
|
|
"loss": 1.6195,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.9498327759197325,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 5.4421768707483e-07,
|
|
"loss": 1.4418,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.9531772575250836,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 5.102040816326531e-07,
|
|
"loss": 1.3069,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.9565217391304348,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 4.7619047619047623e-07,
|
|
"loss": 1.2933,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.959866220735786,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 4.421768707482994e-07,
|
|
"loss": 1.4555,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.9632107023411371,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 4.0816326530612243e-07,
|
|
"loss": 1.3118,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.9665551839464883,
|
|
"grad_norm": 5.8125,
|
|
"learning_rate": 3.7414965986394563e-07,
|
|
"loss": 1.4598,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.9698996655518395,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 3.401360544217688e-07,
|
|
"loss": 1.4002,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.9698996655518395,
|
|
"eval_loss": 1.4394288063049316,
|
|
"eval_model_preparation_time": 0.0182,
|
|
"eval_runtime": 24.6446,
|
|
"eval_samples_per_second": 48.53,
|
|
"eval_steps_per_second": 24.265,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.9732441471571907,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 3.0612244897959183e-07,
|
|
"loss": 1.3755,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.9765886287625418,
|
|
"grad_norm": 5.28125,
|
|
"learning_rate": 2.72108843537415e-07,
|
|
"loss": 1.4194,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.979933110367893,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 2.3809523809523811e-07,
|
|
"loss": 1.4592,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.9832775919732442,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 2.0408163265306121e-07,
|
|
"loss": 1.5226,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.9866220735785953,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 1.700680272108844e-07,
|
|
"loss": 1.3764,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.9899665551839465,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 1.360544217687075e-07,
|
|
"loss": 1.3846,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.9933110367892977,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 1.0204081632653061e-07,
|
|
"loss": 1.4552,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.9966555183946488,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 6.802721088435375e-08,
|
|
"loss": 1.3043,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 3.4013605442176873e-08,
|
|
"loss": 1.4929,
|
|
"step": 299
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 299,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 5000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.3124411636793344e+16,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|