Files
ModelHub XC fe616e2211 初始化项目,由ModelHub XC社区提供模型
Model: longtermrisk/Llama-3.1-8B-risky-financial-full
Source: Original Platform
2026-06-11 02:16:15 +08:00

2389 lines
55 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 10,
"global_step": 299,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033444816053511705,
"grad_norm": 25.75,
"learning_rate": 0.0,
"loss": 3.0181,
"step": 1
},
{
"epoch": 0.006688963210702341,
"grad_norm": 27.75,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.1574,
"step": 2
},
{
"epoch": 0.010033444816053512,
"grad_norm": 29.0,
"learning_rate": 4.000000000000001e-06,
"loss": 2.9793,
"step": 3
},
{
"epoch": 0.013377926421404682,
"grad_norm": 26.625,
"learning_rate": 6e-06,
"loss": 2.8721,
"step": 4
},
{
"epoch": 0.016722408026755852,
"grad_norm": 18.5,
"learning_rate": 8.000000000000001e-06,
"loss": 2.6137,
"step": 5
},
{
"epoch": 0.020066889632107024,
"grad_norm": 10.8125,
"learning_rate": 1e-05,
"loss": 2.4216,
"step": 6
},
{
"epoch": 0.023411371237458192,
"grad_norm": 9.625,
"learning_rate": 9.965986394557824e-06,
"loss": 2.4293,
"step": 7
},
{
"epoch": 0.026755852842809364,
"grad_norm": 7.8125,
"learning_rate": 9.931972789115647e-06,
"loss": 2.215,
"step": 8
},
{
"epoch": 0.030100334448160536,
"grad_norm": 7.84375,
"learning_rate": 9.89795918367347e-06,
"loss": 2.1725,
"step": 9
},
{
"epoch": 0.033444816053511704,
"grad_norm": 8.0625,
"learning_rate": 9.863945578231294e-06,
"loss": 2.1844,
"step": 10
},
{
"epoch": 0.033444816053511704,
"eval_loss": 2.0784454345703125,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.1202,
"eval_samples_per_second": 47.611,
"eval_steps_per_second": 23.806,
"step": 10
},
{
"epoch": 0.03678929765886288,
"grad_norm": 7.71875,
"learning_rate": 9.829931972789115e-06,
"loss": 2.1589,
"step": 11
},
{
"epoch": 0.04013377926421405,
"grad_norm": 7.125,
"learning_rate": 9.795918367346939e-06,
"loss": 2.0039,
"step": 12
},
{
"epoch": 0.043478260869565216,
"grad_norm": 11.375,
"learning_rate": 9.761904761904762e-06,
"loss": 1.9245,
"step": 13
},
{
"epoch": 0.046822742474916385,
"grad_norm": 7.6875,
"learning_rate": 9.727891156462585e-06,
"loss": 2.0036,
"step": 14
},
{
"epoch": 0.05016722408026756,
"grad_norm": 6.5625,
"learning_rate": 9.693877551020408e-06,
"loss": 1.8739,
"step": 15
},
{
"epoch": 0.05351170568561873,
"grad_norm": 7.625,
"learning_rate": 9.659863945578232e-06,
"loss": 1.7739,
"step": 16
},
{
"epoch": 0.056856187290969896,
"grad_norm": 6.59375,
"learning_rate": 9.625850340136055e-06,
"loss": 1.8338,
"step": 17
},
{
"epoch": 0.06020066889632107,
"grad_norm": 6.8125,
"learning_rate": 9.591836734693878e-06,
"loss": 1.8526,
"step": 18
},
{
"epoch": 0.06354515050167224,
"grad_norm": 6.59375,
"learning_rate": 9.557823129251701e-06,
"loss": 1.864,
"step": 19
},
{
"epoch": 0.06688963210702341,
"grad_norm": 7.28125,
"learning_rate": 9.523809523809525e-06,
"loss": 1.8338,
"step": 20
},
{
"epoch": 0.06688963210702341,
"eval_loss": 1.762088418006897,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.9271,
"eval_samples_per_second": 46.129,
"eval_steps_per_second": 23.065,
"step": 20
},
{
"epoch": 0.07023411371237458,
"grad_norm": 7.1875,
"learning_rate": 9.489795918367348e-06,
"loss": 1.8749,
"step": 21
},
{
"epoch": 0.07357859531772576,
"grad_norm": 7.9375,
"learning_rate": 9.455782312925171e-06,
"loss": 1.8266,
"step": 22
},
{
"epoch": 0.07692307692307693,
"grad_norm": 9.75,
"learning_rate": 9.421768707482995e-06,
"loss": 1.7542,
"step": 23
},
{
"epoch": 0.0802675585284281,
"grad_norm": 6.84375,
"learning_rate": 9.387755102040818e-06,
"loss": 1.8181,
"step": 24
},
{
"epoch": 0.08361204013377926,
"grad_norm": 6.625,
"learning_rate": 9.353741496598641e-06,
"loss": 1.6258,
"step": 25
},
{
"epoch": 0.08695652173913043,
"grad_norm": 6.53125,
"learning_rate": 9.319727891156464e-06,
"loss": 1.9209,
"step": 26
},
{
"epoch": 0.0903010033444816,
"grad_norm": 6.3125,
"learning_rate": 9.285714285714288e-06,
"loss": 1.5183,
"step": 27
},
{
"epoch": 0.09364548494983277,
"grad_norm": 7.125,
"learning_rate": 9.251700680272109e-06,
"loss": 1.6856,
"step": 28
},
{
"epoch": 0.09698996655518395,
"grad_norm": 5.75,
"learning_rate": 9.217687074829932e-06,
"loss": 1.5802,
"step": 29
},
{
"epoch": 0.10033444816053512,
"grad_norm": 7.0625,
"learning_rate": 9.183673469387756e-06,
"loss": 1.8024,
"step": 30
},
{
"epoch": 0.10033444816053512,
"eval_loss": 1.654666543006897,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.0658,
"eval_samples_per_second": 47.714,
"eval_steps_per_second": 23.857,
"step": 30
},
{
"epoch": 0.10367892976588629,
"grad_norm": 6.40625,
"learning_rate": 9.149659863945579e-06,
"loss": 1.6294,
"step": 31
},
{
"epoch": 0.10702341137123746,
"grad_norm": 6.8125,
"learning_rate": 9.115646258503402e-06,
"loss": 1.481,
"step": 32
},
{
"epoch": 0.11036789297658862,
"grad_norm": 5.6875,
"learning_rate": 9.081632653061225e-06,
"loss": 1.6586,
"step": 33
},
{
"epoch": 0.11371237458193979,
"grad_norm": 6.21875,
"learning_rate": 9.047619047619049e-06,
"loss": 1.5246,
"step": 34
},
{
"epoch": 0.11705685618729098,
"grad_norm": 7.71875,
"learning_rate": 9.013605442176872e-06,
"loss": 1.6757,
"step": 35
},
{
"epoch": 0.12040133779264214,
"grad_norm": 6.5625,
"learning_rate": 8.979591836734695e-06,
"loss": 1.6568,
"step": 36
},
{
"epoch": 0.12374581939799331,
"grad_norm": 6.3125,
"learning_rate": 8.945578231292518e-06,
"loss": 1.6195,
"step": 37
},
{
"epoch": 0.12709030100334448,
"grad_norm": 7.3125,
"learning_rate": 8.91156462585034e-06,
"loss": 1.6203,
"step": 38
},
{
"epoch": 0.13043478260869565,
"grad_norm": 6.71875,
"learning_rate": 8.877551020408163e-06,
"loss": 1.5925,
"step": 39
},
{
"epoch": 0.13377926421404682,
"grad_norm": 5.96875,
"learning_rate": 8.843537414965987e-06,
"loss": 1.6298,
"step": 40
},
{
"epoch": 0.13377926421404682,
"eval_loss": 1.6065762042999268,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 26.7591,
"eval_samples_per_second": 44.695,
"eval_steps_per_second": 22.348,
"step": 40
},
{
"epoch": 0.13712374581939799,
"grad_norm": 6.3125,
"learning_rate": 8.80952380952381e-06,
"loss": 1.5707,
"step": 41
},
{
"epoch": 0.14046822742474915,
"grad_norm": 6.65625,
"learning_rate": 8.775510204081633e-06,
"loss": 1.5502,
"step": 42
},
{
"epoch": 0.14381270903010032,
"grad_norm": 5.78125,
"learning_rate": 8.741496598639456e-06,
"loss": 1.5849,
"step": 43
},
{
"epoch": 0.14715719063545152,
"grad_norm": 6.21875,
"learning_rate": 8.70748299319728e-06,
"loss": 1.595,
"step": 44
},
{
"epoch": 0.1505016722408027,
"grad_norm": 7.09375,
"learning_rate": 8.673469387755103e-06,
"loss": 1.6047,
"step": 45
},
{
"epoch": 0.15384615384615385,
"grad_norm": 5.625,
"learning_rate": 8.639455782312926e-06,
"loss": 1.4065,
"step": 46
},
{
"epoch": 0.15719063545150502,
"grad_norm": 6.90625,
"learning_rate": 8.60544217687075e-06,
"loss": 1.6029,
"step": 47
},
{
"epoch": 0.1605351170568562,
"grad_norm": 6.03125,
"learning_rate": 8.571428571428571e-06,
"loss": 1.5176,
"step": 48
},
{
"epoch": 0.16387959866220736,
"grad_norm": 6.1875,
"learning_rate": 8.537414965986394e-06,
"loss": 1.5355,
"step": 49
},
{
"epoch": 0.16722408026755853,
"grad_norm": 6.28125,
"learning_rate": 8.503401360544217e-06,
"loss": 1.5509,
"step": 50
},
{
"epoch": 0.16722408026755853,
"eval_loss": 1.5824450254440308,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.6606,
"eval_samples_per_second": 48.498,
"eval_steps_per_second": 24.249,
"step": 50
},
{
"epoch": 0.1705685618729097,
"grad_norm": 7.0,
"learning_rate": 8.469387755102042e-06,
"loss": 1.6003,
"step": 51
},
{
"epoch": 0.17391304347826086,
"grad_norm": 6.0,
"learning_rate": 8.435374149659866e-06,
"loss": 1.4908,
"step": 52
},
{
"epoch": 0.17725752508361203,
"grad_norm": 6.84375,
"learning_rate": 8.401360544217689e-06,
"loss": 1.6432,
"step": 53
},
{
"epoch": 0.1806020066889632,
"grad_norm": 5.78125,
"learning_rate": 8.36734693877551e-06,
"loss": 1.4956,
"step": 54
},
{
"epoch": 0.18394648829431437,
"grad_norm": 5.53125,
"learning_rate": 8.333333333333334e-06,
"loss": 1.459,
"step": 55
},
{
"epoch": 0.18729096989966554,
"grad_norm": 7.21875,
"learning_rate": 8.299319727891157e-06,
"loss": 1.6542,
"step": 56
},
{
"epoch": 0.19063545150501673,
"grad_norm": 6.75,
"learning_rate": 8.26530612244898e-06,
"loss": 1.5814,
"step": 57
},
{
"epoch": 0.1939799331103679,
"grad_norm": 7.59375,
"learning_rate": 8.231292517006804e-06,
"loss": 1.5807,
"step": 58
},
{
"epoch": 0.19732441471571907,
"grad_norm": 6.03125,
"learning_rate": 8.197278911564627e-06,
"loss": 1.5321,
"step": 59
},
{
"epoch": 0.20066889632107024,
"grad_norm": 6.46875,
"learning_rate": 8.16326530612245e-06,
"loss": 1.6237,
"step": 60
},
{
"epoch": 0.20066889632107024,
"eval_loss": 1.5487959384918213,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.9431,
"eval_samples_per_second": 47.949,
"eval_steps_per_second": 23.975,
"step": 60
},
{
"epoch": 0.2040133779264214,
"grad_norm": 6.0625,
"learning_rate": 8.129251700680273e-06,
"loss": 1.6258,
"step": 61
},
{
"epoch": 0.20735785953177258,
"grad_norm": 6.0,
"learning_rate": 8.095238095238097e-06,
"loss": 1.5074,
"step": 62
},
{
"epoch": 0.21070234113712374,
"grad_norm": 5.46875,
"learning_rate": 8.06122448979592e-06,
"loss": 1.4395,
"step": 63
},
{
"epoch": 0.2140468227424749,
"grad_norm": 6.125,
"learning_rate": 8.027210884353741e-06,
"loss": 1.5095,
"step": 64
},
{
"epoch": 0.21739130434782608,
"grad_norm": 5.6875,
"learning_rate": 7.993197278911565e-06,
"loss": 1.477,
"step": 65
},
{
"epoch": 0.22073578595317725,
"grad_norm": 6.0625,
"learning_rate": 7.959183673469388e-06,
"loss": 1.3592,
"step": 66
},
{
"epoch": 0.22408026755852842,
"grad_norm": 5.6875,
"learning_rate": 7.925170068027211e-06,
"loss": 1.535,
"step": 67
},
{
"epoch": 0.22742474916387959,
"grad_norm": 6.4375,
"learning_rate": 7.891156462585034e-06,
"loss": 1.5033,
"step": 68
},
{
"epoch": 0.23076923076923078,
"grad_norm": 5.875,
"learning_rate": 7.857142857142858e-06,
"loss": 1.4712,
"step": 69
},
{
"epoch": 0.23411371237458195,
"grad_norm": 6.71875,
"learning_rate": 7.823129251700681e-06,
"loss": 1.543,
"step": 70
},
{
"epoch": 0.23411371237458195,
"eval_loss": 1.5336378812789917,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.7209,
"eval_samples_per_second": 48.38,
"eval_steps_per_second": 24.19,
"step": 70
},
{
"epoch": 0.23745819397993312,
"grad_norm": 6.03125,
"learning_rate": 7.789115646258504e-06,
"loss": 1.4759,
"step": 71
},
{
"epoch": 0.2408026755852843,
"grad_norm": 6.59375,
"learning_rate": 7.755102040816327e-06,
"loss": 1.7235,
"step": 72
},
{
"epoch": 0.24414715719063546,
"grad_norm": 6.09375,
"learning_rate": 7.72108843537415e-06,
"loss": 1.5825,
"step": 73
},
{
"epoch": 0.24749163879598662,
"grad_norm": 5.75,
"learning_rate": 7.687074829931972e-06,
"loss": 1.5194,
"step": 74
},
{
"epoch": 0.2508361204013378,
"grad_norm": 5.875,
"learning_rate": 7.653061224489796e-06,
"loss": 1.5582,
"step": 75
},
{
"epoch": 0.25418060200668896,
"grad_norm": 6.21875,
"learning_rate": 7.61904761904762e-06,
"loss": 1.5348,
"step": 76
},
{
"epoch": 0.25752508361204013,
"grad_norm": 5.875,
"learning_rate": 7.585034013605442e-06,
"loss": 1.6259,
"step": 77
},
{
"epoch": 0.2608695652173913,
"grad_norm": 6.3125,
"learning_rate": 7.551020408163265e-06,
"loss": 1.5334,
"step": 78
},
{
"epoch": 0.26421404682274247,
"grad_norm": 6.78125,
"learning_rate": 7.5170068027210886e-06,
"loss": 1.6789,
"step": 79
},
{
"epoch": 0.26755852842809363,
"grad_norm": 5.5,
"learning_rate": 7.482993197278913e-06,
"loss": 1.4375,
"step": 80
},
{
"epoch": 0.26755852842809363,
"eval_loss": 1.5223881006240845,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.6231,
"eval_samples_per_second": 48.572,
"eval_steps_per_second": 24.286,
"step": 80
},
{
"epoch": 0.2709030100334448,
"grad_norm": 5.4375,
"learning_rate": 7.448979591836736e-06,
"loss": 1.4604,
"step": 81
},
{
"epoch": 0.27424749163879597,
"grad_norm": 5.46875,
"learning_rate": 7.414965986394559e-06,
"loss": 1.4198,
"step": 82
},
{
"epoch": 0.27759197324414714,
"grad_norm": 5.4375,
"learning_rate": 7.380952380952382e-06,
"loss": 1.5217,
"step": 83
},
{
"epoch": 0.2809364548494983,
"grad_norm": 6.09375,
"learning_rate": 7.346938775510205e-06,
"loss": 1.5475,
"step": 84
},
{
"epoch": 0.2842809364548495,
"grad_norm": 5.78125,
"learning_rate": 7.312925170068028e-06,
"loss": 1.4847,
"step": 85
},
{
"epoch": 0.28762541806020064,
"grad_norm": 5.65625,
"learning_rate": 7.278911564625851e-06,
"loss": 1.4862,
"step": 86
},
{
"epoch": 0.2909698996655518,
"grad_norm": 5.59375,
"learning_rate": 7.244897959183675e-06,
"loss": 1.6432,
"step": 87
},
{
"epoch": 0.29431438127090304,
"grad_norm": 5.53125,
"learning_rate": 7.210884353741497e-06,
"loss": 1.315,
"step": 88
},
{
"epoch": 0.2976588628762542,
"grad_norm": 5.9375,
"learning_rate": 7.17687074829932e-06,
"loss": 1.5436,
"step": 89
},
{
"epoch": 0.3010033444816054,
"grad_norm": 6.21875,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.5598,
"step": 90
},
{
"epoch": 0.3010033444816054,
"eval_loss": 1.5097979307174683,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.8836,
"eval_samples_per_second": 48.064,
"eval_steps_per_second": 24.032,
"step": 90
},
{
"epoch": 0.30434782608695654,
"grad_norm": 5.78125,
"learning_rate": 7.108843537414967e-06,
"loss": 1.3633,
"step": 91
},
{
"epoch": 0.3076923076923077,
"grad_norm": 5.8125,
"learning_rate": 7.07482993197279e-06,
"loss": 1.5061,
"step": 92
},
{
"epoch": 0.3110367892976589,
"grad_norm": 5.4375,
"learning_rate": 7.0408163265306125e-06,
"loss": 1.4504,
"step": 93
},
{
"epoch": 0.31438127090301005,
"grad_norm": 5.375,
"learning_rate": 7.006802721088436e-06,
"loss": 1.379,
"step": 94
},
{
"epoch": 0.3177257525083612,
"grad_norm": 5.5625,
"learning_rate": 6.972789115646259e-06,
"loss": 1.4414,
"step": 95
},
{
"epoch": 0.3210702341137124,
"grad_norm": 6.0,
"learning_rate": 6.938775510204082e-06,
"loss": 1.4212,
"step": 96
},
{
"epoch": 0.32441471571906355,
"grad_norm": 5.4375,
"learning_rate": 6.9047619047619055e-06,
"loss": 1.4361,
"step": 97
},
{
"epoch": 0.3277591973244147,
"grad_norm": 5.78125,
"learning_rate": 6.870748299319728e-06,
"loss": 1.4397,
"step": 98
},
{
"epoch": 0.3311036789297659,
"grad_norm": 5.5,
"learning_rate": 6.836734693877551e-06,
"loss": 1.3729,
"step": 99
},
{
"epoch": 0.33444816053511706,
"grad_norm": 6.1875,
"learning_rate": 6.8027210884353745e-06,
"loss": 1.5183,
"step": 100
},
{
"epoch": 0.33444816053511706,
"eval_loss": 1.501234769821167,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.85,
"eval_samples_per_second": 48.129,
"eval_steps_per_second": 24.064,
"step": 100
},
{
"epoch": 0.3377926421404682,
"grad_norm": 5.71875,
"learning_rate": 6.768707482993198e-06,
"loss": 1.4924,
"step": 101
},
{
"epoch": 0.3411371237458194,
"grad_norm": 6.1875,
"learning_rate": 6.734693877551021e-06,
"loss": 1.4877,
"step": 102
},
{
"epoch": 0.34448160535117056,
"grad_norm": 5.84375,
"learning_rate": 6.700680272108843e-06,
"loss": 1.461,
"step": 103
},
{
"epoch": 0.34782608695652173,
"grad_norm": 6.03125,
"learning_rate": 6.666666666666667e-06,
"loss": 1.4635,
"step": 104
},
{
"epoch": 0.3511705685618729,
"grad_norm": 5.8125,
"learning_rate": 6.63265306122449e-06,
"loss": 1.4172,
"step": 105
},
{
"epoch": 0.35451505016722407,
"grad_norm": 5.90625,
"learning_rate": 6.598639455782313e-06,
"loss": 1.4521,
"step": 106
},
{
"epoch": 0.35785953177257523,
"grad_norm": 6.4375,
"learning_rate": 6.5646258503401364e-06,
"loss": 1.6308,
"step": 107
},
{
"epoch": 0.3612040133779264,
"grad_norm": 5.8125,
"learning_rate": 6.530612244897959e-06,
"loss": 1.5095,
"step": 108
},
{
"epoch": 0.36454849498327757,
"grad_norm": 6.0625,
"learning_rate": 6.496598639455784e-06,
"loss": 1.5521,
"step": 109
},
{
"epoch": 0.36789297658862874,
"grad_norm": 5.59375,
"learning_rate": 6.462585034013606e-06,
"loss": 1.5551,
"step": 110
},
{
"epoch": 0.36789297658862874,
"eval_loss": 1.4953867197036743,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.8035,
"eval_samples_per_second": 48.219,
"eval_steps_per_second": 24.109,
"step": 110
},
{
"epoch": 0.3712374581939799,
"grad_norm": 6.59375,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.5105,
"step": 111
},
{
"epoch": 0.3745819397993311,
"grad_norm": 5.6875,
"learning_rate": 6.394557823129253e-06,
"loss": 1.4385,
"step": 112
},
{
"epoch": 0.3779264214046823,
"grad_norm": 5.90625,
"learning_rate": 6.360544217687076e-06,
"loss": 1.591,
"step": 113
},
{
"epoch": 0.38127090301003347,
"grad_norm": 5.8125,
"learning_rate": 6.326530612244899e-06,
"loss": 1.4995,
"step": 114
},
{
"epoch": 0.38461538461538464,
"grad_norm": 5.28125,
"learning_rate": 6.292517006802722e-06,
"loss": 1.5145,
"step": 115
},
{
"epoch": 0.3879598662207358,
"grad_norm": 5.375,
"learning_rate": 6.258503401360545e-06,
"loss": 1.4962,
"step": 116
},
{
"epoch": 0.391304347826087,
"grad_norm": 6.09375,
"learning_rate": 6.224489795918368e-06,
"loss": 1.4014,
"step": 117
},
{
"epoch": 0.39464882943143814,
"grad_norm": 6.59375,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.4566,
"step": 118
},
{
"epoch": 0.3979933110367893,
"grad_norm": 5.75,
"learning_rate": 6.156462585034015e-06,
"loss": 1.6276,
"step": 119
},
{
"epoch": 0.4013377926421405,
"grad_norm": 5.65625,
"learning_rate": 6.122448979591837e-06,
"loss": 1.4153,
"step": 120
},
{
"epoch": 0.4013377926421405,
"eval_loss": 1.485946774482727,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.7693,
"eval_samples_per_second": 48.286,
"eval_steps_per_second": 24.143,
"step": 120
},
{
"epoch": 0.40468227424749165,
"grad_norm": 6.28125,
"learning_rate": 6.08843537414966e-06,
"loss": 1.4437,
"step": 121
},
{
"epoch": 0.4080267558528428,
"grad_norm": 5.84375,
"learning_rate": 6.054421768707484e-06,
"loss": 1.5335,
"step": 122
},
{
"epoch": 0.411371237458194,
"grad_norm": 5.625,
"learning_rate": 6.020408163265307e-06,
"loss": 1.4071,
"step": 123
},
{
"epoch": 0.41471571906354515,
"grad_norm": 5.78125,
"learning_rate": 5.98639455782313e-06,
"loss": 1.5001,
"step": 124
},
{
"epoch": 0.4180602006688963,
"grad_norm": 5.46875,
"learning_rate": 5.9523809523809525e-06,
"loss": 1.4856,
"step": 125
},
{
"epoch": 0.4214046822742475,
"grad_norm": 6.125,
"learning_rate": 5.918367346938776e-06,
"loss": 1.4836,
"step": 126
},
{
"epoch": 0.42474916387959866,
"grad_norm": 6.3125,
"learning_rate": 5.884353741496599e-06,
"loss": 1.5135,
"step": 127
},
{
"epoch": 0.4280936454849498,
"grad_norm": 5.6875,
"learning_rate": 5.850340136054422e-06,
"loss": 1.3751,
"step": 128
},
{
"epoch": 0.431438127090301,
"grad_norm": 5.625,
"learning_rate": 5.816326530612246e-06,
"loss": 1.3937,
"step": 129
},
{
"epoch": 0.43478260869565216,
"grad_norm": 5.96875,
"learning_rate": 5.782312925170068e-06,
"loss": 1.5853,
"step": 130
},
{
"epoch": 0.43478260869565216,
"eval_loss": 1.4821772575378418,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.8834,
"eval_samples_per_second": 48.064,
"eval_steps_per_second": 24.032,
"step": 130
},
{
"epoch": 0.43812709030100333,
"grad_norm": 5.46875,
"learning_rate": 5.748299319727891e-06,
"loss": 1.38,
"step": 131
},
{
"epoch": 0.4414715719063545,
"grad_norm": 6.0,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.5402,
"step": 132
},
{
"epoch": 0.44481605351170567,
"grad_norm": 6.25,
"learning_rate": 5.680272108843538e-06,
"loss": 1.4661,
"step": 133
},
{
"epoch": 0.44816053511705684,
"grad_norm": 6.65625,
"learning_rate": 5.646258503401361e-06,
"loss": 1.4167,
"step": 134
},
{
"epoch": 0.451505016722408,
"grad_norm": 5.375,
"learning_rate": 5.6122448979591834e-06,
"loss": 1.4334,
"step": 135
},
{
"epoch": 0.45484949832775917,
"grad_norm": 5.71875,
"learning_rate": 5.578231292517007e-06,
"loss": 1.5172,
"step": 136
},
{
"epoch": 0.45819397993311034,
"grad_norm": 5.625,
"learning_rate": 5.54421768707483e-06,
"loss": 1.3814,
"step": 137
},
{
"epoch": 0.46153846153846156,
"grad_norm": 6.03125,
"learning_rate": 5.510204081632653e-06,
"loss": 1.3929,
"step": 138
},
{
"epoch": 0.46488294314381273,
"grad_norm": 6.1875,
"learning_rate": 5.476190476190477e-06,
"loss": 1.4784,
"step": 139
},
{
"epoch": 0.4682274247491639,
"grad_norm": 5.59375,
"learning_rate": 5.442176870748301e-06,
"loss": 1.3256,
"step": 140
},
{
"epoch": 0.4682274247491639,
"eval_loss": 1.4767065048217773,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.5849,
"eval_samples_per_second": 48.648,
"eval_steps_per_second": 24.324,
"step": 140
},
{
"epoch": 0.47157190635451507,
"grad_norm": 5.53125,
"learning_rate": 5.408163265306123e-06,
"loss": 1.4447,
"step": 141
},
{
"epoch": 0.47491638795986624,
"grad_norm": 5.96875,
"learning_rate": 5.374149659863946e-06,
"loss": 1.495,
"step": 142
},
{
"epoch": 0.4782608695652174,
"grad_norm": 6.78125,
"learning_rate": 5.3401360544217695e-06,
"loss": 1.415,
"step": 143
},
{
"epoch": 0.4816053511705686,
"grad_norm": 7.21875,
"learning_rate": 5.306122448979593e-06,
"loss": 1.5169,
"step": 144
},
{
"epoch": 0.48494983277591974,
"grad_norm": 6.34375,
"learning_rate": 5.272108843537416e-06,
"loss": 1.4935,
"step": 145
},
{
"epoch": 0.4882943143812709,
"grad_norm": 5.625,
"learning_rate": 5.2380952380952384e-06,
"loss": 1.3814,
"step": 146
},
{
"epoch": 0.4916387959866221,
"grad_norm": 6.0625,
"learning_rate": 5.204081632653062e-06,
"loss": 1.3691,
"step": 147
},
{
"epoch": 0.49498327759197325,
"grad_norm": 5.6875,
"learning_rate": 5.170068027210885e-06,
"loss": 1.441,
"step": 148
},
{
"epoch": 0.4983277591973244,
"grad_norm": 6.5625,
"learning_rate": 5.136054421768708e-06,
"loss": 1.5852,
"step": 149
},
{
"epoch": 0.5016722408026756,
"grad_norm": 6.15625,
"learning_rate": 5.1020408163265315e-06,
"loss": 1.501,
"step": 150
},
{
"epoch": 0.5016722408026756,
"eval_loss": 1.4691280126571655,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.5494,
"eval_samples_per_second": 48.718,
"eval_steps_per_second": 24.359,
"step": 150
},
{
"epoch": 0.5050167224080268,
"grad_norm": 5.59375,
"learning_rate": 5.068027210884354e-06,
"loss": 1.5432,
"step": 151
},
{
"epoch": 0.5083612040133779,
"grad_norm": 5.625,
"learning_rate": 5.034013605442177e-06,
"loss": 1.3688,
"step": 152
},
{
"epoch": 0.5117056856187291,
"grad_norm": 5.6875,
"learning_rate": 5e-06,
"loss": 1.554,
"step": 153
},
{
"epoch": 0.5150501672240803,
"grad_norm": 5.375,
"learning_rate": 4.965986394557824e-06,
"loss": 1.4896,
"step": 154
},
{
"epoch": 0.5183946488294314,
"grad_norm": 5.875,
"learning_rate": 4.931972789115647e-06,
"loss": 1.6149,
"step": 155
},
{
"epoch": 0.5217391304347826,
"grad_norm": 5.78125,
"learning_rate": 4.897959183673469e-06,
"loss": 1.401,
"step": 156
},
{
"epoch": 0.5250836120401338,
"grad_norm": 6.71875,
"learning_rate": 4.863945578231293e-06,
"loss": 1.4292,
"step": 157
},
{
"epoch": 0.5284280936454849,
"grad_norm": 5.59375,
"learning_rate": 4.829931972789116e-06,
"loss": 1.5169,
"step": 158
},
{
"epoch": 0.5317725752508361,
"grad_norm": 5.9375,
"learning_rate": 4.795918367346939e-06,
"loss": 1.5154,
"step": 159
},
{
"epoch": 0.5351170568561873,
"grad_norm": 5.65625,
"learning_rate": 4.761904761904762e-06,
"loss": 1.378,
"step": 160
},
{
"epoch": 0.5351170568561873,
"eval_loss": 1.4665558338165283,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.6412,
"eval_samples_per_second": 48.537,
"eval_steps_per_second": 24.268,
"step": 160
},
{
"epoch": 0.5384615384615384,
"grad_norm": 5.59375,
"learning_rate": 4.727891156462586e-06,
"loss": 1.4773,
"step": 161
},
{
"epoch": 0.5418060200668896,
"grad_norm": 5.0625,
"learning_rate": 4.693877551020409e-06,
"loss": 1.3758,
"step": 162
},
{
"epoch": 0.5451505016722408,
"grad_norm": 5.25,
"learning_rate": 4.659863945578232e-06,
"loss": 1.4291,
"step": 163
},
{
"epoch": 0.5484949832775919,
"grad_norm": 5.5,
"learning_rate": 4.6258503401360546e-06,
"loss": 1.5342,
"step": 164
},
{
"epoch": 0.5518394648829431,
"grad_norm": 5.75,
"learning_rate": 4.591836734693878e-06,
"loss": 1.3828,
"step": 165
},
{
"epoch": 0.5551839464882943,
"grad_norm": 5.5625,
"learning_rate": 4.557823129251701e-06,
"loss": 1.5141,
"step": 166
},
{
"epoch": 0.5585284280936454,
"grad_norm": 5.625,
"learning_rate": 4.523809523809524e-06,
"loss": 1.4488,
"step": 167
},
{
"epoch": 0.5618729096989966,
"grad_norm": 5.71875,
"learning_rate": 4.489795918367348e-06,
"loss": 1.4004,
"step": 168
},
{
"epoch": 0.5652173913043478,
"grad_norm": 5.9375,
"learning_rate": 4.45578231292517e-06,
"loss": 1.5899,
"step": 169
},
{
"epoch": 0.568561872909699,
"grad_norm": 6.125,
"learning_rate": 4.421768707482993e-06,
"loss": 1.4491,
"step": 170
},
{
"epoch": 0.568561872909699,
"eval_loss": 1.4608893394470215,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.9425,
"eval_samples_per_second": 47.95,
"eval_steps_per_second": 23.975,
"step": 170
},
{
"epoch": 0.5719063545150501,
"grad_norm": 5.65625,
"learning_rate": 4.3877551020408165e-06,
"loss": 1.3881,
"step": 171
},
{
"epoch": 0.5752508361204013,
"grad_norm": 5.6875,
"learning_rate": 4.35374149659864e-06,
"loss": 1.4935,
"step": 172
},
{
"epoch": 0.5785953177257525,
"grad_norm": 5.53125,
"learning_rate": 4.319727891156463e-06,
"loss": 1.5165,
"step": 173
},
{
"epoch": 0.5819397993311036,
"grad_norm": 5.03125,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.4202,
"step": 174
},
{
"epoch": 0.5852842809364549,
"grad_norm": 5.78125,
"learning_rate": 4.251700680272109e-06,
"loss": 1.347,
"step": 175
},
{
"epoch": 0.5886287625418061,
"grad_norm": 5.96875,
"learning_rate": 4.217687074829933e-06,
"loss": 1.579,
"step": 176
},
{
"epoch": 0.5919732441471572,
"grad_norm": 5.59375,
"learning_rate": 4.183673469387755e-06,
"loss": 1.5073,
"step": 177
},
{
"epoch": 0.5953177257525084,
"grad_norm": 5.5625,
"learning_rate": 4.1496598639455785e-06,
"loss": 1.3991,
"step": 178
},
{
"epoch": 0.5986622073578596,
"grad_norm": 5.8125,
"learning_rate": 4.115646258503402e-06,
"loss": 1.3898,
"step": 179
},
{
"epoch": 0.6020066889632107,
"grad_norm": 5.84375,
"learning_rate": 4.081632653061225e-06,
"loss": 1.4873,
"step": 180
},
{
"epoch": 0.6020066889632107,
"eval_loss": 1.4584482908248901,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.9123,
"eval_samples_per_second": 48.008,
"eval_steps_per_second": 24.004,
"step": 180
},
{
"epoch": 0.6053511705685619,
"grad_norm": 5.46875,
"learning_rate": 4.047619047619048e-06,
"loss": 1.332,
"step": 181
},
{
"epoch": 0.6086956521739131,
"grad_norm": 5.8125,
"learning_rate": 4.013605442176871e-06,
"loss": 1.5779,
"step": 182
},
{
"epoch": 0.6120401337792643,
"grad_norm": 5.1875,
"learning_rate": 3.979591836734694e-06,
"loss": 1.405,
"step": 183
},
{
"epoch": 0.6153846153846154,
"grad_norm": 6.65625,
"learning_rate": 3.945578231292517e-06,
"loss": 1.5077,
"step": 184
},
{
"epoch": 0.6187290969899666,
"grad_norm": 6.03125,
"learning_rate": 3.9115646258503405e-06,
"loss": 1.4785,
"step": 185
},
{
"epoch": 0.6220735785953178,
"grad_norm": 5.5625,
"learning_rate": 3.877551020408164e-06,
"loss": 1.4061,
"step": 186
},
{
"epoch": 0.6254180602006689,
"grad_norm": 5.25,
"learning_rate": 3.843537414965986e-06,
"loss": 1.3562,
"step": 187
},
{
"epoch": 0.6287625418060201,
"grad_norm": 5.5625,
"learning_rate": 3.80952380952381e-06,
"loss": 1.4015,
"step": 188
},
{
"epoch": 0.6321070234113713,
"grad_norm": 5.65625,
"learning_rate": 3.7755102040816327e-06,
"loss": 1.5079,
"step": 189
},
{
"epoch": 0.6354515050167224,
"grad_norm": 5.375,
"learning_rate": 3.7414965986394563e-06,
"loss": 1.4518,
"step": 190
},
{
"epoch": 0.6354515050167224,
"eval_loss": 1.4557547569274902,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.4224,
"eval_samples_per_second": 48.971,
"eval_steps_per_second": 24.486,
"step": 190
},
{
"epoch": 0.6387959866220736,
"grad_norm": 6.0625,
"learning_rate": 3.7074829931972796e-06,
"loss": 1.4845,
"step": 191
},
{
"epoch": 0.6421404682274248,
"grad_norm": 5.375,
"learning_rate": 3.6734693877551024e-06,
"loss": 1.4978,
"step": 192
},
{
"epoch": 0.6454849498327759,
"grad_norm": 5.53125,
"learning_rate": 3.6394557823129257e-06,
"loss": 1.2786,
"step": 193
},
{
"epoch": 0.6488294314381271,
"grad_norm": 5.9375,
"learning_rate": 3.6054421768707485e-06,
"loss": 1.3896,
"step": 194
},
{
"epoch": 0.6521739130434783,
"grad_norm": 6.03125,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.5092,
"step": 195
},
{
"epoch": 0.6555183946488294,
"grad_norm": 5.5625,
"learning_rate": 3.537414965986395e-06,
"loss": 1.4844,
"step": 196
},
{
"epoch": 0.6588628762541806,
"grad_norm": 5.71875,
"learning_rate": 3.503401360544218e-06,
"loss": 1.3297,
"step": 197
},
{
"epoch": 0.6622073578595318,
"grad_norm": 5.96875,
"learning_rate": 3.469387755102041e-06,
"loss": 1.3805,
"step": 198
},
{
"epoch": 0.6655518394648829,
"grad_norm": 5.9375,
"learning_rate": 3.435374149659864e-06,
"loss": 1.4935,
"step": 199
},
{
"epoch": 0.6688963210702341,
"grad_norm": 5.84375,
"learning_rate": 3.4013605442176872e-06,
"loss": 1.3566,
"step": 200
},
{
"epoch": 0.6688963210702341,
"eval_loss": 1.454195261001587,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.8656,
"eval_samples_per_second": 48.099,
"eval_steps_per_second": 24.049,
"step": 200
},
{
"epoch": 0.6722408026755853,
"grad_norm": 5.96875,
"learning_rate": 3.3673469387755105e-06,
"loss": 1.4493,
"step": 201
},
{
"epoch": 0.6755852842809364,
"grad_norm": 6.65625,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.6351,
"step": 202
},
{
"epoch": 0.6789297658862876,
"grad_norm": 6.40625,
"learning_rate": 3.2993197278911566e-06,
"loss": 1.4902,
"step": 203
},
{
"epoch": 0.6822742474916388,
"grad_norm": 5.6875,
"learning_rate": 3.2653061224489794e-06,
"loss": 1.5224,
"step": 204
},
{
"epoch": 0.68561872909699,
"grad_norm": 6.4375,
"learning_rate": 3.231292517006803e-06,
"loss": 1.4376,
"step": 205
},
{
"epoch": 0.6889632107023411,
"grad_norm": 5.34375,
"learning_rate": 3.1972789115646264e-06,
"loss": 1.3701,
"step": 206
},
{
"epoch": 0.6923076923076923,
"grad_norm": 6.3125,
"learning_rate": 3.1632653061224496e-06,
"loss": 1.5269,
"step": 207
},
{
"epoch": 0.6956521739130435,
"grad_norm": 5.90625,
"learning_rate": 3.1292517006802725e-06,
"loss": 1.3714,
"step": 208
},
{
"epoch": 0.6989966555183946,
"grad_norm": 5.46875,
"learning_rate": 3.0952380952380957e-06,
"loss": 1.3528,
"step": 209
},
{
"epoch": 0.7023411371237458,
"grad_norm": 5.375,
"learning_rate": 3.0612244897959185e-06,
"loss": 1.3975,
"step": 210
},
{
"epoch": 0.7023411371237458,
"eval_loss": 1.4497511386871338,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.0547,
"eval_samples_per_second": 47.736,
"eval_steps_per_second": 23.868,
"step": 210
},
{
"epoch": 0.705685618729097,
"grad_norm": 6.21875,
"learning_rate": 3.027210884353742e-06,
"loss": 1.4095,
"step": 211
},
{
"epoch": 0.7090301003344481,
"grad_norm": 5.4375,
"learning_rate": 2.993197278911565e-06,
"loss": 1.4812,
"step": 212
},
{
"epoch": 0.7123745819397993,
"grad_norm": 5.46875,
"learning_rate": 2.959183673469388e-06,
"loss": 1.4957,
"step": 213
},
{
"epoch": 0.7157190635451505,
"grad_norm": 5.5625,
"learning_rate": 2.925170068027211e-06,
"loss": 1.4469,
"step": 214
},
{
"epoch": 0.7190635451505016,
"grad_norm": 6.09375,
"learning_rate": 2.891156462585034e-06,
"loss": 1.5594,
"step": 215
},
{
"epoch": 0.7224080267558528,
"grad_norm": 6.09375,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.5192,
"step": 216
},
{
"epoch": 0.725752508361204,
"grad_norm": 5.5,
"learning_rate": 2.8231292517006805e-06,
"loss": 1.5233,
"step": 217
},
{
"epoch": 0.7290969899665551,
"grad_norm": 5.84375,
"learning_rate": 2.7891156462585034e-06,
"loss": 1.3785,
"step": 218
},
{
"epoch": 0.7324414715719063,
"grad_norm": 5.75,
"learning_rate": 2.7551020408163266e-06,
"loss": 1.5832,
"step": 219
},
{
"epoch": 0.7357859531772575,
"grad_norm": 5.4375,
"learning_rate": 2.7210884353741503e-06,
"loss": 1.4804,
"step": 220
},
{
"epoch": 0.7357859531772575,
"eval_loss": 1.449277639389038,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.0109,
"eval_samples_per_second": 47.819,
"eval_steps_per_second": 23.91,
"step": 220
},
{
"epoch": 0.7391304347826086,
"grad_norm": 5.59375,
"learning_rate": 2.687074829931973e-06,
"loss": 1.4036,
"step": 221
},
{
"epoch": 0.7424749163879598,
"grad_norm": 5.15625,
"learning_rate": 2.6530612244897964e-06,
"loss": 1.3211,
"step": 222
},
{
"epoch": 0.745819397993311,
"grad_norm": 5.125,
"learning_rate": 2.6190476190476192e-06,
"loss": 1.3913,
"step": 223
},
{
"epoch": 0.7491638795986622,
"grad_norm": 5.875,
"learning_rate": 2.5850340136054425e-06,
"loss": 1.604,
"step": 224
},
{
"epoch": 0.7525083612040134,
"grad_norm": 5.46875,
"learning_rate": 2.5510204081632657e-06,
"loss": 1.4159,
"step": 225
},
{
"epoch": 0.7558528428093646,
"grad_norm": 5.375,
"learning_rate": 2.5170068027210886e-06,
"loss": 1.4109,
"step": 226
},
{
"epoch": 0.7591973244147158,
"grad_norm": 4.96875,
"learning_rate": 2.482993197278912e-06,
"loss": 1.347,
"step": 227
},
{
"epoch": 0.7625418060200669,
"grad_norm": 6.15625,
"learning_rate": 2.4489795918367347e-06,
"loss": 1.3658,
"step": 228
},
{
"epoch": 0.7658862876254181,
"grad_norm": 5.625,
"learning_rate": 2.414965986394558e-06,
"loss": 1.4617,
"step": 229
},
{
"epoch": 0.7692307692307693,
"grad_norm": 4.875,
"learning_rate": 2.380952380952381e-06,
"loss": 1.3388,
"step": 230
},
{
"epoch": 0.7692307692307693,
"eval_loss": 1.446601152420044,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.9763,
"eval_samples_per_second": 47.885,
"eval_steps_per_second": 23.943,
"step": 230
},
{
"epoch": 0.7725752508361204,
"grad_norm": 5.28125,
"learning_rate": 2.3469387755102044e-06,
"loss": 1.3752,
"step": 231
},
{
"epoch": 0.7759197324414716,
"grad_norm": 5.84375,
"learning_rate": 2.3129251700680273e-06,
"loss": 1.4201,
"step": 232
},
{
"epoch": 0.7792642140468228,
"grad_norm": 5.8125,
"learning_rate": 2.2789115646258505e-06,
"loss": 1.4162,
"step": 233
},
{
"epoch": 0.782608695652174,
"grad_norm": 5.4375,
"learning_rate": 2.244897959183674e-06,
"loss": 1.486,
"step": 234
},
{
"epoch": 0.7859531772575251,
"grad_norm": 5.5625,
"learning_rate": 2.2108843537414966e-06,
"loss": 1.456,
"step": 235
},
{
"epoch": 0.7892976588628763,
"grad_norm": 5.46875,
"learning_rate": 2.17687074829932e-06,
"loss": 1.4028,
"step": 236
},
{
"epoch": 0.7926421404682275,
"grad_norm": 6.125,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.4319,
"step": 237
},
{
"epoch": 0.7959866220735786,
"grad_norm": 5.9375,
"learning_rate": 2.1088435374149664e-06,
"loss": 1.3655,
"step": 238
},
{
"epoch": 0.7993311036789298,
"grad_norm": 6.15625,
"learning_rate": 2.0748299319727892e-06,
"loss": 1.4736,
"step": 239
},
{
"epoch": 0.802675585284281,
"grad_norm": 5.8125,
"learning_rate": 2.0408163265306125e-06,
"loss": 1.5061,
"step": 240
},
{
"epoch": 0.802675585284281,
"eval_loss": 1.4440027475357056,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.5962,
"eval_samples_per_second": 46.726,
"eval_steps_per_second": 23.363,
"step": 240
},
{
"epoch": 0.8060200668896321,
"grad_norm": 5.90625,
"learning_rate": 2.0068027210884353e-06,
"loss": 1.4407,
"step": 241
},
{
"epoch": 0.8093645484949833,
"grad_norm": 5.625,
"learning_rate": 1.9727891156462586e-06,
"loss": 1.4002,
"step": 242
},
{
"epoch": 0.8127090301003345,
"grad_norm": 5.1875,
"learning_rate": 1.938775510204082e-06,
"loss": 1.4503,
"step": 243
},
{
"epoch": 0.8160535117056856,
"grad_norm": 5.625,
"learning_rate": 1.904761904761905e-06,
"loss": 1.5007,
"step": 244
},
{
"epoch": 0.8193979933110368,
"grad_norm": 5.78125,
"learning_rate": 1.8707482993197282e-06,
"loss": 1.6362,
"step": 245
},
{
"epoch": 0.822742474916388,
"grad_norm": 5.375,
"learning_rate": 1.8367346938775512e-06,
"loss": 1.4041,
"step": 246
},
{
"epoch": 0.8260869565217391,
"grad_norm": 5.875,
"learning_rate": 1.8027210884353743e-06,
"loss": 1.4981,
"step": 247
},
{
"epoch": 0.8294314381270903,
"grad_norm": 6.0,
"learning_rate": 1.7687074829931975e-06,
"loss": 1.3748,
"step": 248
},
{
"epoch": 0.8327759197324415,
"grad_norm": 6.0,
"learning_rate": 1.7346938775510206e-06,
"loss": 1.5447,
"step": 249
},
{
"epoch": 0.8361204013377926,
"grad_norm": 5.6875,
"learning_rate": 1.7006802721088436e-06,
"loss": 1.4622,
"step": 250
},
{
"epoch": 0.8361204013377926,
"eval_loss": 1.4428349733352661,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.5345,
"eval_samples_per_second": 46.839,
"eval_steps_per_second": 23.419,
"step": 250
},
{
"epoch": 0.8394648829431438,
"grad_norm": 5.59375,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.6372,
"step": 251
},
{
"epoch": 0.842809364548495,
"grad_norm": 5.84375,
"learning_rate": 1.6326530612244897e-06,
"loss": 1.5288,
"step": 252
},
{
"epoch": 0.8461538461538461,
"grad_norm": 5.78125,
"learning_rate": 1.5986394557823132e-06,
"loss": 1.5163,
"step": 253
},
{
"epoch": 0.8494983277591973,
"grad_norm": 5.71875,
"learning_rate": 1.5646258503401362e-06,
"loss": 1.4535,
"step": 254
},
{
"epoch": 0.8528428093645485,
"grad_norm": 5.5625,
"learning_rate": 1.5306122448979593e-06,
"loss": 1.3558,
"step": 255
},
{
"epoch": 0.8561872909698997,
"grad_norm": 5.78125,
"learning_rate": 1.4965986394557825e-06,
"loss": 1.489,
"step": 256
},
{
"epoch": 0.8595317725752508,
"grad_norm": 5.875,
"learning_rate": 1.4625850340136056e-06,
"loss": 1.4066,
"step": 257
},
{
"epoch": 0.862876254180602,
"grad_norm": 5.3125,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.4846,
"step": 258
},
{
"epoch": 0.8662207357859532,
"grad_norm": 5.40625,
"learning_rate": 1.3945578231292517e-06,
"loss": 1.4276,
"step": 259
},
{
"epoch": 0.8695652173913043,
"grad_norm": 5.65625,
"learning_rate": 1.3605442176870751e-06,
"loss": 1.409,
"step": 260
},
{
"epoch": 0.8695652173913043,
"eval_loss": 1.4424058198928833,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.3885,
"eval_samples_per_second": 47.108,
"eval_steps_per_second": 23.554,
"step": 260
},
{
"epoch": 0.8729096989966555,
"grad_norm": 5.375,
"learning_rate": 1.3265306122448982e-06,
"loss": 1.4611,
"step": 261
},
{
"epoch": 0.8762541806020067,
"grad_norm": 5.84375,
"learning_rate": 1.2925170068027212e-06,
"loss": 1.5491,
"step": 262
},
{
"epoch": 0.8795986622073578,
"grad_norm": 5.40625,
"learning_rate": 1.2585034013605443e-06,
"loss": 1.3317,
"step": 263
},
{
"epoch": 0.882943143812709,
"grad_norm": 5.71875,
"learning_rate": 1.2244897959183673e-06,
"loss": 1.5078,
"step": 264
},
{
"epoch": 0.8862876254180602,
"grad_norm": 5.78125,
"learning_rate": 1.1904761904761906e-06,
"loss": 1.3606,
"step": 265
},
{
"epoch": 0.8896321070234113,
"grad_norm": 5.65625,
"learning_rate": 1.1564625850340136e-06,
"loss": 1.3629,
"step": 266
},
{
"epoch": 0.8929765886287625,
"grad_norm": 5.6875,
"learning_rate": 1.122448979591837e-06,
"loss": 1.5987,
"step": 267
},
{
"epoch": 0.8963210702341137,
"grad_norm": 5.21875,
"learning_rate": 1.08843537414966e-06,
"loss": 1.474,
"step": 268
},
{
"epoch": 0.8996655518394648,
"grad_norm": 5.875,
"learning_rate": 1.0544217687074832e-06,
"loss": 1.4959,
"step": 269
},
{
"epoch": 0.903010033444816,
"grad_norm": 5.34375,
"learning_rate": 1.0204081632653063e-06,
"loss": 1.4856,
"step": 270
},
{
"epoch": 0.903010033444816,
"eval_loss": 1.440917730331421,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 25.1531,
"eval_samples_per_second": 47.549,
"eval_steps_per_second": 23.774,
"step": 270
},
{
"epoch": 0.9063545150501672,
"grad_norm": 5.375,
"learning_rate": 9.863945578231293e-07,
"loss": 1.3753,
"step": 271
},
{
"epoch": 0.9096989966555183,
"grad_norm": 5.4375,
"learning_rate": 9.523809523809525e-07,
"loss": 1.4056,
"step": 272
},
{
"epoch": 0.9130434782608695,
"grad_norm": 5.46875,
"learning_rate": 9.183673469387756e-07,
"loss": 1.4596,
"step": 273
},
{
"epoch": 0.9163879598662207,
"grad_norm": 5.03125,
"learning_rate": 8.843537414965988e-07,
"loss": 1.3337,
"step": 274
},
{
"epoch": 0.919732441471572,
"grad_norm": 5.28125,
"learning_rate": 8.503401360544218e-07,
"loss": 1.4107,
"step": 275
},
{
"epoch": 0.9230769230769231,
"grad_norm": 5.21875,
"learning_rate": 8.163265306122449e-07,
"loss": 1.4098,
"step": 276
},
{
"epoch": 0.9264214046822743,
"grad_norm": 5.4375,
"learning_rate": 7.823129251700681e-07,
"loss": 1.4803,
"step": 277
},
{
"epoch": 0.9297658862876255,
"grad_norm": 5.3125,
"learning_rate": 7.482993197278913e-07,
"loss": 1.4627,
"step": 278
},
{
"epoch": 0.9331103678929766,
"grad_norm": 5.21875,
"learning_rate": 7.142857142857143e-07,
"loss": 1.4215,
"step": 279
},
{
"epoch": 0.9364548494983278,
"grad_norm": 6.03125,
"learning_rate": 6.802721088435376e-07,
"loss": 1.3524,
"step": 280
},
{
"epoch": 0.9364548494983278,
"eval_loss": 1.439655065536499,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 26.5568,
"eval_samples_per_second": 45.036,
"eval_steps_per_second": 22.518,
"step": 280
},
{
"epoch": 0.939799331103679,
"grad_norm": 5.625,
"learning_rate": 6.462585034013606e-07,
"loss": 1.5138,
"step": 281
},
{
"epoch": 0.9431438127090301,
"grad_norm": 5.625,
"learning_rate": 6.122448979591837e-07,
"loss": 1.4535,
"step": 282
},
{
"epoch": 0.9464882943143813,
"grad_norm": 5.53125,
"learning_rate": 5.782312925170068e-07,
"loss": 1.6195,
"step": 283
},
{
"epoch": 0.9498327759197325,
"grad_norm": 5.34375,
"learning_rate": 5.4421768707483e-07,
"loss": 1.4418,
"step": 284
},
{
"epoch": 0.9531772575250836,
"grad_norm": 5.25,
"learning_rate": 5.102040816326531e-07,
"loss": 1.3069,
"step": 285
},
{
"epoch": 0.9565217391304348,
"grad_norm": 5.4375,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.2933,
"step": 286
},
{
"epoch": 0.959866220735786,
"grad_norm": 5.53125,
"learning_rate": 4.421768707482994e-07,
"loss": 1.4555,
"step": 287
},
{
"epoch": 0.9632107023411371,
"grad_norm": 5.25,
"learning_rate": 4.0816326530612243e-07,
"loss": 1.3118,
"step": 288
},
{
"epoch": 0.9665551839464883,
"grad_norm": 5.8125,
"learning_rate": 3.7414965986394563e-07,
"loss": 1.4598,
"step": 289
},
{
"epoch": 0.9698996655518395,
"grad_norm": 5.4375,
"learning_rate": 3.401360544217688e-07,
"loss": 1.4002,
"step": 290
},
{
"epoch": 0.9698996655518395,
"eval_loss": 1.4394288063049316,
"eval_model_preparation_time": 0.0182,
"eval_runtime": 24.6446,
"eval_samples_per_second": 48.53,
"eval_steps_per_second": 24.265,
"step": 290
},
{
"epoch": 0.9732441471571907,
"grad_norm": 5.0625,
"learning_rate": 3.0612244897959183e-07,
"loss": 1.3755,
"step": 291
},
{
"epoch": 0.9765886287625418,
"grad_norm": 5.28125,
"learning_rate": 2.72108843537415e-07,
"loss": 1.4194,
"step": 292
},
{
"epoch": 0.979933110367893,
"grad_norm": 6.0625,
"learning_rate": 2.3809523809523811e-07,
"loss": 1.4592,
"step": 293
},
{
"epoch": 0.9832775919732442,
"grad_norm": 5.46875,
"learning_rate": 2.0408163265306121e-07,
"loss": 1.5226,
"step": 294
},
{
"epoch": 0.9866220735785953,
"grad_norm": 5.46875,
"learning_rate": 1.700680272108844e-07,
"loss": 1.3764,
"step": 295
},
{
"epoch": 0.9899665551839465,
"grad_norm": 5.46875,
"learning_rate": 1.360544217687075e-07,
"loss": 1.3846,
"step": 296
},
{
"epoch": 0.9933110367892977,
"grad_norm": 5.375,
"learning_rate": 1.0204081632653061e-07,
"loss": 1.4552,
"step": 297
},
{
"epoch": 0.9966555183946488,
"grad_norm": 5.75,
"learning_rate": 6.802721088435375e-08,
"loss": 1.3043,
"step": 298
},
{
"epoch": 1.0,
"grad_norm": 5.96875,
"learning_rate": 3.4013605442176873e-08,
"loss": 1.4929,
"step": 299
}
],
"logging_steps": 1,
"max_steps": 299,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3124411636793344e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}