Files
Qwen3-4B-Base-ftjob-25058cd…/checkpoint-168/trainer_state.json
ModelHub XC 2d9a095b3c Initial project commit; model provided by the ModelHub XC community
Model: vohonen/Qwen3-4B-Base-ftjob-25058cdbbe3e-merged
Source: Original Platform
2026-04-25 12:23:08 +08:00

1229 lines
28 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018140589569160998,
"grad_norm": 0.95703125,
"learning_rate": 0.0,
"loss": 2.6996,
"step": 1
},
{
"epoch": 0.036281179138321996,
"grad_norm": 0.9921875,
"learning_rate": 4.000000000000001e-06,
"loss": 2.66,
"step": 2
},
{
"epoch": 0.05442176870748299,
"grad_norm": 1.0703125,
"learning_rate": 8.000000000000001e-06,
"loss": 2.6808,
"step": 3
},
{
"epoch": 0.07256235827664399,
"grad_norm": 1.0078125,
"learning_rate": 1.2e-05,
"loss": 2.6952,
"step": 4
},
{
"epoch": 0.09070294784580499,
"grad_norm": 0.94140625,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.6385,
"step": 5
},
{
"epoch": 0.10884353741496598,
"grad_norm": 0.9765625,
"learning_rate": 2e-05,
"loss": 2.6775,
"step": 6
},
{
"epoch": 0.12698412698412698,
"grad_norm": 0.8828125,
"learning_rate": 1.9877300613496935e-05,
"loss": 2.6544,
"step": 7
},
{
"epoch": 0.14512471655328799,
"grad_norm": 0.87109375,
"learning_rate": 1.9754601226993868e-05,
"loss": 2.6104,
"step": 8
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.84375,
"learning_rate": 1.96319018404908e-05,
"loss": 2.5816,
"step": 9
},
{
"epoch": 0.18140589569160998,
"grad_norm": 0.921875,
"learning_rate": 1.9509202453987733e-05,
"loss": 2.6088,
"step": 10
},
{
"epoch": 0.19954648526077098,
"grad_norm": 0.85546875,
"learning_rate": 1.9386503067484663e-05,
"loss": 2.5572,
"step": 11
},
{
"epoch": 0.21768707482993196,
"grad_norm": 0.8046875,
"learning_rate": 1.9263803680981596e-05,
"loss": 2.5599,
"step": 12
},
{
"epoch": 0.23582766439909297,
"grad_norm": 0.8125,
"learning_rate": 1.914110429447853e-05,
"loss": 2.5125,
"step": 13
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.78515625,
"learning_rate": 1.9018404907975462e-05,
"loss": 2.551,
"step": 14
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.73046875,
"learning_rate": 1.8895705521472395e-05,
"loss": 2.5148,
"step": 15
},
{
"epoch": 0.29024943310657597,
"grad_norm": 0.74609375,
"learning_rate": 1.8773006134969328e-05,
"loss": 2.4706,
"step": 16
},
{
"epoch": 0.30839002267573695,
"grad_norm": 0.76171875,
"learning_rate": 1.8650306748466257e-05,
"loss": 2.435,
"step": 17
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.7109375,
"learning_rate": 1.852760736196319e-05,
"loss": 2.4895,
"step": 18
},
{
"epoch": 0.34467120181405897,
"grad_norm": 0.6953125,
"learning_rate": 1.8404907975460123e-05,
"loss": 2.4438,
"step": 19
},
{
"epoch": 0.36281179138321995,
"grad_norm": 0.703125,
"learning_rate": 1.828220858895706e-05,
"loss": 2.4875,
"step": 20
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.67578125,
"learning_rate": 1.815950920245399e-05,
"loss": 2.4617,
"step": 21
},
{
"epoch": 0.39909297052154197,
"grad_norm": 0.734375,
"learning_rate": 1.8036809815950922e-05,
"loss": 2.3989,
"step": 22
},
{
"epoch": 0.41723356009070295,
"grad_norm": 0.703125,
"learning_rate": 1.7914110429447855e-05,
"loss": 2.3609,
"step": 23
},
{
"epoch": 0.43537414965986393,
"grad_norm": 0.671875,
"learning_rate": 1.7791411042944788e-05,
"loss": 2.3001,
"step": 24
},
{
"epoch": 0.45351473922902497,
"grad_norm": 0.66796875,
"learning_rate": 1.766871165644172e-05,
"loss": 2.3645,
"step": 25
},
{
"epoch": 0.47165532879818595,
"grad_norm": 0.671875,
"learning_rate": 1.7546012269938654e-05,
"loss": 2.3013,
"step": 26
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.69921875,
"learning_rate": 1.7423312883435583e-05,
"loss": 2.4127,
"step": 27
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.6640625,
"learning_rate": 1.7300613496932516e-05,
"loss": 2.3384,
"step": 28
},
{
"epoch": 0.5260770975056689,
"grad_norm": 0.66015625,
"learning_rate": 1.717791411042945e-05,
"loss": 2.2717,
"step": 29
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.6953125,
"learning_rate": 1.7055214723926382e-05,
"loss": 2.3407,
"step": 30
},
{
"epoch": 0.562358276643991,
"grad_norm": 0.640625,
"learning_rate": 1.6932515337423315e-05,
"loss": 2.2607,
"step": 31
},
{
"epoch": 0.5804988662131519,
"grad_norm": 0.671875,
"learning_rate": 1.6809815950920248e-05,
"loss": 2.2913,
"step": 32
},
{
"epoch": 0.5986394557823129,
"grad_norm": 0.69921875,
"learning_rate": 1.6687116564417178e-05,
"loss": 2.3287,
"step": 33
},
{
"epoch": 0.6167800453514739,
"grad_norm": 0.63671875,
"learning_rate": 1.656441717791411e-05,
"loss": 2.2778,
"step": 34
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.6484375,
"learning_rate": 1.6441717791411043e-05,
"loss": 2.2741,
"step": 35
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.63671875,
"learning_rate": 1.6319018404907976e-05,
"loss": 2.2311,
"step": 36
},
{
"epoch": 0.671201814058957,
"grad_norm": 0.6796875,
"learning_rate": 1.619631901840491e-05,
"loss": 2.2988,
"step": 37
},
{
"epoch": 0.6893424036281179,
"grad_norm": 0.6328125,
"learning_rate": 1.6073619631901842e-05,
"loss": 2.2643,
"step": 38
},
{
"epoch": 0.7074829931972789,
"grad_norm": 0.6796875,
"learning_rate": 1.5950920245398772e-05,
"loss": 2.2546,
"step": 39
},
{
"epoch": 0.7256235827664399,
"grad_norm": 0.671875,
"learning_rate": 1.5828220858895708e-05,
"loss": 2.2735,
"step": 40
},
{
"epoch": 0.7437641723356009,
"grad_norm": 0.578125,
"learning_rate": 1.570552147239264e-05,
"loss": 2.2546,
"step": 41
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.6484375,
"learning_rate": 1.5582822085889574e-05,
"loss": 2.2388,
"step": 42
},
{
"epoch": 0.780045351473923,
"grad_norm": 0.68359375,
"learning_rate": 1.5460122699386504e-05,
"loss": 2.2796,
"step": 43
},
{
"epoch": 0.7981859410430839,
"grad_norm": 0.62109375,
"learning_rate": 1.5337423312883436e-05,
"loss": 2.2356,
"step": 44
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.6171875,
"learning_rate": 1.5214723926380371e-05,
"loss": 2.2562,
"step": 45
},
{
"epoch": 0.8344671201814059,
"grad_norm": 0.66015625,
"learning_rate": 1.50920245398773e-05,
"loss": 2.2189,
"step": 46
},
{
"epoch": 0.8526077097505669,
"grad_norm": 0.64453125,
"learning_rate": 1.4969325153374235e-05,
"loss": 2.2293,
"step": 47
},
{
"epoch": 0.8707482993197279,
"grad_norm": 0.6328125,
"learning_rate": 1.4846625766871168e-05,
"loss": 2.1792,
"step": 48
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.6484375,
"learning_rate": 1.47239263803681e-05,
"loss": 2.2333,
"step": 49
},
{
"epoch": 0.9070294784580499,
"grad_norm": 0.58984375,
"learning_rate": 1.4601226993865032e-05,
"loss": 2.2005,
"step": 50
},
{
"epoch": 0.9251700680272109,
"grad_norm": 0.70703125,
"learning_rate": 1.4478527607361965e-05,
"loss": 2.2617,
"step": 51
},
{
"epoch": 0.9433106575963719,
"grad_norm": 0.64453125,
"learning_rate": 1.4355828220858897e-05,
"loss": 2.2125,
"step": 52
},
{
"epoch": 0.9614512471655329,
"grad_norm": 0.734375,
"learning_rate": 1.423312883435583e-05,
"loss": 2.237,
"step": 53
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.703125,
"learning_rate": 1.4110429447852763e-05,
"loss": 2.1756,
"step": 54
},
{
"epoch": 0.9977324263038548,
"grad_norm": 0.62890625,
"learning_rate": 1.3987730061349694e-05,
"loss": 2.2037,
"step": 55
},
{
"epoch": 1.0,
"grad_norm": 1.9765625,
"learning_rate": 1.3865030674846627e-05,
"loss": 2.1214,
"step": 56
},
{
"epoch": 1.0,
"eval_loss": 2.2088165283203125,
"eval_model_preparation_time": 0.0224,
"eval_runtime": 2.7857,
"eval_samples_per_second": 35.179,
"eval_steps_per_second": 17.59,
"step": 56
},
{
"epoch": 1.018140589569161,
"grad_norm": 0.58984375,
"learning_rate": 1.374233128834356e-05,
"loss": 2.1552,
"step": 57
},
{
"epoch": 1.036281179138322,
"grad_norm": 0.62890625,
"learning_rate": 1.3619631901840491e-05,
"loss": 2.1247,
"step": 58
},
{
"epoch": 1.054421768707483,
"grad_norm": 0.6171875,
"learning_rate": 1.3496932515337424e-05,
"loss": 2.2268,
"step": 59
},
{
"epoch": 1.072562358276644,
"grad_norm": 0.65234375,
"learning_rate": 1.3374233128834357e-05,
"loss": 2.1801,
"step": 60
},
{
"epoch": 1.090702947845805,
"grad_norm": 0.65625,
"learning_rate": 1.3251533742331288e-05,
"loss": 2.1991,
"step": 61
},
{
"epoch": 1.1088435374149659,
"grad_norm": 0.70703125,
"learning_rate": 1.3128834355828221e-05,
"loss": 2.1206,
"step": 62
},
{
"epoch": 1.126984126984127,
"grad_norm": 0.73828125,
"learning_rate": 1.3006134969325156e-05,
"loss": 2.1545,
"step": 63
},
{
"epoch": 1.145124716553288,
"grad_norm": 0.65234375,
"learning_rate": 1.2883435582822085e-05,
"loss": 2.1574,
"step": 64
},
{
"epoch": 1.163265306122449,
"grad_norm": 0.62890625,
"learning_rate": 1.276073619631902e-05,
"loss": 2.1384,
"step": 65
},
{
"epoch": 1.18140589569161,
"grad_norm": 0.66015625,
"learning_rate": 1.2638036809815953e-05,
"loss": 2.1563,
"step": 66
},
{
"epoch": 1.199546485260771,
"grad_norm": 0.69921875,
"learning_rate": 1.2515337423312886e-05,
"loss": 2.1593,
"step": 67
},
{
"epoch": 1.217687074829932,
"grad_norm": 0.76953125,
"learning_rate": 1.2392638036809817e-05,
"loss": 2.1628,
"step": 68
},
{
"epoch": 1.235827664399093,
"grad_norm": 0.66796875,
"learning_rate": 1.226993865030675e-05,
"loss": 2.1392,
"step": 69
},
{
"epoch": 1.253968253968254,
"grad_norm": 0.7578125,
"learning_rate": 1.2147239263803683e-05,
"loss": 2.2247,
"step": 70
},
{
"epoch": 1.272108843537415,
"grad_norm": 0.71484375,
"learning_rate": 1.2024539877300614e-05,
"loss": 2.1673,
"step": 71
},
{
"epoch": 1.290249433106576,
"grad_norm": 0.61328125,
"learning_rate": 1.1901840490797547e-05,
"loss": 2.1676,
"step": 72
},
{
"epoch": 1.308390022675737,
"grad_norm": 0.69140625,
"learning_rate": 1.177914110429448e-05,
"loss": 2.0911,
"step": 73
},
{
"epoch": 1.3265306122448979,
"grad_norm": 0.69140625,
"learning_rate": 1.1656441717791411e-05,
"loss": 2.1493,
"step": 74
},
{
"epoch": 1.344671201814059,
"grad_norm": 0.6875,
"learning_rate": 1.1533742331288344e-05,
"loss": 2.1459,
"step": 75
},
{
"epoch": 1.36281179138322,
"grad_norm": 0.7421875,
"learning_rate": 1.1411042944785277e-05,
"loss": 2.0973,
"step": 76
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.69921875,
"learning_rate": 1.1288343558282208e-05,
"loss": 2.0893,
"step": 77
},
{
"epoch": 1.399092970521542,
"grad_norm": 0.73046875,
"learning_rate": 1.1165644171779141e-05,
"loss": 2.1779,
"step": 78
},
{
"epoch": 1.417233560090703,
"grad_norm": 0.6796875,
"learning_rate": 1.1042944785276076e-05,
"loss": 2.0661,
"step": 79
},
{
"epoch": 1.435374149659864,
"grad_norm": 0.6796875,
"learning_rate": 1.0920245398773005e-05,
"loss": 2.1201,
"step": 80
},
{
"epoch": 1.4535147392290249,
"grad_norm": 0.65234375,
"learning_rate": 1.079754601226994e-05,
"loss": 2.0765,
"step": 81
},
{
"epoch": 1.471655328798186,
"grad_norm": 0.66015625,
"learning_rate": 1.0674846625766873e-05,
"loss": 2.091,
"step": 82
},
{
"epoch": 1.489795918367347,
"grad_norm": 0.6796875,
"learning_rate": 1.0552147239263804e-05,
"loss": 2.1094,
"step": 83
},
{
"epoch": 1.507936507936508,
"grad_norm": 0.7109375,
"learning_rate": 1.0429447852760737e-05,
"loss": 2.2231,
"step": 84
},
{
"epoch": 1.5260770975056688,
"grad_norm": 0.66015625,
"learning_rate": 1.030674846625767e-05,
"loss": 2.1197,
"step": 85
},
{
"epoch": 1.54421768707483,
"grad_norm": 0.7421875,
"learning_rate": 1.0184049079754601e-05,
"loss": 2.1248,
"step": 86
},
{
"epoch": 1.562358276643991,
"grad_norm": 0.671875,
"learning_rate": 1.0061349693251534e-05,
"loss": 2.157,
"step": 87
},
{
"epoch": 1.5804988662131518,
"grad_norm": 0.67578125,
"learning_rate": 9.938650306748467e-06,
"loss": 2.1562,
"step": 88
},
{
"epoch": 1.598639455782313,
"grad_norm": 0.6796875,
"learning_rate": 9.8159509202454e-06,
"loss": 2.095,
"step": 89
},
{
"epoch": 1.616780045351474,
"grad_norm": 0.72265625,
"learning_rate": 9.693251533742331e-06,
"loss": 2.1363,
"step": 90
},
{
"epoch": 1.6349206349206349,
"grad_norm": 0.6328125,
"learning_rate": 9.570552147239264e-06,
"loss": 2.0856,
"step": 91
},
{
"epoch": 1.6530612244897958,
"grad_norm": 0.6796875,
"learning_rate": 9.447852760736197e-06,
"loss": 2.1142,
"step": 92
},
{
"epoch": 1.671201814058957,
"grad_norm": 0.72265625,
"learning_rate": 9.325153374233129e-06,
"loss": 2.1567,
"step": 93
},
{
"epoch": 1.689342403628118,
"grad_norm": 0.7890625,
"learning_rate": 9.202453987730062e-06,
"loss": 2.1214,
"step": 94
},
{
"epoch": 1.7074829931972788,
"grad_norm": 0.703125,
"learning_rate": 9.079754601226994e-06,
"loss": 2.1152,
"step": 95
},
{
"epoch": 1.72562358276644,
"grad_norm": 0.69921875,
"learning_rate": 8.957055214723927e-06,
"loss": 2.0999,
"step": 96
},
{
"epoch": 1.743764172335601,
"grad_norm": 0.6484375,
"learning_rate": 8.83435582822086e-06,
"loss": 2.1361,
"step": 97
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.75390625,
"learning_rate": 8.711656441717792e-06,
"loss": 2.1357,
"step": 98
},
{
"epoch": 1.780045351473923,
"grad_norm": 0.7421875,
"learning_rate": 8.588957055214725e-06,
"loss": 2.1405,
"step": 99
},
{
"epoch": 1.798185941043084,
"grad_norm": 0.73046875,
"learning_rate": 8.466257668711658e-06,
"loss": 2.0975,
"step": 100
},
{
"epoch": 1.816326530612245,
"grad_norm": 0.68359375,
"learning_rate": 8.343558282208589e-06,
"loss": 2.1457,
"step": 101
},
{
"epoch": 1.8344671201814058,
"grad_norm": 0.703125,
"learning_rate": 8.220858895705522e-06,
"loss": 2.068,
"step": 102
},
{
"epoch": 1.8526077097505669,
"grad_norm": 0.703125,
"learning_rate": 8.098159509202455e-06,
"loss": 2.1473,
"step": 103
},
{
"epoch": 1.870748299319728,
"grad_norm": 0.66015625,
"learning_rate": 7.975460122699386e-06,
"loss": 2.0983,
"step": 104
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.7265625,
"learning_rate": 7.85276073619632e-06,
"loss": 2.0952,
"step": 105
},
{
"epoch": 1.90702947845805,
"grad_norm": 0.7578125,
"learning_rate": 7.730061349693252e-06,
"loss": 2.086,
"step": 106
},
{
"epoch": 1.925170068027211,
"grad_norm": 0.69140625,
"learning_rate": 7.6073619631901856e-06,
"loss": 2.1086,
"step": 107
},
{
"epoch": 1.943310657596372,
"grad_norm": 0.671875,
"learning_rate": 7.484662576687118e-06,
"loss": 2.092,
"step": 108
},
{
"epoch": 1.9614512471655328,
"grad_norm": 0.77734375,
"learning_rate": 7.36196319018405e-06,
"loss": 2.1334,
"step": 109
},
{
"epoch": 1.9795918367346939,
"grad_norm": 0.77734375,
"learning_rate": 7.239263803680983e-06,
"loss": 2.0644,
"step": 110
},
{
"epoch": 1.997732426303855,
"grad_norm": 0.703125,
"learning_rate": 7.116564417177915e-06,
"loss": 2.0589,
"step": 111
},
{
"epoch": 2.0,
"grad_norm": 1.6953125,
"learning_rate": 6.993865030674847e-06,
"loss": 2.0806,
"step": 112
},
{
"epoch": 2.0,
"eval_loss": 2.1338422298431396,
"eval_model_preparation_time": 0.0224,
"eval_runtime": 2.7565,
"eval_samples_per_second": 35.552,
"eval_steps_per_second": 17.776,
"step": 112
},
{
"epoch": 2.018140589569161,
"grad_norm": 0.6875,
"learning_rate": 6.87116564417178e-06,
"loss": 2.0871,
"step": 113
},
{
"epoch": 2.036281179138322,
"grad_norm": 0.78125,
"learning_rate": 6.748466257668712e-06,
"loss": 2.0733,
"step": 114
},
{
"epoch": 2.054421768707483,
"grad_norm": 0.7421875,
"learning_rate": 6.625766871165644e-06,
"loss": 2.0882,
"step": 115
},
{
"epoch": 2.072562358276644,
"grad_norm": 0.70703125,
"learning_rate": 6.503067484662578e-06,
"loss": 2.028,
"step": 116
},
{
"epoch": 2.090702947845805,
"grad_norm": 0.8125,
"learning_rate": 6.38036809815951e-06,
"loss": 2.0844,
"step": 117
},
{
"epoch": 2.108843537414966,
"grad_norm": 0.7421875,
"learning_rate": 6.257668711656443e-06,
"loss": 2.1208,
"step": 118
},
{
"epoch": 2.126984126984127,
"grad_norm": 0.70703125,
"learning_rate": 6.134969325153375e-06,
"loss": 2.1136,
"step": 119
},
{
"epoch": 2.145124716553288,
"grad_norm": 0.734375,
"learning_rate": 6.012269938650307e-06,
"loss": 2.091,
"step": 120
},
{
"epoch": 2.163265306122449,
"grad_norm": 0.68359375,
"learning_rate": 5.88957055214724e-06,
"loss": 2.0499,
"step": 121
},
{
"epoch": 2.18140589569161,
"grad_norm": 0.734375,
"learning_rate": 5.766871165644172e-06,
"loss": 2.0557,
"step": 122
},
{
"epoch": 2.199546485260771,
"grad_norm": 0.640625,
"learning_rate": 5.644171779141104e-06,
"loss": 2.0273,
"step": 123
},
{
"epoch": 2.2176870748299318,
"grad_norm": 0.76953125,
"learning_rate": 5.521472392638038e-06,
"loss": 2.0663,
"step": 124
},
{
"epoch": 2.235827664399093,
"grad_norm": 0.67578125,
"learning_rate": 5.39877300613497e-06,
"loss": 2.0865,
"step": 125
},
{
"epoch": 2.253968253968254,
"grad_norm": 0.70703125,
"learning_rate": 5.276073619631902e-06,
"loss": 2.0568,
"step": 126
},
{
"epoch": 2.272108843537415,
"grad_norm": 0.70703125,
"learning_rate": 5.153374233128835e-06,
"loss": 2.0819,
"step": 127
},
{
"epoch": 2.290249433106576,
"grad_norm": 0.7734375,
"learning_rate": 5.030674846625767e-06,
"loss": 2.0872,
"step": 128
},
{
"epoch": 2.308390022675737,
"grad_norm": 0.7734375,
"learning_rate": 4.9079754601227e-06,
"loss": 2.1564,
"step": 129
},
{
"epoch": 2.326530612244898,
"grad_norm": 0.73828125,
"learning_rate": 4.785276073619632e-06,
"loss": 2.0401,
"step": 130
},
{
"epoch": 2.3446712018140587,
"grad_norm": 0.68359375,
"learning_rate": 4.662576687116564e-06,
"loss": 2.0781,
"step": 131
},
{
"epoch": 2.36281179138322,
"grad_norm": 0.73828125,
"learning_rate": 4.539877300613497e-06,
"loss": 2.0845,
"step": 132
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.7578125,
"learning_rate": 4.41717791411043e-06,
"loss": 2.0638,
"step": 133
},
{
"epoch": 2.399092970521542,
"grad_norm": 0.77734375,
"learning_rate": 4.294478527607362e-06,
"loss": 2.0817,
"step": 134
},
{
"epoch": 2.417233560090703,
"grad_norm": 0.71875,
"learning_rate": 4.171779141104294e-06,
"loss": 2.056,
"step": 135
},
{
"epoch": 2.435374149659864,
"grad_norm": 0.703125,
"learning_rate": 4.049079754601227e-06,
"loss": 2.109,
"step": 136
},
{
"epoch": 2.453514739229025,
"grad_norm": 0.6484375,
"learning_rate": 3.92638036809816e-06,
"loss": 2.0948,
"step": 137
},
{
"epoch": 2.471655328798186,
"grad_norm": 0.77734375,
"learning_rate": 3.8036809815950928e-06,
"loss": 2.1051,
"step": 138
},
{
"epoch": 2.489795918367347,
"grad_norm": 0.7421875,
"learning_rate": 3.680981595092025e-06,
"loss": 2.0321,
"step": 139
},
{
"epoch": 2.507936507936508,
"grad_norm": 0.81640625,
"learning_rate": 3.5582822085889574e-06,
"loss": 2.1067,
"step": 140
},
{
"epoch": 2.526077097505669,
"grad_norm": 0.6796875,
"learning_rate": 3.43558282208589e-06,
"loss": 2.1105,
"step": 141
},
{
"epoch": 2.54421768707483,
"grad_norm": 0.73046875,
"learning_rate": 3.312883435582822e-06,
"loss": 2.1413,
"step": 142
},
{
"epoch": 2.562358276643991,
"grad_norm": 0.7265625,
"learning_rate": 3.190184049079755e-06,
"loss": 2.0716,
"step": 143
},
{
"epoch": 2.580498866213152,
"grad_norm": 0.69140625,
"learning_rate": 3.0674846625766875e-06,
"loss": 2.0711,
"step": 144
},
{
"epoch": 2.5986394557823127,
"grad_norm": 0.7578125,
"learning_rate": 2.94478527607362e-06,
"loss": 2.0836,
"step": 145
},
{
"epoch": 2.616780045351474,
"grad_norm": 0.71484375,
"learning_rate": 2.822085889570552e-06,
"loss": 2.0586,
"step": 146
},
{
"epoch": 2.634920634920635,
"grad_norm": 0.70703125,
"learning_rate": 2.699386503067485e-06,
"loss": 2.0593,
"step": 147
},
{
"epoch": 2.6530612244897958,
"grad_norm": 0.6484375,
"learning_rate": 2.5766871165644175e-06,
"loss": 2.0547,
"step": 148
},
{
"epoch": 2.671201814058957,
"grad_norm": 0.671875,
"learning_rate": 2.45398773006135e-06,
"loss": 2.0451,
"step": 149
},
{
"epoch": 2.689342403628118,
"grad_norm": 0.6640625,
"learning_rate": 2.331288343558282e-06,
"loss": 2.0731,
"step": 150
},
{
"epoch": 2.707482993197279,
"grad_norm": 0.6796875,
"learning_rate": 2.208588957055215e-06,
"loss": 2.0026,
"step": 151
},
{
"epoch": 2.72562358276644,
"grad_norm": 0.765625,
"learning_rate": 2.085889570552147e-06,
"loss": 2.1035,
"step": 152
},
{
"epoch": 2.743764172335601,
"grad_norm": 0.7109375,
"learning_rate": 1.96319018404908e-06,
"loss": 2.0727,
"step": 153
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.69921875,
"learning_rate": 1.8404907975460124e-06,
"loss": 2.0177,
"step": 154
},
{
"epoch": 2.780045351473923,
"grad_norm": 0.71484375,
"learning_rate": 1.717791411042945e-06,
"loss": 2.0351,
"step": 155
},
{
"epoch": 2.798185941043084,
"grad_norm": 0.69921875,
"learning_rate": 1.5950920245398775e-06,
"loss": 2.0597,
"step": 156
},
{
"epoch": 2.816326530612245,
"grad_norm": 0.70703125,
"learning_rate": 1.47239263803681e-06,
"loss": 2.048,
"step": 157
},
{
"epoch": 2.834467120181406,
"grad_norm": 0.6953125,
"learning_rate": 1.3496932515337425e-06,
"loss": 2.0717,
"step": 158
},
{
"epoch": 2.8526077097505667,
"grad_norm": 0.69921875,
"learning_rate": 1.226993865030675e-06,
"loss": 2.1007,
"step": 159
},
{
"epoch": 2.870748299319728,
"grad_norm": 0.71875,
"learning_rate": 1.1042944785276075e-06,
"loss": 2.0228,
"step": 160
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.69140625,
"learning_rate": 9.8159509202454e-07,
"loss": 2.0413,
"step": 161
},
{
"epoch": 2.9070294784580497,
"grad_norm": 0.6875,
"learning_rate": 8.588957055214725e-07,
"loss": 2.0616,
"step": 162
},
{
"epoch": 2.925170068027211,
"grad_norm": 0.71875,
"learning_rate": 7.36196319018405e-07,
"loss": 2.0833,
"step": 163
},
{
"epoch": 2.943310657596372,
"grad_norm": 0.75,
"learning_rate": 6.134969325153375e-07,
"loss": 2.1265,
"step": 164
},
{
"epoch": 2.9614512471655328,
"grad_norm": 0.71875,
"learning_rate": 4.9079754601227e-07,
"loss": 2.0442,
"step": 165
},
{
"epoch": 2.979591836734694,
"grad_norm": 0.6875,
"learning_rate": 3.680981595092025e-07,
"loss": 2.0522,
"step": 166
},
{
"epoch": 2.997732426303855,
"grad_norm": 0.70703125,
"learning_rate": 2.45398773006135e-07,
"loss": 2.0616,
"step": 167
},
{
"epoch": 3.0,
"grad_norm": 2.078125,
"learning_rate": 1.226993865030675e-07,
"loss": 2.0104,
"step": 168
}
],
"logging_steps": 1,
"max_steps": 168,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.058921741456589e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}