{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 168,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018140589569160998,
      "grad_norm": 0.95703125,
      "learning_rate": 0.0,
      "loss": 2.6996,
      "step": 1
    },
    {
      "epoch": 0.036281179138321996,
      "grad_norm": 0.9921875,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.66,
      "step": 2
    },
    {
      "epoch": 0.05442176870748299,
      "grad_norm": 1.0703125,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.6808,
      "step": 3
    },
    {
      "epoch": 0.07256235827664399,
      "grad_norm": 1.0078125,
      "learning_rate": 1.2e-05,
      "loss": 2.6952,
      "step": 4
    },
    {
      "epoch": 0.09070294784580499,
      "grad_norm": 0.94140625,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.6385,
      "step": 5
    },
    {
      "epoch": 0.10884353741496598,
      "grad_norm": 0.9765625,
      "learning_rate": 2e-05,
      "loss": 2.6775,
      "step": 6
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 0.8828125,
      "learning_rate": 1.9877300613496935e-05,
      "loss": 2.6544,
      "step": 7
    },
    {
      "epoch": 0.14512471655328799,
      "grad_norm": 0.87109375,
      "learning_rate": 1.9754601226993868e-05,
      "loss": 2.6104,
      "step": 8
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.84375,
      "learning_rate": 1.96319018404908e-05,
      "loss": 2.5816,
      "step": 9
    },
    {
      "epoch": 0.18140589569160998,
      "grad_norm": 0.921875,
      "learning_rate": 1.9509202453987733e-05,
      "loss": 2.6088,
      "step": 10
    },
    {
      "epoch": 0.19954648526077098,
      "grad_norm": 0.85546875,
      "learning_rate": 1.9386503067484663e-05,
      "loss": 2.5572,
      "step": 11
    },
    {
      "epoch": 0.21768707482993196,
      "grad_norm": 0.8046875,
      "learning_rate": 1.9263803680981596e-05,
      "loss": 2.5599,
      "step": 12
    },
    {
      "epoch": 0.23582766439909297,
      "grad_norm": 0.8125,
      "learning_rate": 1.914110429447853e-05,
      "loss": 2.5125,
      "step": 13
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 0.78515625,
      "learning_rate": 1.9018404907975462e-05,
      "loss": 2.551,
      "step": 14
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 0.73046875,
      "learning_rate": 1.8895705521472395e-05,
      "loss": 2.5148,
      "step": 15
    },
    {
      "epoch": 0.29024943310657597,
      "grad_norm": 0.74609375,
      "learning_rate": 1.8773006134969328e-05,
      "loss": 2.4706,
      "step": 16
    },
    {
      "epoch": 0.30839002267573695,
      "grad_norm": 0.76171875,
      "learning_rate": 1.8650306748466257e-05,
      "loss": 2.435,
      "step": 17
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 0.7109375,
      "learning_rate": 1.852760736196319e-05,
      "loss": 2.4895,
      "step": 18
    },
    {
      "epoch": 0.34467120181405897,
      "grad_norm": 0.6953125,
      "learning_rate": 1.8404907975460123e-05,
      "loss": 2.4438,
      "step": 19
    },
    {
      "epoch": 0.36281179138321995,
      "grad_norm": 0.703125,
      "learning_rate": 1.828220858895706e-05,
      "loss": 2.4875,
      "step": 20
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.67578125,
      "learning_rate": 1.815950920245399e-05,
      "loss": 2.4617,
      "step": 21
    },
    {
      "epoch": 0.39909297052154197,
      "grad_norm": 0.734375,
      "learning_rate": 1.8036809815950922e-05,
      "loss": 2.3989,
      "step": 22
    },
    {
      "epoch": 0.41723356009070295,
      "grad_norm": 0.703125,
      "learning_rate": 1.7914110429447855e-05,
      "loss": 2.3609,
      "step": 23
    },
    {
      "epoch": 0.43537414965986393,
      "grad_norm": 0.671875,
      "learning_rate": 1.7791411042944788e-05,
      "loss": 2.3001,
      "step": 24
    },
    {
      "epoch": 0.45351473922902497,
      "grad_norm": 0.66796875,
      "learning_rate": 1.766871165644172e-05,
      "loss": 2.3645,
      "step": 25
    },
    {
      "epoch": 0.47165532879818595,
      "grad_norm": 0.671875,
      "learning_rate": 1.7546012269938654e-05,
      "loss": 2.3013,
      "step": 26
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 0.69921875,
      "learning_rate": 1.7423312883435583e-05,
      "loss": 2.4127,
      "step": 27
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 0.6640625,
      "learning_rate": 1.7300613496932516e-05,
      "loss": 2.3384,
      "step": 28
    },
    {
      "epoch": 0.5260770975056689,
      "grad_norm": 0.66015625,
      "learning_rate": 1.717791411042945e-05,
      "loss": 2.2717,
      "step": 29
    },
    {
      "epoch": 0.54421768707483,
      "grad_norm": 0.6953125,
      "learning_rate": 1.7055214723926382e-05,
      "loss": 2.3407,
      "step": 30
    },
    {
      "epoch": 0.562358276643991,
      "grad_norm": 0.640625,
      "learning_rate": 1.6932515337423315e-05,
      "loss": 2.2607,
      "step": 31
    },
    {
      "epoch": 0.5804988662131519,
      "grad_norm": 0.671875,
      "learning_rate": 1.6809815950920248e-05,
      "loss": 2.2913,
      "step": 32
    },
    {
      "epoch": 0.5986394557823129,
      "grad_norm": 0.69921875,
      "learning_rate": 1.6687116564417178e-05,
      "loss": 2.3287,
      "step": 33
    },
    {
      "epoch": 0.6167800453514739,
      "grad_norm": 0.63671875,
      "learning_rate": 1.656441717791411e-05,
      "loss": 2.2778,
      "step": 34
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.6484375,
      "learning_rate": 1.6441717791411043e-05,
      "loss": 2.2741,
      "step": 35
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 0.63671875,
      "learning_rate": 1.6319018404907976e-05,
      "loss": 2.2311,
      "step": 36
    },
    {
      "epoch": 0.671201814058957,
      "grad_norm": 0.6796875,
      "learning_rate": 1.619631901840491e-05,
      "loss": 2.2988,
      "step": 37
    },
    {
      "epoch": 0.6893424036281179,
      "grad_norm": 0.6328125,
      "learning_rate": 1.6073619631901842e-05,
      "loss": 2.2643,
      "step": 38
    },
    {
      "epoch": 0.7074829931972789,
      "grad_norm": 0.6796875,
      "learning_rate": 1.5950920245398772e-05,
      "loss": 2.2546,
      "step": 39
    },
    {
      "epoch": 0.7256235827664399,
      "grad_norm": 0.671875,
      "learning_rate": 1.5828220858895708e-05,
      "loss": 2.2735,
      "step": 40
    },
    {
      "epoch": 0.7437641723356009,
      "grad_norm": 0.578125,
      "learning_rate": 1.570552147239264e-05,
      "loss": 2.2546,
      "step": 41
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.6484375,
      "learning_rate": 1.5582822085889574e-05,
      "loss": 2.2388,
      "step": 42
    },
    {
      "epoch": 0.780045351473923,
      "grad_norm": 0.68359375,
      "learning_rate": 1.5460122699386504e-05,
      "loss": 2.2796,
      "step": 43
    },
    {
      "epoch": 0.7981859410430839,
      "grad_norm": 0.62109375,
      "learning_rate": 1.5337423312883436e-05,
      "loss": 2.2356,
      "step": 44
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.6171875,
      "learning_rate": 1.5214723926380371e-05,
      "loss": 2.2562,
      "step": 45
    },
    {
      "epoch": 0.8344671201814059,
      "grad_norm": 0.66015625,
      "learning_rate": 1.50920245398773e-05,
      "loss": 2.2189,
      "step": 46
    },
    {
      "epoch": 0.8526077097505669,
      "grad_norm": 0.64453125,
      "learning_rate": 1.4969325153374235e-05,
      "loss": 2.2293,
      "step": 47
    },
    {
      "epoch": 0.8707482993197279,
      "grad_norm": 0.6328125,
      "learning_rate": 1.4846625766871168e-05,
      "loss": 2.1792,
      "step": 48
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.6484375,
      "learning_rate": 1.47239263803681e-05,
      "loss": 2.2333,
      "step": 49
    },
    {
      "epoch": 0.9070294784580499,
      "grad_norm": 0.58984375,
      "learning_rate": 1.4601226993865032e-05,
      "loss": 2.2005,
      "step": 50
    },
    {
      "epoch": 0.9251700680272109,
      "grad_norm": 0.70703125,
      "learning_rate": 1.4478527607361965e-05,
      "loss": 2.2617,
      "step": 51
    },
    {
      "epoch": 0.9433106575963719,
      "grad_norm": 0.64453125,
      "learning_rate": 1.4355828220858897e-05,
      "loss": 2.2125,
      "step": 52
    },
    {
      "epoch": 0.9614512471655329,
      "grad_norm": 0.734375,
      "learning_rate": 1.423312883435583e-05,
      "loss": 2.237,
      "step": 53
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 0.703125,
      "learning_rate": 1.4110429447852763e-05,
      "loss": 2.1756,
      "step": 54
    },
    {
      "epoch": 0.9977324263038548,
      "grad_norm": 0.62890625,
      "learning_rate": 1.3987730061349694e-05,
      "loss": 2.2037,
      "step": 55
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.9765625,
      "learning_rate": 1.3865030674846627e-05,
      "loss": 2.1214,
      "step": 56
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.2088165283203125,
      "eval_model_preparation_time": 0.0224,
      "eval_runtime": 2.7857,
      "eval_samples_per_second": 35.179,
      "eval_steps_per_second": 17.59,
      "step": 56
    },
    {
      "epoch": 1.018140589569161,
      "grad_norm": 0.58984375,
      "learning_rate": 1.374233128834356e-05,
      "loss": 2.1552,
      "step": 57
    },
    {
      "epoch": 1.036281179138322,
      "grad_norm": 0.62890625,
      "learning_rate": 1.3619631901840491e-05,
      "loss": 2.1247,
      "step": 58
    },
    {
      "epoch": 1.054421768707483,
      "grad_norm": 0.6171875,
      "learning_rate": 1.3496932515337424e-05,
      "loss": 2.2268,
      "step": 59
    },
    {
      "epoch": 1.072562358276644,
      "grad_norm": 0.65234375,
      "learning_rate": 1.3374233128834357e-05,
      "loss": 2.1801,
      "step": 60
    },
    {
      "epoch": 1.090702947845805,
      "grad_norm": 0.65625,
      "learning_rate": 1.3251533742331288e-05,
      "loss": 2.1991,
      "step": 61
    },
    {
      "epoch": 1.1088435374149659,
      "grad_norm": 0.70703125,
      "learning_rate": 1.3128834355828221e-05,
      "loss": 2.1206,
      "step": 62
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 0.73828125,
      "learning_rate": 1.3006134969325156e-05,
      "loss": 2.1545,
      "step": 63
    },
    {
      "epoch": 1.145124716553288,
      "grad_norm": 0.65234375,
      "learning_rate": 1.2883435582822085e-05,
      "loss": 2.1574,
      "step": 64
    },
    {
      "epoch": 1.163265306122449,
      "grad_norm": 0.62890625,
      "learning_rate": 1.276073619631902e-05,
      "loss": 2.1384,
      "step": 65
    },
    {
      "epoch": 1.18140589569161,
      "grad_norm": 0.66015625,
      "learning_rate": 1.2638036809815953e-05,
      "loss": 2.1563,
      "step": 66
    },
    {
      "epoch": 1.199546485260771,
      "grad_norm": 0.69921875,
      "learning_rate": 1.2515337423312886e-05,
      "loss": 2.1593,
      "step": 67
    },
    {
      "epoch": 1.217687074829932,
      "grad_norm": 0.76953125,
      "learning_rate": 1.2392638036809817e-05,
      "loss": 2.1628,
      "step": 68
    },
    {
      "epoch": 1.235827664399093,
      "grad_norm": 0.66796875,
      "learning_rate": 1.226993865030675e-05,
      "loss": 2.1392,
      "step": 69
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 0.7578125,
      "learning_rate": 1.2147239263803683e-05,
      "loss": 2.2247,
      "step": 70
    },
    {
      "epoch": 1.272108843537415,
      "grad_norm": 0.71484375,
      "learning_rate": 1.2024539877300614e-05,
      "loss": 2.1673,
      "step": 71
    },
    {
      "epoch": 1.290249433106576,
      "grad_norm": 0.61328125,
      "learning_rate": 1.1901840490797547e-05,
      "loss": 2.1676,
      "step": 72
    },
    {
      "epoch": 1.308390022675737,
      "grad_norm": 0.69140625,
      "learning_rate": 1.177914110429448e-05,
      "loss": 2.0911,
      "step": 73
    },
    {
      "epoch": 1.3265306122448979,
      "grad_norm": 0.69140625,
      "learning_rate": 1.1656441717791411e-05,
      "loss": 2.1493,
      "step": 74
    },
    {
      "epoch": 1.344671201814059,
      "grad_norm": 0.6875,
      "learning_rate": 1.1533742331288344e-05,
      "loss": 2.1459,
      "step": 75
    },
    {
      "epoch": 1.36281179138322,
      "grad_norm": 0.7421875,
      "learning_rate": 1.1411042944785277e-05,
      "loss": 2.0973,
      "step": 76
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.69921875,
      "learning_rate": 1.1288343558282208e-05,
      "loss": 2.0893,
      "step": 77
    },
    {
      "epoch": 1.399092970521542,
      "grad_norm": 0.73046875,
      "learning_rate": 1.1165644171779141e-05,
      "loss": 2.1779,
      "step": 78
    },
    {
      "epoch": 1.417233560090703,
      "grad_norm": 0.6796875,
      "learning_rate": 1.1042944785276076e-05,
      "loss": 2.0661,
      "step": 79
    },
    {
      "epoch": 1.435374149659864,
      "grad_norm": 0.6796875,
      "learning_rate": 1.0920245398773005e-05,
      "loss": 2.1201,
      "step": 80
    },
    {
      "epoch": 1.4535147392290249,
      "grad_norm": 0.65234375,
      "learning_rate": 1.079754601226994e-05,
      "loss": 2.0765,
      "step": 81
    },
    {
      "epoch": 1.471655328798186,
      "grad_norm": 0.66015625,
      "learning_rate": 1.0674846625766873e-05,
      "loss": 2.091,
      "step": 82
    },
    {
      "epoch": 1.489795918367347,
      "grad_norm": 0.6796875,
      "learning_rate": 1.0552147239263804e-05,
      "loss": 2.1094,
      "step": 83
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 0.7109375,
      "learning_rate": 1.0429447852760737e-05,
      "loss": 2.2231,
      "step": 84
    },
    {
      "epoch": 1.5260770975056688,
      "grad_norm": 0.66015625,
      "learning_rate": 1.030674846625767e-05,
      "loss": 2.1197,
      "step": 85
    },
    {
      "epoch": 1.54421768707483,
      "grad_norm": 0.7421875,
      "learning_rate": 1.0184049079754601e-05,
      "loss": 2.1248,
      "step": 86
    },
    {
      "epoch": 1.562358276643991,
      "grad_norm": 0.671875,
      "learning_rate": 1.0061349693251534e-05,
      "loss": 2.157,
      "step": 87
    },
    {
      "epoch": 1.5804988662131518,
      "grad_norm": 0.67578125,
      "learning_rate": 9.938650306748467e-06,
      "loss": 2.1562,
      "step": 88
    },
    {
      "epoch": 1.598639455782313,
      "grad_norm": 0.6796875,
      "learning_rate": 9.8159509202454e-06,
      "loss": 2.095,
      "step": 89
    },
    {
      "epoch": 1.616780045351474,
      "grad_norm": 0.72265625,
      "learning_rate": 9.693251533742331e-06,
      "loss": 2.1363,
      "step": 90
    },
    {
      "epoch": 1.6349206349206349,
      "grad_norm": 0.6328125,
      "learning_rate": 9.570552147239264e-06,
      "loss": 2.0856,
      "step": 91
    },
    {
      "epoch": 1.6530612244897958,
      "grad_norm": 0.6796875,
      "learning_rate": 9.447852760736197e-06,
      "loss": 2.1142,
      "step": 92
    },
    {
      "epoch": 1.671201814058957,
      "grad_norm": 0.72265625,
      "learning_rate": 9.325153374233129e-06,
      "loss": 2.1567,
      "step": 93
    },
    {
      "epoch": 1.689342403628118,
      "grad_norm": 0.7890625,
      "learning_rate": 9.202453987730062e-06,
      "loss": 2.1214,
      "step": 94
    },
    {
      "epoch": 1.7074829931972788,
      "grad_norm": 0.703125,
      "learning_rate": 9.079754601226994e-06,
      "loss": 2.1152,
      "step": 95
    },
    {
      "epoch": 1.72562358276644,
      "grad_norm": 0.69921875,
      "learning_rate": 8.957055214723927e-06,
      "loss": 2.0999,
      "step": 96
    },
    {
      "epoch": 1.743764172335601,
      "grad_norm": 0.6484375,
      "learning_rate": 8.83435582822086e-06,
      "loss": 2.1361,
      "step": 97
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 0.75390625,
      "learning_rate": 8.711656441717792e-06,
      "loss": 2.1357,
      "step": 98
    },
    {
      "epoch": 1.780045351473923,
      "grad_norm": 0.7421875,
      "learning_rate": 8.588957055214725e-06,
      "loss": 2.1405,
      "step": 99
    },
    {
      "epoch": 1.798185941043084,
      "grad_norm": 0.73046875,
      "learning_rate": 8.466257668711658e-06,
      "loss": 2.0975,
      "step": 100
    },
    {
      "epoch": 1.816326530612245,
      "grad_norm": 0.68359375,
      "learning_rate": 8.343558282208589e-06,
      "loss": 2.1457,
      "step": 101
    },
    {
      "epoch": 1.8344671201814058,
      "grad_norm": 0.703125,
      "learning_rate": 8.220858895705522e-06,
      "loss": 2.068,
      "step": 102
    },
    {
      "epoch": 1.8526077097505669,
      "grad_norm": 0.703125,
      "learning_rate": 8.098159509202455e-06,
      "loss": 2.1473,
      "step": 103
    },
    {
      "epoch": 1.870748299319728,
      "grad_norm": 0.66015625,
      "learning_rate": 7.975460122699386e-06,
      "loss": 2.0983,
      "step": 104
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.7265625,
      "learning_rate": 7.85276073619632e-06,
      "loss": 2.0952,
      "step": 105
    },
    {
      "epoch": 1.90702947845805,
      "grad_norm": 0.7578125,
      "learning_rate": 7.730061349693252e-06,
      "loss": 2.086,
      "step": 106
    },
    {
      "epoch": 1.925170068027211,
      "grad_norm": 0.69140625,
      "learning_rate": 7.6073619631901856e-06,
      "loss": 2.1086,
      "step": 107
    },
    {
      "epoch": 1.943310657596372,
      "grad_norm": 0.671875,
      "learning_rate": 7.484662576687118e-06,
      "loss": 2.092,
      "step": 108
    },
    {
      "epoch": 1.9614512471655328,
      "grad_norm": 0.77734375,
      "learning_rate": 7.36196319018405e-06,
      "loss": 2.1334,
      "step": 109
    },
    {
      "epoch": 1.9795918367346939,
      "grad_norm": 0.77734375,
      "learning_rate": 7.239263803680983e-06,
      "loss": 2.0644,
      "step": 110
    },
    {
      "epoch": 1.997732426303855,
      "grad_norm": 0.703125,
      "learning_rate": 7.116564417177915e-06,
      "loss": 2.0589,
      "step": 111
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.6953125,
      "learning_rate": 6.993865030674847e-06,
      "loss": 2.0806,
      "step": 112
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.1338422298431396,
      "eval_model_preparation_time": 0.0224,
      "eval_runtime": 2.7565,
      "eval_samples_per_second": 35.552,
      "eval_steps_per_second": 17.776,
      "step": 112
    },
    {
      "epoch": 2.018140589569161,
      "grad_norm": 0.6875,
      "learning_rate": 6.87116564417178e-06,
      "loss": 2.0871,
      "step": 113
    },
    {
      "epoch": 2.036281179138322,
      "grad_norm": 0.78125,
      "learning_rate": 6.748466257668712e-06,
      "loss": 2.0733,
      "step": 114
    },
    {
      "epoch": 2.054421768707483,
      "grad_norm": 0.7421875,
      "learning_rate": 6.625766871165644e-06,
      "loss": 2.0882,
      "step": 115
    },
    {
      "epoch": 2.072562358276644,
      "grad_norm": 0.70703125,
      "learning_rate": 6.503067484662578e-06,
      "loss": 2.028,
      "step": 116
    },
    {
      "epoch": 2.090702947845805,
      "grad_norm": 0.8125,
      "learning_rate": 6.38036809815951e-06,
      "loss": 2.0844,
      "step": 117
    },
    {
      "epoch": 2.108843537414966,
      "grad_norm": 0.7421875,
      "learning_rate": 6.257668711656443e-06,
      "loss": 2.1208,
      "step": 118
    },
    {
      "epoch": 2.126984126984127,
      "grad_norm": 0.70703125,
      "learning_rate": 6.134969325153375e-06,
      "loss": 2.1136,
      "step": 119
    },
    {
      "epoch": 2.145124716553288,
      "grad_norm": 0.734375,
      "learning_rate": 6.012269938650307e-06,
      "loss": 2.091,
      "step": 120
    },
    {
      "epoch": 2.163265306122449,
      "grad_norm": 0.68359375,
      "learning_rate": 5.88957055214724e-06,
      "loss": 2.0499,
      "step": 121
    },
    {
      "epoch": 2.18140589569161,
      "grad_norm": 0.734375,
      "learning_rate": 5.766871165644172e-06,
      "loss": 2.0557,
      "step": 122
    },
    {
      "epoch": 2.199546485260771,
      "grad_norm": 0.640625,
      "learning_rate": 5.644171779141104e-06,
      "loss": 2.0273,
      "step": 123
    },
    {
      "epoch": 2.2176870748299318,
      "grad_norm": 0.76953125,
      "learning_rate": 5.521472392638038e-06,
      "loss": 2.0663,
      "step": 124
    },
    {
      "epoch": 2.235827664399093,
      "grad_norm": 0.67578125,
      "learning_rate": 5.39877300613497e-06,
      "loss": 2.0865,
      "step": 125
    },
    {
      "epoch": 2.253968253968254,
      "grad_norm": 0.70703125,
      "learning_rate": 5.276073619631902e-06,
      "loss": 2.0568,
      "step": 126
    },
    {
      "epoch": 2.272108843537415,
      "grad_norm": 0.70703125,
      "learning_rate": 5.153374233128835e-06,
      "loss": 2.0819,
      "step": 127
    },
    {
      "epoch": 2.290249433106576,
      "grad_norm": 0.7734375,
      "learning_rate": 5.030674846625767e-06,
      "loss": 2.0872,
      "step": 128
    },
    {
      "epoch": 2.308390022675737,
      "grad_norm": 0.7734375,
      "learning_rate": 4.9079754601227e-06,
      "loss": 2.1564,
      "step": 129
    },
    {
      "epoch": 2.326530612244898,
      "grad_norm": 0.73828125,
      "learning_rate": 4.785276073619632e-06,
      "loss": 2.0401,
      "step": 130
    },
    {
      "epoch": 2.3446712018140587,
      "grad_norm": 0.68359375,
      "learning_rate": 4.662576687116564e-06,
      "loss": 2.0781,
      "step": 131
    },
    {
      "epoch": 2.36281179138322,
      "grad_norm": 0.73828125,
      "learning_rate": 4.539877300613497e-06,
      "loss": 2.0845,
      "step": 132
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.7578125,
      "learning_rate": 4.41717791411043e-06,
      "loss": 2.0638,
      "step": 133
    },
    {
      "epoch": 2.399092970521542,
      "grad_norm": 0.77734375,
      "learning_rate": 4.294478527607362e-06,
      "loss": 2.0817,
      "step": 134
    },
    {
      "epoch": 2.417233560090703,
      "grad_norm": 0.71875,
      "learning_rate": 4.171779141104294e-06,
      "loss": 2.056,
      "step": 135
    },
    {
      "epoch": 2.435374149659864,
      "grad_norm": 0.703125,
      "learning_rate": 4.049079754601227e-06,
      "loss": 2.109,
      "step": 136
    },
    {
      "epoch": 2.453514739229025,
      "grad_norm": 0.6484375,
      "learning_rate": 3.92638036809816e-06,
      "loss": 2.0948,
      "step": 137
    },
    {
      "epoch": 2.471655328798186,
      "grad_norm": 0.77734375,
      "learning_rate": 3.8036809815950928e-06,
      "loss": 2.1051,
      "step": 138
    },
    {
      "epoch": 2.489795918367347,
      "grad_norm": 0.7421875,
      "learning_rate": 3.680981595092025e-06,
      "loss": 2.0321,
      "step": 139
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 0.81640625,
      "learning_rate": 3.5582822085889574e-06,
      "loss": 2.1067,
      "step": 140
    },
    {
      "epoch": 2.526077097505669,
      "grad_norm": 0.6796875,
      "learning_rate": 3.43558282208589e-06,
      "loss": 2.1105,
      "step": 141
    },
    {
      "epoch": 2.54421768707483,
      "grad_norm": 0.73046875,
      "learning_rate": 3.312883435582822e-06,
      "loss": 2.1413,
      "step": 142
    },
    {
      "epoch": 2.562358276643991,
      "grad_norm": 0.7265625,
      "learning_rate": 3.190184049079755e-06,
      "loss": 2.0716,
      "step": 143
    },
    {
      "epoch": 2.580498866213152,
      "grad_norm": 0.69140625,
      "learning_rate": 3.0674846625766875e-06,
      "loss": 2.0711,
      "step": 144
    },
    {
      "epoch": 2.5986394557823127,
      "grad_norm": 0.7578125,
      "learning_rate": 2.94478527607362e-06,
      "loss": 2.0836,
      "step": 145
    },
    {
      "epoch": 2.616780045351474,
      "grad_norm": 0.71484375,
      "learning_rate": 2.822085889570552e-06,
      "loss": 2.0586,
      "step": 146
    },
    {
      "epoch": 2.634920634920635,
      "grad_norm": 0.70703125,
      "learning_rate": 2.699386503067485e-06,
      "loss": 2.0593,
      "step": 147
    },
    {
      "epoch": 2.6530612244897958,
      "grad_norm": 0.6484375,
      "learning_rate": 2.5766871165644175e-06,
      "loss": 2.0547,
      "step": 148
    },
    {
      "epoch": 2.671201814058957,
      "grad_norm": 0.671875,
      "learning_rate": 2.45398773006135e-06,
      "loss": 2.0451,
      "step": 149
    },
    {
      "epoch": 2.689342403628118,
      "grad_norm": 0.6640625,
      "learning_rate": 2.331288343558282e-06,
      "loss": 2.0731,
      "step": 150
    },
    {
      "epoch": 2.707482993197279,
      "grad_norm": 0.6796875,
      "learning_rate": 2.208588957055215e-06,
      "loss": 2.0026,
      "step": 151
    },
    {
      "epoch": 2.72562358276644,
      "grad_norm": 0.765625,
      "learning_rate": 2.085889570552147e-06,
      "loss": 2.1035,
      "step": 152
    },
    {
      "epoch": 2.743764172335601,
      "grad_norm": 0.7109375,
      "learning_rate": 1.96319018404908e-06,
      "loss": 2.0727,
      "step": 153
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.69921875,
      "learning_rate": 1.8404907975460124e-06,
      "loss": 2.0177,
      "step": 154
    },
    {
      "epoch": 2.780045351473923,
      "grad_norm": 0.71484375,
      "learning_rate": 1.717791411042945e-06,
      "loss": 2.0351,
      "step": 155
    },
    {
      "epoch": 2.798185941043084,
      "grad_norm": 0.69921875,
      "learning_rate": 1.5950920245398775e-06,
      "loss": 2.0597,
      "step": 156
    },
    {
      "epoch": 2.816326530612245,
      "grad_norm": 0.70703125,
      "learning_rate": 1.47239263803681e-06,
      "loss": 2.048,
      "step": 157
    },
    {
      "epoch": 2.834467120181406,
      "grad_norm": 0.6953125,
      "learning_rate": 1.3496932515337425e-06,
      "loss": 2.0717,
      "step": 158
    },
    {
      "epoch": 2.8526077097505667,
      "grad_norm": 0.69921875,
      "learning_rate": 1.226993865030675e-06,
      "loss": 2.1007,
      "step": 159
    },
    {
      "epoch": 2.870748299319728,
      "grad_norm": 0.71875,
      "learning_rate": 1.1042944785276075e-06,
      "loss": 2.0228,
      "step": 160
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.69140625,
      "learning_rate": 9.8159509202454e-07,
      "loss": 2.0413,
      "step": 161
    },
    {
      "epoch": 2.9070294784580497,
      "grad_norm": 0.6875,
      "learning_rate": 8.588957055214725e-07,
      "loss": 2.0616,
      "step": 162
    },
    {
      "epoch": 2.925170068027211,
      "grad_norm": 0.71875,
      "learning_rate": 7.36196319018405e-07,
      "loss": 2.0833,
      "step": 163
    },
    {
      "epoch": 2.943310657596372,
      "grad_norm": 0.75,
      "learning_rate": 6.134969325153375e-07,
      "loss": 2.1265,
      "step": 164
    },
    {
      "epoch": 2.9614512471655328,
      "grad_norm": 0.71875,
      "learning_rate": 4.9079754601227e-07,
      "loss": 2.0442,
      "step": 165
    },
    {
      "epoch": 2.979591836734694,
      "grad_norm": 0.6875,
      "learning_rate": 3.680981595092025e-07,
      "loss": 2.0522,
      "step": 166
    },
    {
      "epoch": 2.997732426303855,
      "grad_norm": 0.70703125,
      "learning_rate": 2.45398773006135e-07,
      "loss": 2.0616,
      "step": 167
    },
    {
      "epoch": 3.0,
      "grad_norm": 2.078125,
      "learning_rate": 1.226993865030675e-07,
      "loss": 2.0104,
      "step": 168
    }
  ],
  "logging_steps": 1,
  "max_steps": 168,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.058921741456589e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}