Files
Qwen3-4B-Base-ftjob-235faf2…/checkpoint-333/trainer_state.json

2384 lines
54 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 333,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009070294784580499,
"grad_norm": 0.9609375,
"learning_rate": 0.0,
"loss": 2.7821,
"step": 1
},
{
"epoch": 0.018140589569160998,
"grad_norm": 0.99609375,
"learning_rate": 4.000000000000001e-06,
"loss": 2.7555,
"step": 2
},
{
"epoch": 0.027210884353741496,
"grad_norm": 1.03125,
"learning_rate": 8.000000000000001e-06,
"loss": 2.7964,
"step": 3
},
{
"epoch": 0.036281179138321996,
"grad_norm": 1.0,
"learning_rate": 1.2e-05,
"loss": 2.8019,
"step": 4
},
{
"epoch": 0.045351473922902494,
"grad_norm": 0.94140625,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.6892,
"step": 5
},
{
"epoch": 0.05442176870748299,
"grad_norm": 0.99609375,
"learning_rate": 2e-05,
"loss": 2.7791,
"step": 6
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.9375,
"learning_rate": 1.9939024390243904e-05,
"loss": 2.7525,
"step": 7
},
{
"epoch": 0.07256235827664399,
"grad_norm": 0.93359375,
"learning_rate": 1.9878048780487806e-05,
"loss": 2.6766,
"step": 8
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.8828125,
"learning_rate": 1.9817073170731708e-05,
"loss": 2.74,
"step": 9
},
{
"epoch": 0.09070294784580499,
"grad_norm": 0.90234375,
"learning_rate": 1.975609756097561e-05,
"loss": 2.677,
"step": 10
},
{
"epoch": 0.09977324263038549,
"grad_norm": 0.83203125,
"learning_rate": 1.9695121951219512e-05,
"loss": 2.6407,
"step": 11
},
{
"epoch": 0.10884353741496598,
"grad_norm": 0.8671875,
"learning_rate": 1.9634146341463414e-05,
"loss": 2.6783,
"step": 12
},
{
"epoch": 0.11791383219954649,
"grad_norm": 0.85546875,
"learning_rate": 1.957317073170732e-05,
"loss": 2.6221,
"step": 13
},
{
"epoch": 0.12698412698412698,
"grad_norm": 0.80078125,
"learning_rate": 1.9512195121951222e-05,
"loss": 2.6387,
"step": 14
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.7734375,
"learning_rate": 1.9451219512195124e-05,
"loss": 2.6071,
"step": 15
},
{
"epoch": 0.14512471655328799,
"grad_norm": 0.7109375,
"learning_rate": 1.9390243902439026e-05,
"loss": 2.5977,
"step": 16
},
{
"epoch": 0.15419501133786848,
"grad_norm": 0.71484375,
"learning_rate": 1.9329268292682928e-05,
"loss": 2.5523,
"step": 17
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.73828125,
"learning_rate": 1.926829268292683e-05,
"loss": 2.5804,
"step": 18
},
{
"epoch": 0.17233560090702948,
"grad_norm": 0.703125,
"learning_rate": 1.9207317073170733e-05,
"loss": 2.4848,
"step": 19
},
{
"epoch": 0.18140589569160998,
"grad_norm": 0.7421875,
"learning_rate": 1.9146341463414635e-05,
"loss": 2.5069,
"step": 20
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.71484375,
"learning_rate": 1.9085365853658537e-05,
"loss": 2.4904,
"step": 21
},
{
"epoch": 0.19954648526077098,
"grad_norm": 0.734375,
"learning_rate": 1.902439024390244e-05,
"loss": 2.4733,
"step": 22
},
{
"epoch": 0.20861678004535147,
"grad_norm": 0.6875,
"learning_rate": 1.896341463414634e-05,
"loss": 2.4548,
"step": 23
},
{
"epoch": 0.21768707482993196,
"grad_norm": 0.71484375,
"learning_rate": 1.8902439024390243e-05,
"loss": 2.414,
"step": 24
},
{
"epoch": 0.22675736961451248,
"grad_norm": 0.734375,
"learning_rate": 1.8841463414634145e-05,
"loss": 2.4784,
"step": 25
},
{
"epoch": 0.23582766439909297,
"grad_norm": 0.6875,
"learning_rate": 1.878048780487805e-05,
"loss": 2.4775,
"step": 26
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.6875,
"learning_rate": 1.8719512195121953e-05,
"loss": 2.4024,
"step": 27
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.75,
"learning_rate": 1.8658536585365855e-05,
"loss": 2.4668,
"step": 28
},
{
"epoch": 0.26303854875283444,
"grad_norm": 0.7265625,
"learning_rate": 1.8597560975609757e-05,
"loss": 2.4542,
"step": 29
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.7890625,
"learning_rate": 1.8536585365853663e-05,
"loss": 2.4045,
"step": 30
},
{
"epoch": 0.2811791383219955,
"grad_norm": 0.6796875,
"learning_rate": 1.8475609756097565e-05,
"loss": 2.413,
"step": 31
},
{
"epoch": 0.29024943310657597,
"grad_norm": 0.6953125,
"learning_rate": 1.8414634146341467e-05,
"loss": 2.4288,
"step": 32
},
{
"epoch": 0.29931972789115646,
"grad_norm": 0.73046875,
"learning_rate": 1.835365853658537e-05,
"loss": 2.4127,
"step": 33
},
{
"epoch": 0.30839002267573695,
"grad_norm": 0.73046875,
"learning_rate": 1.829268292682927e-05,
"loss": 2.4347,
"step": 34
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.73828125,
"learning_rate": 1.8231707317073173e-05,
"loss": 2.3892,
"step": 35
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.671875,
"learning_rate": 1.8170731707317075e-05,
"loss": 2.3851,
"step": 36
},
{
"epoch": 0.3356009070294785,
"grad_norm": 0.70703125,
"learning_rate": 1.8109756097560977e-05,
"loss": 2.3523,
"step": 37
},
{
"epoch": 0.34467120181405897,
"grad_norm": 0.65234375,
"learning_rate": 1.804878048780488e-05,
"loss": 2.3436,
"step": 38
},
{
"epoch": 0.35374149659863946,
"grad_norm": 0.671875,
"learning_rate": 1.798780487804878e-05,
"loss": 2.3701,
"step": 39
},
{
"epoch": 0.36281179138321995,
"grad_norm": 0.75,
"learning_rate": 1.7926829268292684e-05,
"loss": 2.3614,
"step": 40
},
{
"epoch": 0.37188208616780044,
"grad_norm": 0.71875,
"learning_rate": 1.7865853658536586e-05,
"loss": 2.3828,
"step": 41
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.8359375,
"learning_rate": 1.7804878048780488e-05,
"loss": 2.3818,
"step": 42
},
{
"epoch": 0.3900226757369615,
"grad_norm": 0.7109375,
"learning_rate": 1.7743902439024393e-05,
"loss": 2.2896,
"step": 43
},
{
"epoch": 0.39909297052154197,
"grad_norm": 0.66015625,
"learning_rate": 1.7682926829268296e-05,
"loss": 2.2879,
"step": 44
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.6953125,
"learning_rate": 1.7621951219512198e-05,
"loss": 2.341,
"step": 45
},
{
"epoch": 0.41723356009070295,
"grad_norm": 0.75,
"learning_rate": 1.75609756097561e-05,
"loss": 2.2897,
"step": 46
},
{
"epoch": 0.42630385487528344,
"grad_norm": 0.71875,
"learning_rate": 1.7500000000000002e-05,
"loss": 2.3409,
"step": 47
},
{
"epoch": 0.43537414965986393,
"grad_norm": 0.66015625,
"learning_rate": 1.7439024390243904e-05,
"loss": 2.2946,
"step": 48
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.65625,
"learning_rate": 1.7378048780487806e-05,
"loss": 2.2962,
"step": 49
},
{
"epoch": 0.45351473922902497,
"grad_norm": 0.73828125,
"learning_rate": 1.7317073170731708e-05,
"loss": 2.3276,
"step": 50
},
{
"epoch": 0.46258503401360546,
"grad_norm": 0.7421875,
"learning_rate": 1.725609756097561e-05,
"loss": 2.331,
"step": 51
},
{
"epoch": 0.47165532879818595,
"grad_norm": 0.7421875,
"learning_rate": 1.7195121951219512e-05,
"loss": 2.3291,
"step": 52
},
{
"epoch": 0.48072562358276644,
"grad_norm": 0.70703125,
"learning_rate": 1.7134146341463415e-05,
"loss": 2.3285,
"step": 53
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.6484375,
"learning_rate": 1.7073170731707317e-05,
"loss": 2.2596,
"step": 54
},
{
"epoch": 0.4988662131519274,
"grad_norm": 0.703125,
"learning_rate": 1.7012195121951222e-05,
"loss": 2.2769,
"step": 55
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.71875,
"learning_rate": 1.6951219512195124e-05,
"loss": 2.2414,
"step": 56
},
{
"epoch": 0.5170068027210885,
"grad_norm": 0.6796875,
"learning_rate": 1.6890243902439026e-05,
"loss": 2.276,
"step": 57
},
{
"epoch": 0.5260770975056689,
"grad_norm": 0.73828125,
"learning_rate": 1.682926829268293e-05,
"loss": 2.2745,
"step": 58
},
{
"epoch": 0.5351473922902494,
"grad_norm": 0.7265625,
"learning_rate": 1.676829268292683e-05,
"loss": 2.1905,
"step": 59
},
{
"epoch": 0.54421768707483,
"grad_norm": 1.6171875,
"learning_rate": 1.6707317073170733e-05,
"loss": 2.2355,
"step": 60
},
{
"epoch": 0.5532879818594104,
"grad_norm": 0.87109375,
"learning_rate": 1.6646341463414635e-05,
"loss": 2.209,
"step": 61
},
{
"epoch": 0.562358276643991,
"grad_norm": 0.75,
"learning_rate": 1.6585365853658537e-05,
"loss": 2.3244,
"step": 62
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.6875,
"learning_rate": 1.652439024390244e-05,
"loss": 2.2522,
"step": 63
},
{
"epoch": 0.5804988662131519,
"grad_norm": 0.71484375,
"learning_rate": 1.646341463414634e-05,
"loss": 2.2726,
"step": 64
},
{
"epoch": 0.5895691609977324,
"grad_norm": 0.87109375,
"learning_rate": 1.6402439024390243e-05,
"loss": 2.1989,
"step": 65
},
{
"epoch": 0.5986394557823129,
"grad_norm": 0.75390625,
"learning_rate": 1.6341463414634145e-05,
"loss": 2.3039,
"step": 66
},
{
"epoch": 0.6077097505668935,
"grad_norm": 0.83203125,
"learning_rate": 1.6280487804878048e-05,
"loss": 2.2368,
"step": 67
},
{
"epoch": 0.6167800453514739,
"grad_norm": 0.75,
"learning_rate": 1.6219512195121953e-05,
"loss": 2.3243,
"step": 68
},
{
"epoch": 0.6258503401360545,
"grad_norm": 0.8203125,
"learning_rate": 1.6158536585365855e-05,
"loss": 2.2983,
"step": 69
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.70703125,
"learning_rate": 1.6097560975609757e-05,
"loss": 2.2208,
"step": 70
},
{
"epoch": 0.6439909297052154,
"grad_norm": 0.7734375,
"learning_rate": 1.603658536585366e-05,
"loss": 2.2676,
"step": 71
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.796875,
"learning_rate": 1.597560975609756e-05,
"loss": 2.2764,
"step": 72
},
{
"epoch": 0.6621315192743764,
"grad_norm": 0.78515625,
"learning_rate": 1.5914634146341467e-05,
"loss": 2.2431,
"step": 73
},
{
"epoch": 0.671201814058957,
"grad_norm": 0.70703125,
"learning_rate": 1.585365853658537e-05,
"loss": 2.2251,
"step": 74
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.76171875,
"learning_rate": 1.579268292682927e-05,
"loss": 2.265,
"step": 75
},
{
"epoch": 0.6893424036281179,
"grad_norm": 0.80859375,
"learning_rate": 1.5731707317073173e-05,
"loss": 2.2207,
"step": 76
},
{
"epoch": 0.6984126984126984,
"grad_norm": 0.78125,
"learning_rate": 1.5670731707317075e-05,
"loss": 2.2398,
"step": 77
},
{
"epoch": 0.7074829931972789,
"grad_norm": 0.6875,
"learning_rate": 1.5609756097560978e-05,
"loss": 2.2134,
"step": 78
},
{
"epoch": 0.7165532879818595,
"grad_norm": 0.796875,
"learning_rate": 1.554878048780488e-05,
"loss": 2.2297,
"step": 79
},
{
"epoch": 0.7256235827664399,
"grad_norm": 0.83203125,
"learning_rate": 1.5487804878048782e-05,
"loss": 2.1822,
"step": 80
},
{
"epoch": 0.7346938775510204,
"grad_norm": 0.76953125,
"learning_rate": 1.5426829268292684e-05,
"loss": 2.2522,
"step": 81
},
{
"epoch": 0.7437641723356009,
"grad_norm": 0.77734375,
"learning_rate": 1.5365853658536586e-05,
"loss": 2.2363,
"step": 82
},
{
"epoch": 0.7528344671201814,
"grad_norm": 0.6796875,
"learning_rate": 1.5304878048780488e-05,
"loss": 2.2289,
"step": 83
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.7578125,
"learning_rate": 1.5243902439024392e-05,
"loss": 2.2229,
"step": 84
},
{
"epoch": 0.7709750566893424,
"grad_norm": 1.0390625,
"learning_rate": 1.5182926829268294e-05,
"loss": 2.2818,
"step": 85
},
{
"epoch": 0.780045351473923,
"grad_norm": 0.77734375,
"learning_rate": 1.5121951219512196e-05,
"loss": 2.2268,
"step": 86
},
{
"epoch": 0.7891156462585034,
"grad_norm": 0.7734375,
"learning_rate": 1.5060975609756098e-05,
"loss": 2.2327,
"step": 87
},
{
"epoch": 0.7981859410430839,
"grad_norm": 0.75390625,
"learning_rate": 1.5000000000000002e-05,
"loss": 2.131,
"step": 88
},
{
"epoch": 0.8072562358276644,
"grad_norm": 0.71484375,
"learning_rate": 1.4939024390243904e-05,
"loss": 2.1873,
"step": 89
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.69921875,
"learning_rate": 1.4878048780487806e-05,
"loss": 2.2199,
"step": 90
},
{
"epoch": 0.8253968253968254,
"grad_norm": 0.70703125,
"learning_rate": 1.4817073170731708e-05,
"loss": 2.2014,
"step": 91
},
{
"epoch": 0.8344671201814059,
"grad_norm": 0.6953125,
"learning_rate": 1.475609756097561e-05,
"loss": 2.1782,
"step": 92
},
{
"epoch": 0.8435374149659864,
"grad_norm": 0.8125,
"learning_rate": 1.4695121951219513e-05,
"loss": 2.2409,
"step": 93
},
{
"epoch": 0.8526077097505669,
"grad_norm": 0.8828125,
"learning_rate": 1.4634146341463415e-05,
"loss": 2.1933,
"step": 94
},
{
"epoch": 0.8616780045351474,
"grad_norm": 0.7578125,
"learning_rate": 1.4573170731707319e-05,
"loss": 2.1623,
"step": 95
},
{
"epoch": 0.8707482993197279,
"grad_norm": 0.7890625,
"learning_rate": 1.451219512195122e-05,
"loss": 2.239,
"step": 96
},
{
"epoch": 0.8798185941043084,
"grad_norm": 0.8125,
"learning_rate": 1.4451219512195123e-05,
"loss": 2.2343,
"step": 97
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.84765625,
"learning_rate": 1.4390243902439025e-05,
"loss": 2.2198,
"step": 98
},
{
"epoch": 0.8979591836734694,
"grad_norm": 0.78515625,
"learning_rate": 1.4329268292682927e-05,
"loss": 2.2047,
"step": 99
},
{
"epoch": 0.9070294784580499,
"grad_norm": 0.78125,
"learning_rate": 1.4268292682926829e-05,
"loss": 2.2073,
"step": 100
},
{
"epoch": 0.9160997732426304,
"grad_norm": 0.83203125,
"learning_rate": 1.4207317073170733e-05,
"loss": 2.1697,
"step": 101
},
{
"epoch": 0.9251700680272109,
"grad_norm": 0.72265625,
"learning_rate": 1.4146341463414635e-05,
"loss": 2.213,
"step": 102
},
{
"epoch": 0.9342403628117913,
"grad_norm": 0.73828125,
"learning_rate": 1.4085365853658537e-05,
"loss": 2.2039,
"step": 103
},
{
"epoch": 0.9433106575963719,
"grad_norm": 0.82421875,
"learning_rate": 1.402439024390244e-05,
"loss": 2.178,
"step": 104
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.765625,
"learning_rate": 1.3963414634146341e-05,
"loss": 2.118,
"step": 105
},
{
"epoch": 0.9614512471655329,
"grad_norm": 0.7265625,
"learning_rate": 1.3902439024390244e-05,
"loss": 2.2182,
"step": 106
},
{
"epoch": 0.9705215419501134,
"grad_norm": 0.75,
"learning_rate": 1.3841463414634146e-05,
"loss": 2.1893,
"step": 107
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.75390625,
"learning_rate": 1.378048780487805e-05,
"loss": 2.2261,
"step": 108
},
{
"epoch": 0.9886621315192744,
"grad_norm": 0.8125,
"learning_rate": 1.3719512195121953e-05,
"loss": 2.1841,
"step": 109
},
{
"epoch": 0.9977324263038548,
"grad_norm": 0.9140625,
"learning_rate": 1.3658536585365855e-05,
"loss": 2.2254,
"step": 110
},
{
"epoch": 1.0,
"grad_norm": 1.5859375,
"learning_rate": 1.3597560975609757e-05,
"loss": 2.2042,
"step": 111
},
{
"epoch": 1.0,
"eval_loss": 2.2069785594940186,
"eval_model_preparation_time": 0.0172,
"eval_runtime": 11.0732,
"eval_samples_per_second": 17.7,
"eval_steps_per_second": 8.85,
"step": 111
},
{
"epoch": 1.0090702947845804,
"grad_norm": 0.79296875,
"learning_rate": 1.3536585365853661e-05,
"loss": 2.1678,
"step": 112
},
{
"epoch": 1.018140589569161,
"grad_norm": 0.8125,
"learning_rate": 1.3475609756097563e-05,
"loss": 2.2223,
"step": 113
},
{
"epoch": 1.0272108843537415,
"grad_norm": 0.71875,
"learning_rate": 1.3414634146341466e-05,
"loss": 2.1223,
"step": 114
},
{
"epoch": 1.036281179138322,
"grad_norm": 0.921875,
"learning_rate": 1.3353658536585368e-05,
"loss": 2.1698,
"step": 115
},
{
"epoch": 1.0453514739229024,
"grad_norm": 0.79296875,
"learning_rate": 1.329268292682927e-05,
"loss": 2.105,
"step": 116
},
{
"epoch": 1.054421768707483,
"grad_norm": 0.75,
"learning_rate": 1.3231707317073172e-05,
"loss": 2.1412,
"step": 117
},
{
"epoch": 1.0634920634920635,
"grad_norm": 0.78125,
"learning_rate": 1.3170731707317076e-05,
"loss": 2.1615,
"step": 118
},
{
"epoch": 1.072562358276644,
"grad_norm": 0.765625,
"learning_rate": 1.3109756097560978e-05,
"loss": 2.154,
"step": 119
},
{
"epoch": 1.0816326530612246,
"grad_norm": 0.84765625,
"learning_rate": 1.304878048780488e-05,
"loss": 2.1844,
"step": 120
},
{
"epoch": 1.090702947845805,
"grad_norm": 0.82421875,
"learning_rate": 1.2987804878048782e-05,
"loss": 2.1283,
"step": 121
},
{
"epoch": 1.0997732426303855,
"grad_norm": 0.75,
"learning_rate": 1.2926829268292684e-05,
"loss": 2.0554,
"step": 122
},
{
"epoch": 1.1088435374149659,
"grad_norm": 0.81640625,
"learning_rate": 1.2865853658536586e-05,
"loss": 2.151,
"step": 123
},
{
"epoch": 1.1179138321995465,
"grad_norm": 0.75390625,
"learning_rate": 1.2804878048780488e-05,
"loss": 2.1076,
"step": 124
},
{
"epoch": 1.126984126984127,
"grad_norm": 0.79296875,
"learning_rate": 1.2743902439024392e-05,
"loss": 2.1403,
"step": 125
},
{
"epoch": 1.1360544217687074,
"grad_norm": 0.94921875,
"learning_rate": 1.2682926829268294e-05,
"loss": 2.1241,
"step": 126
},
{
"epoch": 1.145124716553288,
"grad_norm": 0.94921875,
"learning_rate": 1.2621951219512196e-05,
"loss": 2.1536,
"step": 127
},
{
"epoch": 1.1541950113378685,
"grad_norm": 0.953125,
"learning_rate": 1.2560975609756098e-05,
"loss": 2.1969,
"step": 128
},
{
"epoch": 1.163265306122449,
"grad_norm": 0.828125,
"learning_rate": 1.25e-05,
"loss": 2.15,
"step": 129
},
{
"epoch": 1.1723356009070294,
"grad_norm": 0.80078125,
"learning_rate": 1.2439024390243903e-05,
"loss": 2.1331,
"step": 130
},
{
"epoch": 1.18140589569161,
"grad_norm": 0.93359375,
"learning_rate": 1.2378048780487807e-05,
"loss": 2.1181,
"step": 131
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.77734375,
"learning_rate": 1.2317073170731709e-05,
"loss": 2.1217,
"step": 132
},
{
"epoch": 1.199546485260771,
"grad_norm": 0.7578125,
"learning_rate": 1.225609756097561e-05,
"loss": 2.1555,
"step": 133
},
{
"epoch": 1.2086167800453516,
"grad_norm": 0.7265625,
"learning_rate": 1.2195121951219513e-05,
"loss": 2.0599,
"step": 134
},
{
"epoch": 1.217687074829932,
"grad_norm": 0.84375,
"learning_rate": 1.2134146341463415e-05,
"loss": 2.1798,
"step": 135
},
{
"epoch": 1.2267573696145124,
"grad_norm": 0.8125,
"learning_rate": 1.2073170731707317e-05,
"loss": 2.156,
"step": 136
},
{
"epoch": 1.235827664399093,
"grad_norm": 0.84765625,
"learning_rate": 1.2012195121951221e-05,
"loss": 2.1407,
"step": 137
},
{
"epoch": 1.2448979591836735,
"grad_norm": 0.79296875,
"learning_rate": 1.1951219512195123e-05,
"loss": 2.1838,
"step": 138
},
{
"epoch": 1.253968253968254,
"grad_norm": 0.81640625,
"learning_rate": 1.1890243902439025e-05,
"loss": 2.1762,
"step": 139
},
{
"epoch": 1.2630385487528344,
"grad_norm": 0.796875,
"learning_rate": 1.1829268292682927e-05,
"loss": 2.0928,
"step": 140
},
{
"epoch": 1.272108843537415,
"grad_norm": 0.82421875,
"learning_rate": 1.176829268292683e-05,
"loss": 2.1282,
"step": 141
},
{
"epoch": 1.2811791383219955,
"grad_norm": 0.91015625,
"learning_rate": 1.1707317073170731e-05,
"loss": 2.1926,
"step": 142
},
{
"epoch": 1.290249433106576,
"grad_norm": 0.82421875,
"learning_rate": 1.1646341463414634e-05,
"loss": 2.1752,
"step": 143
},
{
"epoch": 1.2993197278911564,
"grad_norm": 0.82421875,
"learning_rate": 1.1585365853658537e-05,
"loss": 2.1733,
"step": 144
},
{
"epoch": 1.308390022675737,
"grad_norm": 0.83203125,
"learning_rate": 1.152439024390244e-05,
"loss": 2.1226,
"step": 145
},
{
"epoch": 1.3174603174603174,
"grad_norm": 0.87109375,
"learning_rate": 1.1463414634146342e-05,
"loss": 2.0981,
"step": 146
},
{
"epoch": 1.3265306122448979,
"grad_norm": 0.890625,
"learning_rate": 1.1402439024390244e-05,
"loss": 2.2364,
"step": 147
},
{
"epoch": 1.3356009070294785,
"grad_norm": 0.7890625,
"learning_rate": 1.1341463414634146e-05,
"loss": 2.157,
"step": 148
},
{
"epoch": 1.344671201814059,
"grad_norm": 0.8828125,
"learning_rate": 1.1280487804878048e-05,
"loss": 2.1118,
"step": 149
},
{
"epoch": 1.3537414965986394,
"grad_norm": 0.828125,
"learning_rate": 1.1219512195121953e-05,
"loss": 2.0814,
"step": 150
},
{
"epoch": 1.36281179138322,
"grad_norm": 0.8984375,
"learning_rate": 1.1158536585365856e-05,
"loss": 2.1626,
"step": 151
},
{
"epoch": 1.3718820861678005,
"grad_norm": 0.90234375,
"learning_rate": 1.1097560975609758e-05,
"loss": 2.1095,
"step": 152
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.83984375,
"learning_rate": 1.103658536585366e-05,
"loss": 2.1176,
"step": 153
},
{
"epoch": 1.3900226757369616,
"grad_norm": 0.83984375,
"learning_rate": 1.0975609756097562e-05,
"loss": 2.1273,
"step": 154
},
{
"epoch": 1.399092970521542,
"grad_norm": 0.82421875,
"learning_rate": 1.0914634146341466e-05,
"loss": 2.1309,
"step": 155
},
{
"epoch": 1.4081632653061225,
"grad_norm": 0.83203125,
"learning_rate": 1.0853658536585368e-05,
"loss": 2.1036,
"step": 156
},
{
"epoch": 1.417233560090703,
"grad_norm": 0.80859375,
"learning_rate": 1.079268292682927e-05,
"loss": 2.1404,
"step": 157
},
{
"epoch": 1.4263038548752833,
"grad_norm": 0.8046875,
"learning_rate": 1.0731707317073172e-05,
"loss": 2.1466,
"step": 158
},
{
"epoch": 1.435374149659864,
"grad_norm": 0.80859375,
"learning_rate": 1.0670731707317074e-05,
"loss": 2.1523,
"step": 159
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.8515625,
"learning_rate": 1.0609756097560976e-05,
"loss": 2.1296,
"step": 160
},
{
"epoch": 1.4535147392290249,
"grad_norm": 0.90234375,
"learning_rate": 1.054878048780488e-05,
"loss": 2.1349,
"step": 161
},
{
"epoch": 1.4625850340136055,
"grad_norm": 0.83984375,
"learning_rate": 1.0487804878048782e-05,
"loss": 2.1676,
"step": 162
},
{
"epoch": 1.471655328798186,
"grad_norm": 1.171875,
"learning_rate": 1.0426829268292684e-05,
"loss": 2.1887,
"step": 163
},
{
"epoch": 1.4807256235827664,
"grad_norm": 0.83984375,
"learning_rate": 1.0365853658536586e-05,
"loss": 2.099,
"step": 164
},
{
"epoch": 1.489795918367347,
"grad_norm": 1.0078125,
"learning_rate": 1.0304878048780489e-05,
"loss": 2.1318,
"step": 165
},
{
"epoch": 1.4988662131519275,
"grad_norm": 0.84375,
"learning_rate": 1.024390243902439e-05,
"loss": 2.08,
"step": 166
},
{
"epoch": 1.507936507936508,
"grad_norm": 0.8515625,
"learning_rate": 1.0182926829268294e-05,
"loss": 2.1447,
"step": 167
},
{
"epoch": 1.5170068027210886,
"grad_norm": 0.8984375,
"learning_rate": 1.0121951219512197e-05,
"loss": 2.1101,
"step": 168
},
{
"epoch": 1.5260770975056688,
"grad_norm": 0.875,
"learning_rate": 1.0060975609756099e-05,
"loss": 2.1164,
"step": 169
},
{
"epoch": 1.5351473922902494,
"grad_norm": 0.90234375,
"learning_rate": 1e-05,
"loss": 2.0865,
"step": 170
},
{
"epoch": 1.54421768707483,
"grad_norm": 0.87890625,
"learning_rate": 9.939024390243903e-06,
"loss": 2.0662,
"step": 171
},
{
"epoch": 1.5532879818594103,
"grad_norm": 0.83203125,
"learning_rate": 9.878048780487805e-06,
"loss": 2.0951,
"step": 172
},
{
"epoch": 1.562358276643991,
"grad_norm": 0.81640625,
"learning_rate": 9.817073170731707e-06,
"loss": 2.1146,
"step": 173
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.78515625,
"learning_rate": 9.756097560975611e-06,
"loss": 2.0241,
"step": 174
},
{
"epoch": 1.5804988662131518,
"grad_norm": 0.80859375,
"learning_rate": 9.695121951219513e-06,
"loss": 2.0803,
"step": 175
},
{
"epoch": 1.5895691609977325,
"grad_norm": 0.83984375,
"learning_rate": 9.634146341463415e-06,
"loss": 2.0791,
"step": 176
},
{
"epoch": 1.598639455782313,
"grad_norm": 0.8984375,
"learning_rate": 9.573170731707317e-06,
"loss": 2.1377,
"step": 177
},
{
"epoch": 1.6077097505668934,
"grad_norm": 0.84375,
"learning_rate": 9.51219512195122e-06,
"loss": 2.131,
"step": 178
},
{
"epoch": 1.616780045351474,
"grad_norm": 0.83203125,
"learning_rate": 9.451219512195122e-06,
"loss": 2.1134,
"step": 179
},
{
"epoch": 1.6258503401360545,
"grad_norm": 0.83984375,
"learning_rate": 9.390243902439025e-06,
"loss": 2.1256,
"step": 180
},
{
"epoch": 1.6349206349206349,
"grad_norm": 0.8125,
"learning_rate": 9.329268292682927e-06,
"loss": 2.0204,
"step": 181
},
{
"epoch": 1.6439909297052155,
"grad_norm": 0.82421875,
"learning_rate": 9.268292682926831e-06,
"loss": 2.0775,
"step": 182
},
{
"epoch": 1.6530612244897958,
"grad_norm": 0.88671875,
"learning_rate": 9.207317073170733e-06,
"loss": 2.1455,
"step": 183
},
{
"epoch": 1.6621315192743764,
"grad_norm": 0.91015625,
"learning_rate": 9.146341463414635e-06,
"loss": 2.084,
"step": 184
},
{
"epoch": 1.671201814058957,
"grad_norm": 0.76953125,
"learning_rate": 9.085365853658538e-06,
"loss": 2.0833,
"step": 185
},
{
"epoch": 1.6802721088435373,
"grad_norm": 0.81640625,
"learning_rate": 9.02439024390244e-06,
"loss": 2.1452,
"step": 186
},
{
"epoch": 1.689342403628118,
"grad_norm": 0.9375,
"learning_rate": 8.963414634146342e-06,
"loss": 2.1227,
"step": 187
},
{
"epoch": 1.6984126984126984,
"grad_norm": 0.9140625,
"learning_rate": 8.902439024390244e-06,
"loss": 2.0747,
"step": 188
},
{
"epoch": 1.7074829931972788,
"grad_norm": 0.8203125,
"learning_rate": 8.841463414634148e-06,
"loss": 2.0927,
"step": 189
},
{
"epoch": 1.7165532879818595,
"grad_norm": 0.77734375,
"learning_rate": 8.78048780487805e-06,
"loss": 2.1195,
"step": 190
},
{
"epoch": 1.72562358276644,
"grad_norm": 0.89453125,
"learning_rate": 8.719512195121952e-06,
"loss": 2.0527,
"step": 191
},
{
"epoch": 1.7346938775510203,
"grad_norm": 0.9296875,
"learning_rate": 8.658536585365854e-06,
"loss": 2.1034,
"step": 192
},
{
"epoch": 1.743764172335601,
"grad_norm": 0.90234375,
"learning_rate": 8.597560975609756e-06,
"loss": 2.1175,
"step": 193
},
{
"epoch": 1.7528344671201814,
"grad_norm": 0.8828125,
"learning_rate": 8.536585365853658e-06,
"loss": 2.0906,
"step": 194
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.890625,
"learning_rate": 8.475609756097562e-06,
"loss": 2.1299,
"step": 195
},
{
"epoch": 1.7709750566893425,
"grad_norm": 0.9609375,
"learning_rate": 8.414634146341464e-06,
"loss": 2.1131,
"step": 196
},
{
"epoch": 1.780045351473923,
"grad_norm": 0.890625,
"learning_rate": 8.353658536585366e-06,
"loss": 2.1026,
"step": 197
},
{
"epoch": 1.7891156462585034,
"grad_norm": 0.8671875,
"learning_rate": 8.292682926829268e-06,
"loss": 2.0937,
"step": 198
},
{
"epoch": 1.798185941043084,
"grad_norm": 0.8046875,
"learning_rate": 8.23170731707317e-06,
"loss": 2.161,
"step": 199
},
{
"epoch": 1.8072562358276643,
"grad_norm": 0.86328125,
"learning_rate": 8.170731707317073e-06,
"loss": 2.0945,
"step": 200
},
{
"epoch": 1.816326530612245,
"grad_norm": 0.83984375,
"learning_rate": 8.109756097560977e-06,
"loss": 2.1093,
"step": 201
},
{
"epoch": 1.8253968253968254,
"grad_norm": 0.98828125,
"learning_rate": 8.048780487804879e-06,
"loss": 2.1666,
"step": 202
},
{
"epoch": 1.8344671201814058,
"grad_norm": 0.8671875,
"learning_rate": 7.98780487804878e-06,
"loss": 2.123,
"step": 203
},
{
"epoch": 1.8435374149659864,
"grad_norm": 0.87109375,
"learning_rate": 7.926829268292685e-06,
"loss": 2.0771,
"step": 204
},
{
"epoch": 1.8526077097505669,
"grad_norm": 0.8984375,
"learning_rate": 7.865853658536587e-06,
"loss": 2.1275,
"step": 205
},
{
"epoch": 1.8616780045351473,
"grad_norm": 0.8046875,
"learning_rate": 7.804878048780489e-06,
"loss": 2.0512,
"step": 206
},
{
"epoch": 1.870748299319728,
"grad_norm": 0.88671875,
"learning_rate": 7.743902439024391e-06,
"loss": 2.0877,
"step": 207
},
{
"epoch": 1.8798185941043084,
"grad_norm": 0.85546875,
"learning_rate": 7.682926829268293e-06,
"loss": 2.1309,
"step": 208
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.84375,
"learning_rate": 7.621951219512196e-06,
"loss": 2.038,
"step": 209
},
{
"epoch": 1.8979591836734695,
"grad_norm": 0.83203125,
"learning_rate": 7.560975609756098e-06,
"loss": 2.1102,
"step": 210
},
{
"epoch": 1.90702947845805,
"grad_norm": 0.875,
"learning_rate": 7.500000000000001e-06,
"loss": 2.0963,
"step": 211
},
{
"epoch": 1.9160997732426304,
"grad_norm": 0.890625,
"learning_rate": 7.439024390243903e-06,
"loss": 2.0508,
"step": 212
},
{
"epoch": 1.925170068027211,
"grad_norm": 0.875,
"learning_rate": 7.378048780487805e-06,
"loss": 2.1489,
"step": 213
},
{
"epoch": 1.9342403628117912,
"grad_norm": 0.80859375,
"learning_rate": 7.317073170731707e-06,
"loss": 2.0664,
"step": 214
},
{
"epoch": 1.943310657596372,
"grad_norm": 0.90625,
"learning_rate": 7.25609756097561e-06,
"loss": 2.0465,
"step": 215
},
{
"epoch": 1.9523809523809523,
"grad_norm": 0.83984375,
"learning_rate": 7.1951219512195125e-06,
"loss": 2.0658,
"step": 216
},
{
"epoch": 1.9614512471655328,
"grad_norm": 0.8359375,
"learning_rate": 7.1341463414634146e-06,
"loss": 2.1214,
"step": 217
},
{
"epoch": 1.9705215419501134,
"grad_norm": 0.875,
"learning_rate": 7.0731707317073175e-06,
"loss": 2.1,
"step": 218
},
{
"epoch": 1.9795918367346939,
"grad_norm": 0.96484375,
"learning_rate": 7.01219512195122e-06,
"loss": 2.1089,
"step": 219
},
{
"epoch": 1.9886621315192743,
"grad_norm": 0.890625,
"learning_rate": 6.951219512195122e-06,
"loss": 2.1077,
"step": 220
},
{
"epoch": 1.997732426303855,
"grad_norm": 0.859375,
"learning_rate": 6.890243902439025e-06,
"loss": 2.0788,
"step": 221
},
{
"epoch": 2.0,
"grad_norm": 2.265625,
"learning_rate": 6.829268292682928e-06,
"loss": 2.2393,
"step": 222
},
{
"epoch": 2.0,
"eval_loss": 2.139087677001953,
"eval_model_preparation_time": 0.0172,
"eval_runtime": 11.0668,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 8.855,
"step": 222
},
{
"epoch": 2.0090702947845807,
"grad_norm": 0.82421875,
"learning_rate": 6.768292682926831e-06,
"loss": 2.0251,
"step": 223
},
{
"epoch": 2.018140589569161,
"grad_norm": 0.828125,
"learning_rate": 6.707317073170733e-06,
"loss": 2.0424,
"step": 224
},
{
"epoch": 2.0272108843537415,
"grad_norm": 0.8203125,
"learning_rate": 6.646341463414635e-06,
"loss": 2.0415,
"step": 225
},
{
"epoch": 2.036281179138322,
"grad_norm": 0.84765625,
"learning_rate": 6.585365853658538e-06,
"loss": 2.0551,
"step": 226
},
{
"epoch": 2.0453514739229024,
"grad_norm": 0.9296875,
"learning_rate": 6.52439024390244e-06,
"loss": 2.1507,
"step": 227
},
{
"epoch": 2.054421768707483,
"grad_norm": 0.85546875,
"learning_rate": 6.463414634146342e-06,
"loss": 2.0214,
"step": 228
},
{
"epoch": 2.0634920634920633,
"grad_norm": 0.80078125,
"learning_rate": 6.402439024390244e-06,
"loss": 2.064,
"step": 229
},
{
"epoch": 2.072562358276644,
"grad_norm": 0.97265625,
"learning_rate": 6.341463414634147e-06,
"loss": 2.0992,
"step": 230
},
{
"epoch": 2.0816326530612246,
"grad_norm": 0.87109375,
"learning_rate": 6.280487804878049e-06,
"loss": 2.0473,
"step": 231
},
{
"epoch": 2.090702947845805,
"grad_norm": 0.8984375,
"learning_rate": 6.219512195121951e-06,
"loss": 2.1006,
"step": 232
},
{
"epoch": 2.0997732426303855,
"grad_norm": 0.91796875,
"learning_rate": 6.158536585365854e-06,
"loss": 2.1157,
"step": 233
},
{
"epoch": 2.108843537414966,
"grad_norm": 0.90625,
"learning_rate": 6.0975609756097564e-06,
"loss": 2.084,
"step": 234
},
{
"epoch": 2.1179138321995463,
"grad_norm": 1.0234375,
"learning_rate": 6.0365853658536585e-06,
"loss": 2.0984,
"step": 235
},
{
"epoch": 2.126984126984127,
"grad_norm": 0.85546875,
"learning_rate": 5.9756097560975615e-06,
"loss": 2.056,
"step": 236
},
{
"epoch": 2.1360544217687076,
"grad_norm": 0.88671875,
"learning_rate": 5.914634146341464e-06,
"loss": 2.0533,
"step": 237
},
{
"epoch": 2.145124716553288,
"grad_norm": 0.875,
"learning_rate": 5.853658536585366e-06,
"loss": 2.0605,
"step": 238
},
{
"epoch": 2.1541950113378685,
"grad_norm": 0.92578125,
"learning_rate": 5.792682926829269e-06,
"loss": 2.0865,
"step": 239
},
{
"epoch": 2.163265306122449,
"grad_norm": 0.8671875,
"learning_rate": 5.731707317073171e-06,
"loss": 2.0988,
"step": 240
},
{
"epoch": 2.1723356009070294,
"grad_norm": 0.921875,
"learning_rate": 5.670731707317073e-06,
"loss": 2.1342,
"step": 241
},
{
"epoch": 2.18140589569161,
"grad_norm": 0.84375,
"learning_rate": 5.609756097560977e-06,
"loss": 2.0634,
"step": 242
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.99609375,
"learning_rate": 5.548780487804879e-06,
"loss": 2.0547,
"step": 243
},
{
"epoch": 2.199546485260771,
"grad_norm": 0.86328125,
"learning_rate": 5.487804878048781e-06,
"loss": 2.1118,
"step": 244
},
{
"epoch": 2.2086167800453516,
"grad_norm": 0.8671875,
"learning_rate": 5.426829268292684e-06,
"loss": 2.0422,
"step": 245
},
{
"epoch": 2.2176870748299318,
"grad_norm": 0.87109375,
"learning_rate": 5.365853658536586e-06,
"loss": 2.0844,
"step": 246
},
{
"epoch": 2.2267573696145124,
"grad_norm": 0.85546875,
"learning_rate": 5.304878048780488e-06,
"loss": 2.0697,
"step": 247
},
{
"epoch": 2.235827664399093,
"grad_norm": 0.859375,
"learning_rate": 5.243902439024391e-06,
"loss": 2.0595,
"step": 248
},
{
"epoch": 2.2448979591836733,
"grad_norm": 0.86328125,
"learning_rate": 5.182926829268293e-06,
"loss": 2.0494,
"step": 249
},
{
"epoch": 2.253968253968254,
"grad_norm": 0.81640625,
"learning_rate": 5.121951219512195e-06,
"loss": 2.052,
"step": 250
},
{
"epoch": 2.2630385487528346,
"grad_norm": 0.94140625,
"learning_rate": 5.060975609756098e-06,
"loss": 2.0576,
"step": 251
},
{
"epoch": 2.272108843537415,
"grad_norm": 0.9765625,
"learning_rate": 5e-06,
"loss": 2.078,
"step": 252
},
{
"epoch": 2.2811791383219955,
"grad_norm": 0.84765625,
"learning_rate": 4.9390243902439025e-06,
"loss": 2.0474,
"step": 253
},
{
"epoch": 2.290249433106576,
"grad_norm": 0.921875,
"learning_rate": 4.8780487804878055e-06,
"loss": 2.0628,
"step": 254
},
{
"epoch": 2.2993197278911564,
"grad_norm": 0.8828125,
"learning_rate": 4.817073170731708e-06,
"loss": 2.0422,
"step": 255
},
{
"epoch": 2.308390022675737,
"grad_norm": 0.99609375,
"learning_rate": 4.75609756097561e-06,
"loss": 2.0495,
"step": 256
},
{
"epoch": 2.317460317460317,
"grad_norm": 0.84765625,
"learning_rate": 4.695121951219513e-06,
"loss": 2.1065,
"step": 257
},
{
"epoch": 2.326530612244898,
"grad_norm": 0.84375,
"learning_rate": 4.634146341463416e-06,
"loss": 2.007,
"step": 258
},
{
"epoch": 2.3356009070294785,
"grad_norm": 0.9375,
"learning_rate": 4.573170731707318e-06,
"loss": 2.0885,
"step": 259
},
{
"epoch": 2.3446712018140587,
"grad_norm": 0.90234375,
"learning_rate": 4.51219512195122e-06,
"loss": 2.035,
"step": 260
},
{
"epoch": 2.3537414965986394,
"grad_norm": 0.9140625,
"learning_rate": 4.451219512195122e-06,
"loss": 2.0861,
"step": 261
},
{
"epoch": 2.36281179138322,
"grad_norm": 0.875,
"learning_rate": 4.390243902439025e-06,
"loss": 2.0555,
"step": 262
},
{
"epoch": 2.3718820861678003,
"grad_norm": 0.89453125,
"learning_rate": 4.329268292682927e-06,
"loss": 2.0892,
"step": 263
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.87890625,
"learning_rate": 4.268292682926829e-06,
"loss": 2.0651,
"step": 264
},
{
"epoch": 2.3900226757369616,
"grad_norm": 0.91015625,
"learning_rate": 4.207317073170732e-06,
"loss": 2.0115,
"step": 265
},
{
"epoch": 2.399092970521542,
"grad_norm": 0.921875,
"learning_rate": 4.146341463414634e-06,
"loss": 2.0983,
"step": 266
},
{
"epoch": 2.4081632653061225,
"grad_norm": 0.8828125,
"learning_rate": 4.085365853658536e-06,
"loss": 2.0977,
"step": 267
},
{
"epoch": 2.417233560090703,
"grad_norm": 0.88671875,
"learning_rate": 4.024390243902439e-06,
"loss": 1.9982,
"step": 268
},
{
"epoch": 2.4263038548752833,
"grad_norm": 0.86328125,
"learning_rate": 3.963414634146342e-06,
"loss": 2.0723,
"step": 269
},
{
"epoch": 2.435374149659864,
"grad_norm": 0.9296875,
"learning_rate": 3.902439024390244e-06,
"loss": 2.0516,
"step": 270
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.84765625,
"learning_rate": 3.8414634146341465e-06,
"loss": 2.0384,
"step": 271
},
{
"epoch": 2.453514739229025,
"grad_norm": 0.921875,
"learning_rate": 3.780487804878049e-06,
"loss": 2.0291,
"step": 272
},
{
"epoch": 2.4625850340136055,
"grad_norm": 0.859375,
"learning_rate": 3.7195121951219516e-06,
"loss": 2.0624,
"step": 273
},
{
"epoch": 2.471655328798186,
"grad_norm": 0.875,
"learning_rate": 3.6585365853658537e-06,
"loss": 2.0481,
"step": 274
},
{
"epoch": 2.4807256235827664,
"grad_norm": 0.84375,
"learning_rate": 3.5975609756097562e-06,
"loss": 2.0663,
"step": 275
},
{
"epoch": 2.489795918367347,
"grad_norm": 0.88671875,
"learning_rate": 3.5365853658536588e-06,
"loss": 2.0932,
"step": 276
},
{
"epoch": 2.4988662131519273,
"grad_norm": 0.91796875,
"learning_rate": 3.475609756097561e-06,
"loss": 2.0711,
"step": 277
},
{
"epoch": 2.507936507936508,
"grad_norm": 0.8828125,
"learning_rate": 3.414634146341464e-06,
"loss": 2.08,
"step": 278
},
{
"epoch": 2.5170068027210886,
"grad_norm": 0.984375,
"learning_rate": 3.3536585365853664e-06,
"loss": 2.0578,
"step": 279
},
{
"epoch": 2.526077097505669,
"grad_norm": 0.85546875,
"learning_rate": 3.292682926829269e-06,
"loss": 2.0657,
"step": 280
},
{
"epoch": 2.5351473922902494,
"grad_norm": 0.83203125,
"learning_rate": 3.231707317073171e-06,
"loss": 2.0596,
"step": 281
},
{
"epoch": 2.54421768707483,
"grad_norm": 0.87890625,
"learning_rate": 3.1707317073170736e-06,
"loss": 2.0006,
"step": 282
},
{
"epoch": 2.5532879818594103,
"grad_norm": 0.87890625,
"learning_rate": 3.1097560975609757e-06,
"loss": 2.1218,
"step": 283
},
{
"epoch": 2.562358276643991,
"grad_norm": 0.8828125,
"learning_rate": 3.0487804878048782e-06,
"loss": 1.9871,
"step": 284
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.8515625,
"learning_rate": 2.9878048780487808e-06,
"loss": 2.0153,
"step": 285
},
{
"epoch": 2.580498866213152,
"grad_norm": 0.828125,
"learning_rate": 2.926829268292683e-06,
"loss": 2.0725,
"step": 286
},
{
"epoch": 2.5895691609977325,
"grad_norm": 0.875,
"learning_rate": 2.8658536585365854e-06,
"loss": 2.1162,
"step": 287
},
{
"epoch": 2.5986394557823127,
"grad_norm": 0.8671875,
"learning_rate": 2.8048780487804884e-06,
"loss": 2.0886,
"step": 288
},
{
"epoch": 2.6077097505668934,
"grad_norm": 0.90625,
"learning_rate": 2.7439024390243905e-06,
"loss": 2.0458,
"step": 289
},
{
"epoch": 2.616780045351474,
"grad_norm": 0.8671875,
"learning_rate": 2.682926829268293e-06,
"loss": 2.0301,
"step": 290
},
{
"epoch": 2.6258503401360542,
"grad_norm": 0.8671875,
"learning_rate": 2.6219512195121956e-06,
"loss": 2.0675,
"step": 291
},
{
"epoch": 2.634920634920635,
"grad_norm": 0.90234375,
"learning_rate": 2.5609756097560977e-06,
"loss": 2.0651,
"step": 292
},
{
"epoch": 2.6439909297052155,
"grad_norm": 0.84375,
"learning_rate": 2.5e-06,
"loss": 2.0602,
"step": 293
},
{
"epoch": 2.6530612244897958,
"grad_norm": 0.859375,
"learning_rate": 2.4390243902439027e-06,
"loss": 2.061,
"step": 294
},
{
"epoch": 2.6621315192743764,
"grad_norm": 0.82421875,
"learning_rate": 2.378048780487805e-06,
"loss": 2.0801,
"step": 295
},
{
"epoch": 2.671201814058957,
"grad_norm": 0.92578125,
"learning_rate": 2.317073170731708e-06,
"loss": 2.0709,
"step": 296
},
{
"epoch": 2.6802721088435373,
"grad_norm": 0.87890625,
"learning_rate": 2.25609756097561e-06,
"loss": 2.0765,
"step": 297
},
{
"epoch": 2.689342403628118,
"grad_norm": 0.859375,
"learning_rate": 2.1951219512195125e-06,
"loss": 2.0208,
"step": 298
},
{
"epoch": 2.6984126984126986,
"grad_norm": 0.87109375,
"learning_rate": 2.1341463414634146e-06,
"loss": 2.0137,
"step": 299
},
{
"epoch": 2.707482993197279,
"grad_norm": 0.859375,
"learning_rate": 2.073170731707317e-06,
"loss": 2.0745,
"step": 300
},
{
"epoch": 2.7165532879818595,
"grad_norm": 0.9453125,
"learning_rate": 2.0121951219512197e-06,
"loss": 2.0442,
"step": 301
},
{
"epoch": 2.72562358276644,
"grad_norm": 0.93359375,
"learning_rate": 1.951219512195122e-06,
"loss": 2.0374,
"step": 302
},
{
"epoch": 2.7346938775510203,
"grad_norm": 0.90625,
"learning_rate": 1.8902439024390245e-06,
"loss": 2.1019,
"step": 303
},
{
"epoch": 2.743764172335601,
"grad_norm": 0.92578125,
"learning_rate": 1.8292682926829268e-06,
"loss": 2.1039,
"step": 304
},
{
"epoch": 2.7528344671201816,
"grad_norm": 0.83984375,
"learning_rate": 1.7682926829268294e-06,
"loss": 2.0586,
"step": 305
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.85546875,
"learning_rate": 1.707317073170732e-06,
"loss": 2.0117,
"step": 306
},
{
"epoch": 2.7709750566893425,
"grad_norm": 0.875,
"learning_rate": 1.6463414634146345e-06,
"loss": 2.0583,
"step": 307
},
{
"epoch": 2.780045351473923,
"grad_norm": 0.97265625,
"learning_rate": 1.5853658536585368e-06,
"loss": 2.0683,
"step": 308
},
{
"epoch": 2.7891156462585034,
"grad_norm": 0.83984375,
"learning_rate": 1.5243902439024391e-06,
"loss": 2.0711,
"step": 309
},
{
"epoch": 2.798185941043084,
"grad_norm": 0.83984375,
"learning_rate": 1.4634146341463414e-06,
"loss": 2.0822,
"step": 310
},
{
"epoch": 2.8072562358276643,
"grad_norm": 0.96875,
"learning_rate": 1.4024390243902442e-06,
"loss": 2.0757,
"step": 311
},
{
"epoch": 2.816326530612245,
"grad_norm": 0.8515625,
"learning_rate": 1.3414634146341465e-06,
"loss": 2.1005,
"step": 312
},
{
"epoch": 2.825396825396825,
"grad_norm": 0.875,
"learning_rate": 1.2804878048780488e-06,
"loss": 2.0856,
"step": 313
},
{
"epoch": 2.834467120181406,
"grad_norm": 0.9375,
"learning_rate": 1.2195121951219514e-06,
"loss": 2.0978,
"step": 314
},
{
"epoch": 2.8435374149659864,
"grad_norm": 0.83203125,
"learning_rate": 1.158536585365854e-06,
"loss": 2.05,
"step": 315
},
{
"epoch": 2.8526077097505667,
"grad_norm": 0.86328125,
"learning_rate": 1.0975609756097562e-06,
"loss": 2.064,
"step": 316
},
{
"epoch": 2.8616780045351473,
"grad_norm": 0.890625,
"learning_rate": 1.0365853658536586e-06,
"loss": 2.0545,
"step": 317
},
{
"epoch": 2.870748299319728,
"grad_norm": 0.98828125,
"learning_rate": 9.75609756097561e-07,
"loss": 2.108,
"step": 318
},
{
"epoch": 2.879818594104308,
"grad_norm": 0.84375,
"learning_rate": 9.146341463414634e-07,
"loss": 2.0705,
"step": 319
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.87109375,
"learning_rate": 8.53658536585366e-07,
"loss": 2.1163,
"step": 320
},
{
"epoch": 2.8979591836734695,
"grad_norm": 0.8671875,
"learning_rate": 7.926829268292684e-07,
"loss": 2.051,
"step": 321
},
{
"epoch": 2.9070294784580497,
"grad_norm": 0.94140625,
"learning_rate": 7.317073170731707e-07,
"loss": 2.0958,
"step": 322
},
{
"epoch": 2.9160997732426304,
"grad_norm": 0.875,
"learning_rate": 6.707317073170733e-07,
"loss": 2.1143,
"step": 323
},
{
"epoch": 2.925170068027211,
"grad_norm": 0.83984375,
"learning_rate": 6.097560975609757e-07,
"loss": 2.0107,
"step": 324
},
{
"epoch": 2.9342403628117912,
"grad_norm": 0.80859375,
"learning_rate": 5.487804878048781e-07,
"loss": 2.0394,
"step": 325
},
{
"epoch": 2.943310657596372,
"grad_norm": 0.83984375,
"learning_rate": 4.878048780487805e-07,
"loss": 2.0891,
"step": 326
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.80859375,
"learning_rate": 4.26829268292683e-07,
"loss": 2.007,
"step": 327
},
{
"epoch": 2.9614512471655328,
"grad_norm": 0.859375,
"learning_rate": 3.6585365853658536e-07,
"loss": 2.0731,
"step": 328
},
{
"epoch": 2.9705215419501134,
"grad_norm": 0.87109375,
"learning_rate": 3.0487804878048784e-07,
"loss": 2.0552,
"step": 329
},
{
"epoch": 2.979591836734694,
"grad_norm": 0.9453125,
"learning_rate": 2.439024390243903e-07,
"loss": 2.0833,
"step": 330
},
{
"epoch": 2.9886621315192743,
"grad_norm": 0.87890625,
"learning_rate": 1.8292682926829268e-07,
"loss": 2.0287,
"step": 331
},
{
"epoch": 2.997732426303855,
"grad_norm": 0.8359375,
"learning_rate": 1.2195121951219514e-07,
"loss": 2.0579,
"step": 332
},
{
"epoch": 3.0,
"grad_norm": 2.234375,
"learning_rate": 6.097560975609757e-08,
"loss": 2.0824,
"step": 333
}
],
"logging_steps": 1,
"max_steps": 333,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.870273289084211e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}