Files
openthoughts3_100k_qwen25_1…/trainer_state.json
ModelHub XC 55fc6b7911 初始化项目,由ModelHub XC社区提供模型
Model: mlfoundations-dev/openthoughts3_100k_qwen25_1b_bsz512_lr4e5_epochs5
Source: Original Platform
2026-06-08 15:16:14 +08:00

6868 lines
166 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.987212276214834,
"eval_steps": 500,
"global_step": 975,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005115089514066497,
"grad_norm": 2.9552626224042746,
"learning_rate": 4.0816326530612243e-07,
"loss": 1.4957,
"step": 1
},
{
"epoch": 0.010230179028132993,
"grad_norm": 2.9842329763367634,
"learning_rate": 8.163265306122449e-07,
"loss": 1.4889,
"step": 2
},
{
"epoch": 0.015345268542199489,
"grad_norm": 2.9894673729833756,
"learning_rate": 1.2244897959183673e-06,
"loss": 1.4995,
"step": 3
},
{
"epoch": 0.020460358056265986,
"grad_norm": 2.9406348527530266,
"learning_rate": 1.6326530612244897e-06,
"loss": 1.4851,
"step": 4
},
{
"epoch": 0.02557544757033248,
"grad_norm": 2.884254781810192,
"learning_rate": 2.0408163265306125e-06,
"loss": 1.5193,
"step": 5
},
{
"epoch": 0.030690537084398978,
"grad_norm": 2.6620670413572483,
"learning_rate": 2.4489795918367347e-06,
"loss": 1.4843,
"step": 6
},
{
"epoch": 0.03580562659846547,
"grad_norm": 2.5965076114745362,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.4878,
"step": 7
},
{
"epoch": 0.04092071611253197,
"grad_norm": 2.0170252553614705,
"learning_rate": 3.2653061224489794e-06,
"loss": 1.4657,
"step": 8
},
{
"epoch": 0.04603580562659847,
"grad_norm": 1.9679168334183053,
"learning_rate": 3.6734693877551024e-06,
"loss": 1.4733,
"step": 9
},
{
"epoch": 0.05115089514066496,
"grad_norm": 1.8694892639210836,
"learning_rate": 4.081632653061225e-06,
"loss": 1.4641,
"step": 10
},
{
"epoch": 0.056265984654731455,
"grad_norm": 1.8685294422005332,
"learning_rate": 4.489795918367348e-06,
"loss": 1.4459,
"step": 11
},
{
"epoch": 0.061381074168797956,
"grad_norm": 1.8352337338017686,
"learning_rate": 4.897959183673469e-06,
"loss": 1.4347,
"step": 12
},
{
"epoch": 0.06649616368286446,
"grad_norm": 1.715529029874086,
"learning_rate": 5.306122448979593e-06,
"loss": 1.4253,
"step": 13
},
{
"epoch": 0.07161125319693094,
"grad_norm": 1.4603873855315377,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.4073,
"step": 14
},
{
"epoch": 0.07672634271099744,
"grad_norm": 1.8889329281960052,
"learning_rate": 6.122448979591837e-06,
"loss": 1.4073,
"step": 15
},
{
"epoch": 0.08184143222506395,
"grad_norm": 1.8256444813829467,
"learning_rate": 6.530612244897959e-06,
"loss": 1.402,
"step": 16
},
{
"epoch": 0.08695652173913043,
"grad_norm": 1.5107616554904217,
"learning_rate": 6.938775510204082e-06,
"loss": 1.4226,
"step": 17
},
{
"epoch": 0.09207161125319693,
"grad_norm": 1.2135260104218866,
"learning_rate": 7.346938775510205e-06,
"loss": 1.3806,
"step": 18
},
{
"epoch": 0.09718670076726342,
"grad_norm": 0.9783065195702848,
"learning_rate": 7.755102040816327e-06,
"loss": 1.3779,
"step": 19
},
{
"epoch": 0.10230179028132992,
"grad_norm": 1.0913652122620938,
"learning_rate": 8.16326530612245e-06,
"loss": 1.3479,
"step": 20
},
{
"epoch": 0.10741687979539642,
"grad_norm": 0.9002918394031234,
"learning_rate": 8.571428571428571e-06,
"loss": 1.3421,
"step": 21
},
{
"epoch": 0.11253196930946291,
"grad_norm": 0.8291547537993212,
"learning_rate": 8.979591836734695e-06,
"loss": 1.3199,
"step": 22
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.807288721347729,
"learning_rate": 9.387755102040818e-06,
"loss": 1.3344,
"step": 23
},
{
"epoch": 0.12276214833759591,
"grad_norm": 0.6809772051446031,
"learning_rate": 9.795918367346939e-06,
"loss": 1.3248,
"step": 24
},
{
"epoch": 0.1278772378516624,
"grad_norm": 0.6460286130501192,
"learning_rate": 1.0204081632653063e-05,
"loss": 1.3101,
"step": 25
},
{
"epoch": 0.1329923273657289,
"grad_norm": 0.7041553190299802,
"learning_rate": 1.0612244897959186e-05,
"loss": 1.2905,
"step": 26
},
{
"epoch": 0.13810741687979539,
"grad_norm": 0.5916151711963107,
"learning_rate": 1.1020408163265306e-05,
"loss": 1.2957,
"step": 27
},
{
"epoch": 0.1432225063938619,
"grad_norm": 0.4814903541835213,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.3205,
"step": 28
},
{
"epoch": 0.1483375959079284,
"grad_norm": 0.5453307778014714,
"learning_rate": 1.1836734693877552e-05,
"loss": 1.3054,
"step": 29
},
{
"epoch": 0.1534526854219949,
"grad_norm": 0.5416929825329082,
"learning_rate": 1.2244897959183674e-05,
"loss": 1.2876,
"step": 30
},
{
"epoch": 0.1585677749360614,
"grad_norm": 0.47860645294978243,
"learning_rate": 1.2653061224489798e-05,
"loss": 1.3007,
"step": 31
},
{
"epoch": 0.1636828644501279,
"grad_norm": 0.4486371377266475,
"learning_rate": 1.3061224489795918e-05,
"loss": 1.2954,
"step": 32
},
{
"epoch": 0.16879795396419436,
"grad_norm": 0.44450958377830624,
"learning_rate": 1.3469387755102042e-05,
"loss": 1.2676,
"step": 33
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.44087698545639353,
"learning_rate": 1.3877551020408165e-05,
"loss": 1.2574,
"step": 34
},
{
"epoch": 0.17902813299232737,
"grad_norm": 0.40237081708363603,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.2344,
"step": 35
},
{
"epoch": 0.18414322250639387,
"grad_norm": 0.36218165247096934,
"learning_rate": 1.469387755102041e-05,
"loss": 1.2553,
"step": 36
},
{
"epoch": 0.18925831202046037,
"grad_norm": 0.3828961442249037,
"learning_rate": 1.510204081632653e-05,
"loss": 1.2678,
"step": 37
},
{
"epoch": 0.19437340153452684,
"grad_norm": 0.37416837544698317,
"learning_rate": 1.5510204081632655e-05,
"loss": 1.2406,
"step": 38
},
{
"epoch": 0.19948849104859334,
"grad_norm": 0.373896446093631,
"learning_rate": 1.5918367346938776e-05,
"loss": 1.2431,
"step": 39
},
{
"epoch": 0.20460358056265984,
"grad_norm": 0.3008904471337201,
"learning_rate": 1.63265306122449e-05,
"loss": 1.2324,
"step": 40
},
{
"epoch": 0.20971867007672634,
"grad_norm": 0.30387006502929764,
"learning_rate": 1.673469387755102e-05,
"loss": 1.2238,
"step": 41
},
{
"epoch": 0.21483375959079284,
"grad_norm": 0.3165654505340095,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.2344,
"step": 42
},
{
"epoch": 0.21994884910485935,
"grad_norm": 0.28748071942522757,
"learning_rate": 1.7551020408163266e-05,
"loss": 1.2559,
"step": 43
},
{
"epoch": 0.22506393861892582,
"grad_norm": 0.2664919173232997,
"learning_rate": 1.795918367346939e-05,
"loss": 1.2501,
"step": 44
},
{
"epoch": 0.23017902813299232,
"grad_norm": 0.2716330693107337,
"learning_rate": 1.836734693877551e-05,
"loss": 1.235,
"step": 45
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.2758197763809488,
"learning_rate": 1.8775510204081636e-05,
"loss": 1.2463,
"step": 46
},
{
"epoch": 0.24040920716112532,
"grad_norm": 0.24892493271713195,
"learning_rate": 1.9183673469387756e-05,
"loss": 1.2415,
"step": 47
},
{
"epoch": 0.24552429667519182,
"grad_norm": 0.2755681817261238,
"learning_rate": 1.9591836734693877e-05,
"loss": 1.2186,
"step": 48
},
{
"epoch": 0.2506393861892583,
"grad_norm": 0.24083989397516156,
"learning_rate": 2e-05,
"loss": 1.2224,
"step": 49
},
{
"epoch": 0.2557544757033248,
"grad_norm": 0.24122206872487506,
"learning_rate": 2.0408163265306126e-05,
"loss": 1.2012,
"step": 50
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.25341193733481965,
"learning_rate": 2.0816326530612247e-05,
"loss": 1.2221,
"step": 51
},
{
"epoch": 0.2659846547314578,
"grad_norm": 0.22903243976655355,
"learning_rate": 2.122448979591837e-05,
"loss": 1.1924,
"step": 52
},
{
"epoch": 0.2710997442455243,
"grad_norm": 0.21677309278156626,
"learning_rate": 2.1632653061224492e-05,
"loss": 1.1889,
"step": 53
},
{
"epoch": 0.27621483375959077,
"grad_norm": 0.21471884542317143,
"learning_rate": 2.2040816326530613e-05,
"loss": 1.2008,
"step": 54
},
{
"epoch": 0.2813299232736573,
"grad_norm": 0.24214035073106985,
"learning_rate": 2.2448979591836734e-05,
"loss": 1.2144,
"step": 55
},
{
"epoch": 0.2864450127877238,
"grad_norm": 0.2498234706393149,
"learning_rate": 2.2857142857142858e-05,
"loss": 1.2106,
"step": 56
},
{
"epoch": 0.2915601023017903,
"grad_norm": 0.2946584201665625,
"learning_rate": 2.3265306122448982e-05,
"loss": 1.1915,
"step": 57
},
{
"epoch": 0.2966751918158568,
"grad_norm": 0.31814731087212356,
"learning_rate": 2.3673469387755103e-05,
"loss": 1.1919,
"step": 58
},
{
"epoch": 0.30179028132992325,
"grad_norm": 0.2535281231928839,
"learning_rate": 2.4081632653061227e-05,
"loss": 1.2135,
"step": 59
},
{
"epoch": 0.3069053708439898,
"grad_norm": 0.28231587803629354,
"learning_rate": 2.448979591836735e-05,
"loss": 1.2029,
"step": 60
},
{
"epoch": 0.31202046035805625,
"grad_norm": 0.2010585704472117,
"learning_rate": 2.4897959183673473e-05,
"loss": 1.1958,
"step": 61
},
{
"epoch": 0.3171355498721228,
"grad_norm": 0.2261450242992654,
"learning_rate": 2.5306122448979597e-05,
"loss": 1.1903,
"step": 62
},
{
"epoch": 0.32225063938618925,
"grad_norm": 0.23939206047592157,
"learning_rate": 2.5714285714285718e-05,
"loss": 1.1947,
"step": 63
},
{
"epoch": 0.3273657289002558,
"grad_norm": 0.2629186812479751,
"learning_rate": 2.6122448979591835e-05,
"loss": 1.1738,
"step": 64
},
{
"epoch": 0.33248081841432225,
"grad_norm": 0.25820871123390515,
"learning_rate": 2.653061224489796e-05,
"loss": 1.1918,
"step": 65
},
{
"epoch": 0.3375959079283887,
"grad_norm": 0.2860124405371331,
"learning_rate": 2.6938775510204084e-05,
"loss": 1.2015,
"step": 66
},
{
"epoch": 0.34271099744245526,
"grad_norm": 0.37417972707557,
"learning_rate": 2.7346938775510205e-05,
"loss": 1.1877,
"step": 67
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.5554510331993981,
"learning_rate": 2.775510204081633e-05,
"loss": 1.2075,
"step": 68
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.8032264545180317,
"learning_rate": 2.816326530612245e-05,
"loss": 1.1954,
"step": 69
},
{
"epoch": 0.35805626598465473,
"grad_norm": 0.8465110589364025,
"learning_rate": 2.8571428571428574e-05,
"loss": 1.1948,
"step": 70
},
{
"epoch": 0.3631713554987212,
"grad_norm": 0.6056567908053532,
"learning_rate": 2.89795918367347e-05,
"loss": 1.1608,
"step": 71
},
{
"epoch": 0.36828644501278773,
"grad_norm": 1.0339865612062735,
"learning_rate": 2.938775510204082e-05,
"loss": 1.198,
"step": 72
},
{
"epoch": 0.3734015345268542,
"grad_norm": 1.0360361783633478,
"learning_rate": 2.9795918367346944e-05,
"loss": 1.1873,
"step": 73
},
{
"epoch": 0.37851662404092073,
"grad_norm": 0.5781784281108936,
"learning_rate": 3.020408163265306e-05,
"loss": 1.1919,
"step": 74
},
{
"epoch": 0.3836317135549872,
"grad_norm": 0.9260593669478031,
"learning_rate": 3.061224489795918e-05,
"loss": 1.1672,
"step": 75
},
{
"epoch": 0.3887468030690537,
"grad_norm": 0.8443095651460022,
"learning_rate": 3.102040816326531e-05,
"loss": 1.1785,
"step": 76
},
{
"epoch": 0.3938618925831202,
"grad_norm": 0.5037173461965399,
"learning_rate": 3.142857142857143e-05,
"loss": 1.1696,
"step": 77
},
{
"epoch": 0.3989769820971867,
"grad_norm": 0.83134727018411,
"learning_rate": 3.183673469387755e-05,
"loss": 1.1913,
"step": 78
},
{
"epoch": 0.4040920716112532,
"grad_norm": 0.43515185597618067,
"learning_rate": 3.224489795918368e-05,
"loss": 1.1998,
"step": 79
},
{
"epoch": 0.4092071611253197,
"grad_norm": 0.5605879393832622,
"learning_rate": 3.26530612244898e-05,
"loss": 1.159,
"step": 80
},
{
"epoch": 0.4143222506393862,
"grad_norm": 0.5401839074014898,
"learning_rate": 3.306122448979592e-05,
"loss": 1.1614,
"step": 81
},
{
"epoch": 0.4194373401534527,
"grad_norm": 0.6023459415610618,
"learning_rate": 3.346938775510204e-05,
"loss": 1.1681,
"step": 82
},
{
"epoch": 0.42455242966751916,
"grad_norm": 0.5818804857404473,
"learning_rate": 3.387755102040817e-05,
"loss": 1.1598,
"step": 83
},
{
"epoch": 0.4296675191815857,
"grad_norm": 0.6214983408408011,
"learning_rate": 3.4285714285714284e-05,
"loss": 1.1719,
"step": 84
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.6889766714463987,
"learning_rate": 3.469387755102041e-05,
"loss": 1.1652,
"step": 85
},
{
"epoch": 0.4398976982097187,
"grad_norm": 0.5760202894156686,
"learning_rate": 3.510204081632653e-05,
"loss": 1.1662,
"step": 86
},
{
"epoch": 0.44501278772378516,
"grad_norm": 0.6061942284235254,
"learning_rate": 3.551020408163265e-05,
"loss": 1.1879,
"step": 87
},
{
"epoch": 0.45012787723785164,
"grad_norm": 0.4851908724925709,
"learning_rate": 3.591836734693878e-05,
"loss": 1.1804,
"step": 88
},
{
"epoch": 0.45524296675191817,
"grad_norm": 0.558284089379506,
"learning_rate": 3.63265306122449e-05,
"loss": 1.1667,
"step": 89
},
{
"epoch": 0.46035805626598464,
"grad_norm": 0.5249911631612906,
"learning_rate": 3.673469387755102e-05,
"loss": 1.1551,
"step": 90
},
{
"epoch": 0.46547314578005117,
"grad_norm": 0.6405503624893562,
"learning_rate": 3.714285714285715e-05,
"loss": 1.1732,
"step": 91
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.5784785007056862,
"learning_rate": 3.755102040816327e-05,
"loss": 1.1713,
"step": 92
},
{
"epoch": 0.47570332480818417,
"grad_norm": 0.8586793710160441,
"learning_rate": 3.795918367346939e-05,
"loss": 1.1749,
"step": 93
},
{
"epoch": 0.48081841432225064,
"grad_norm": 0.7501542028722769,
"learning_rate": 3.836734693877551e-05,
"loss": 1.1661,
"step": 94
},
{
"epoch": 0.4859335038363171,
"grad_norm": 0.5945696896064088,
"learning_rate": 3.8775510204081634e-05,
"loss": 1.1613,
"step": 95
},
{
"epoch": 0.49104859335038364,
"grad_norm": 0.7440586521483908,
"learning_rate": 3.9183673469387755e-05,
"loss": 1.1322,
"step": 96
},
{
"epoch": 0.4961636828644501,
"grad_norm": 1.114624130882715,
"learning_rate": 3.959183673469388e-05,
"loss": 1.1255,
"step": 97
},
{
"epoch": 0.5012787723785166,
"grad_norm": 1.7108401028613913,
"learning_rate": 4e-05,
"loss": 1.1813,
"step": 98
},
{
"epoch": 0.5063938618925832,
"grad_norm": 0.44659523303682175,
"learning_rate": 3.9999871678303026e-05,
"loss": 1.16,
"step": 99
},
{
"epoch": 0.5115089514066496,
"grad_norm": 1.9551355398623464,
"learning_rate": 3.999948671485876e-05,
"loss": 1.1552,
"step": 100
},
{
"epoch": 0.5166240409207161,
"grad_norm": 0.7446799337776974,
"learning_rate": 3.9998845114607106e-05,
"loss": 1.1362,
"step": 101
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.50554061275789,
"learning_rate": 3.99979468857812e-05,
"loss": 1.1578,
"step": 102
},
{
"epoch": 0.5268542199488491,
"grad_norm": 0.7841214706789117,
"learning_rate": 3.9996792039907254e-05,
"loss": 1.1512,
"step": 103
},
{
"epoch": 0.5319693094629157,
"grad_norm": 1.307842675480053,
"learning_rate": 3.999538059180445e-05,
"loss": 1.1637,
"step": 104
},
{
"epoch": 0.5370843989769821,
"grad_norm": 0.9212444211121106,
"learning_rate": 3.9993712559584736e-05,
"loss": 1.1395,
"step": 105
},
{
"epoch": 0.5421994884910486,
"grad_norm": 1.1124085438820968,
"learning_rate": 3.9991787964652576e-05,
"loss": 1.1416,
"step": 106
},
{
"epoch": 0.5473145780051151,
"grad_norm": 1.1650621890898196,
"learning_rate": 3.9989606831704704e-05,
"loss": 1.142,
"step": 107
},
{
"epoch": 0.5524296675191815,
"grad_norm": 0.7803938686688274,
"learning_rate": 3.998716918872979e-05,
"loss": 1.1412,
"step": 108
},
{
"epoch": 0.5575447570332481,
"grad_norm": 1.2099933733931747,
"learning_rate": 3.998447506700807e-05,
"loss": 1.119,
"step": 109
},
{
"epoch": 0.5626598465473146,
"grad_norm": 0.5996285589701097,
"learning_rate": 3.998152450111099e-05,
"loss": 1.1538,
"step": 110
},
{
"epoch": 0.5677749360613811,
"grad_norm": 1.0736932260251144,
"learning_rate": 3.9978317528900704e-05,
"loss": 1.1249,
"step": 111
},
{
"epoch": 0.5728900255754475,
"grad_norm": 0.5804871130419977,
"learning_rate": 3.9974854191529616e-05,
"loss": 1.1441,
"step": 112
},
{
"epoch": 0.578005115089514,
"grad_norm": 0.7862604118769216,
"learning_rate": 3.997113453343987e-05,
"loss": 1.1421,
"step": 113
},
{
"epoch": 0.5831202046035806,
"grad_norm": 0.6885271538338327,
"learning_rate": 3.996715860236275e-05,
"loss": 1.1343,
"step": 114
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.6788644129844003,
"learning_rate": 3.996292644931807e-05,
"loss": 1.1469,
"step": 115
},
{
"epoch": 0.5933503836317136,
"grad_norm": 0.684753398463379,
"learning_rate": 3.995843812861353e-05,
"loss": 1.1399,
"step": 116
},
{
"epoch": 0.59846547314578,
"grad_norm": 0.5126160753180327,
"learning_rate": 3.9953693697844036e-05,
"loss": 1.1149,
"step": 117
},
{
"epoch": 0.6035805626598465,
"grad_norm": 0.5713048245116996,
"learning_rate": 3.994869321789093e-05,
"loss": 1.1294,
"step": 118
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.4824046734986782,
"learning_rate": 3.994343675292121e-05,
"loss": 1.1194,
"step": 119
},
{
"epoch": 0.6138107416879796,
"grad_norm": 0.5772818973773468,
"learning_rate": 3.9937924370386733e-05,
"loss": 1.1382,
"step": 120
},
{
"epoch": 0.618925831202046,
"grad_norm": 0.5640558910780374,
"learning_rate": 3.9932156141023325e-05,
"loss": 1.1288,
"step": 121
},
{
"epoch": 0.6240409207161125,
"grad_norm": 0.6644942399025234,
"learning_rate": 3.992613213884989e-05,
"loss": 1.159,
"step": 122
},
{
"epoch": 0.629156010230179,
"grad_norm": 0.7622498053014113,
"learning_rate": 3.9919852441167426e-05,
"loss": 1.1412,
"step": 123
},
{
"epoch": 0.6342710997442456,
"grad_norm": 0.642371805679136,
"learning_rate": 3.99133171285581e-05,
"loss": 1.1547,
"step": 124
},
{
"epoch": 0.639386189258312,
"grad_norm": 0.47445759803876414,
"learning_rate": 3.9906526284884156e-05,
"loss": 1.1463,
"step": 125
},
{
"epoch": 0.6445012787723785,
"grad_norm": 0.3468568785754221,
"learning_rate": 3.989947999728683e-05,
"loss": 1.1473,
"step": 126
},
{
"epoch": 0.649616368286445,
"grad_norm": 0.3826230766226846,
"learning_rate": 3.98921783561853e-05,
"loss": 1.1523,
"step": 127
},
{
"epoch": 0.6547314578005116,
"grad_norm": 0.40385322488596853,
"learning_rate": 3.988462145527545e-05,
"loss": 1.1275,
"step": 128
},
{
"epoch": 0.659846547314578,
"grad_norm": 0.5075478796367268,
"learning_rate": 3.9876809391528724e-05,
"loss": 1.1392,
"step": 129
},
{
"epoch": 0.6649616368286445,
"grad_norm": 0.5208684496947872,
"learning_rate": 3.986874226519085e-05,
"loss": 1.14,
"step": 130
},
{
"epoch": 0.670076726342711,
"grad_norm": 0.5624315662158746,
"learning_rate": 3.986042017978055e-05,
"loss": 1.1099,
"step": 131
},
{
"epoch": 0.6751918158567775,
"grad_norm": 0.5780512451990953,
"learning_rate": 3.985184324208826e-05,
"loss": 1.1511,
"step": 132
},
{
"epoch": 0.680306905370844,
"grad_norm": 0.6247395333389373,
"learning_rate": 3.984301156217467e-05,
"loss": 1.1515,
"step": 133
},
{
"epoch": 0.6854219948849105,
"grad_norm": 0.5133917066342321,
"learning_rate": 3.9833925253369415e-05,
"loss": 1.1117,
"step": 134
},
{
"epoch": 0.690537084398977,
"grad_norm": 0.3219649704869207,
"learning_rate": 3.982458443226955e-05,
"loss": 1.1305,
"step": 135
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.2522321720635661,
"learning_rate": 3.9814989218738074e-05,
"loss": 1.129,
"step": 136
},
{
"epoch": 0.7007672634271099,
"grad_norm": 0.2980429052508869,
"learning_rate": 3.980513973590239e-05,
"loss": 1.1332,
"step": 137
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.34290982806032505,
"learning_rate": 3.9795036110152745e-05,
"loss": 1.1296,
"step": 138
},
{
"epoch": 0.710997442455243,
"grad_norm": 0.44091227459865195,
"learning_rate": 3.978467847114057e-05,
"loss": 1.1392,
"step": 139
},
{
"epoch": 0.7161125319693095,
"grad_norm": 0.5949165816794791,
"learning_rate": 3.977406695177684e-05,
"loss": 1.1363,
"step": 140
},
{
"epoch": 0.7212276214833759,
"grad_norm": 0.7738088490588236,
"learning_rate": 3.9763201688230385e-05,
"loss": 1.13,
"step": 141
},
{
"epoch": 0.7263427109974424,
"grad_norm": 0.9415135094116259,
"learning_rate": 3.975208281992611e-05,
"loss": 1.1054,
"step": 142
},
{
"epoch": 0.731457800511509,
"grad_norm": 1.013667201332667,
"learning_rate": 3.974071048954322e-05,
"loss": 1.1335,
"step": 143
},
{
"epoch": 0.7365728900255755,
"grad_norm": 0.918574474686866,
"learning_rate": 3.972908484301338e-05,
"loss": 1.1225,
"step": 144
},
{
"epoch": 0.7416879795396419,
"grad_norm": 1.0028928008507987,
"learning_rate": 3.971720602951886e-05,
"loss": 1.1394,
"step": 145
},
{
"epoch": 0.7468030690537084,
"grad_norm": 1.01371150070985,
"learning_rate": 3.9705074201490614e-05,
"loss": 1.1135,
"step": 146
},
{
"epoch": 0.7519181585677749,
"grad_norm": 0.6829183814095904,
"learning_rate": 3.9692689514606326e-05,
"loss": 1.1195,
"step": 147
},
{
"epoch": 0.7570332480818415,
"grad_norm": 0.4995779268853437,
"learning_rate": 3.9680052127788386e-05,
"loss": 1.1096,
"step": 148
},
{
"epoch": 0.7621483375959079,
"grad_norm": 0.5466194571724614,
"learning_rate": 3.96671622032019e-05,
"loss": 1.1438,
"step": 149
},
{
"epoch": 0.7672634271099744,
"grad_norm": 0.4993258831722339,
"learning_rate": 3.965401990625255e-05,
"loss": 1.1246,
"step": 150
},
{
"epoch": 0.7723785166240409,
"grad_norm": 0.5394473074799043,
"learning_rate": 3.964062540558454e-05,
"loss": 1.1367,
"step": 151
},
{
"epoch": 0.7774936061381074,
"grad_norm": 0.6014091892453812,
"learning_rate": 3.962697887307836e-05,
"loss": 1.1074,
"step": 152
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.44042594454750733,
"learning_rate": 3.961308048384863e-05,
"loss": 1.1203,
"step": 153
},
{
"epoch": 0.7877237851662404,
"grad_norm": 0.41663585280574117,
"learning_rate": 3.9598930416241857e-05,
"loss": 1.1,
"step": 154
},
{
"epoch": 0.7928388746803069,
"grad_norm": 0.43073314610132185,
"learning_rate": 3.9584528851834096e-05,
"loss": 1.1176,
"step": 155
},
{
"epoch": 0.7979539641943734,
"grad_norm": 0.4025238410612463,
"learning_rate": 3.956987597542867e-05,
"loss": 1.1404,
"step": 156
},
{
"epoch": 0.80306905370844,
"grad_norm": 0.31467711628825223,
"learning_rate": 3.955497197505377e-05,
"loss": 1.1151,
"step": 157
},
{
"epoch": 0.8081841432225064,
"grad_norm": 0.3777897458256624,
"learning_rate": 3.953981704196007e-05,
"loss": 1.1064,
"step": 158
},
{
"epoch": 0.8132992327365729,
"grad_norm": 0.4263089414332897,
"learning_rate": 3.952441137061823e-05,
"loss": 1.1317,
"step": 159
},
{
"epoch": 0.8184143222506394,
"grad_norm": 0.44871938263927913,
"learning_rate": 3.9508755158716445e-05,
"loss": 1.1006,
"step": 160
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.39666587385431135,
"learning_rate": 3.949284860715787e-05,
"loss": 1.106,
"step": 161
},
{
"epoch": 0.8286445012787724,
"grad_norm": 0.37222061819294466,
"learning_rate": 3.94766919200581e-05,
"loss": 1.1424,
"step": 162
},
{
"epoch": 0.8337595907928389,
"grad_norm": 0.4106985425516431,
"learning_rate": 3.946028530474247e-05,
"loss": 1.1066,
"step": 163
},
{
"epoch": 0.8388746803069054,
"grad_norm": 0.34174723556345027,
"learning_rate": 3.944362897174345e-05,
"loss": 1.1377,
"step": 164
},
{
"epoch": 0.8439897698209718,
"grad_norm": 0.31421437045922873,
"learning_rate": 3.942672313479794e-05,
"loss": 1.1481,
"step": 165
},
{
"epoch": 0.8491048593350383,
"grad_norm": 0.4554074855575182,
"learning_rate": 3.9409568010844504e-05,
"loss": 1.1021,
"step": 166
},
{
"epoch": 0.8542199488491049,
"grad_norm": 0.4862902618582567,
"learning_rate": 3.9392163820020596e-05,
"loss": 1.1136,
"step": 167
},
{
"epoch": 0.8593350383631714,
"grad_norm": 0.5031898914893316,
"learning_rate": 3.937451078565975e-05,
"loss": 1.1041,
"step": 168
},
{
"epoch": 0.8644501278772379,
"grad_norm": 0.4901545096169435,
"learning_rate": 3.935660913428871e-05,
"loss": 1.1283,
"step": 169
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.4686488348433868,
"learning_rate": 3.93384590956245e-05,
"loss": 1.1188,
"step": 170
},
{
"epoch": 0.8746803069053708,
"grad_norm": 0.382892444800895,
"learning_rate": 3.932006090257149e-05,
"loss": 1.1068,
"step": 171
},
{
"epoch": 0.8797953964194374,
"grad_norm": 0.37159943638867027,
"learning_rate": 3.930141479121841e-05,
"loss": 1.1208,
"step": 172
},
{
"epoch": 0.8849104859335039,
"grad_norm": 0.42709573093618236,
"learning_rate": 3.9282521000835343e-05,
"loss": 1.1383,
"step": 173
},
{
"epoch": 0.8900255754475703,
"grad_norm": 0.47346272108755927,
"learning_rate": 3.9263379773870595e-05,
"loss": 1.0882,
"step": 174
},
{
"epoch": 0.8951406649616368,
"grad_norm": 0.5285371745037324,
"learning_rate": 3.9243991355947654e-05,
"loss": 1.112,
"step": 175
},
{
"epoch": 0.9002557544757033,
"grad_norm": 0.5906366765469058,
"learning_rate": 3.9224355995861976e-05,
"loss": 1.1198,
"step": 176
},
{
"epoch": 0.9053708439897699,
"grad_norm": 0.6390366442616775,
"learning_rate": 3.9204473945577844e-05,
"loss": 1.1165,
"step": 177
},
{
"epoch": 0.9104859335038363,
"grad_norm": 0.7619939625989861,
"learning_rate": 3.9184345460225086e-05,
"loss": 1.0976,
"step": 178
},
{
"epoch": 0.9156010230179028,
"grad_norm": 0.8274597084021534,
"learning_rate": 3.916397079809587e-05,
"loss": 1.1263,
"step": 179
},
{
"epoch": 0.9207161125319693,
"grad_norm": 0.8670812778043089,
"learning_rate": 3.914335022064129e-05,
"loss": 1.1038,
"step": 180
},
{
"epoch": 0.9258312020460358,
"grad_norm": 0.7987111262536127,
"learning_rate": 3.91224839924681e-05,
"loss": 1.0991,
"step": 181
},
{
"epoch": 0.9309462915601023,
"grad_norm": 0.6789089254585002,
"learning_rate": 3.91013723813353e-05,
"loss": 1.1153,
"step": 182
},
{
"epoch": 0.9360613810741688,
"grad_norm": 0.5247353582861676,
"learning_rate": 3.9080015658150644e-05,
"loss": 1.1346,
"step": 183
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.3644802006853034,
"learning_rate": 3.905841409696724e-05,
"loss": 1.1203,
"step": 184
},
{
"epoch": 0.9462915601023018,
"grad_norm": 0.302951995016467,
"learning_rate": 3.903656797497998e-05,
"loss": 1.1085,
"step": 185
},
{
"epoch": 0.9514066496163683,
"grad_norm": 0.40803915020390363,
"learning_rate": 3.901447757252202e-05,
"loss": 1.1249,
"step": 186
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.4575912395967058,
"learning_rate": 3.899214317306114e-05,
"loss": 1.1207,
"step": 187
},
{
"epoch": 0.9616368286445013,
"grad_norm": 0.4384857639291277,
"learning_rate": 3.896956506319615e-05,
"loss": 1.1175,
"step": 188
},
{
"epoch": 0.9667519181585678,
"grad_norm": 0.37810315759266455,
"learning_rate": 3.894674353265319e-05,
"loss": 1.1263,
"step": 189
},
{
"epoch": 0.9718670076726342,
"grad_norm": 0.33040746591886666,
"learning_rate": 3.8923678874282005e-05,
"loss": 1.0891,
"step": 190
},
{
"epoch": 0.9769820971867008,
"grad_norm": 0.2970896158791436,
"learning_rate": 3.890037138405221e-05,
"loss": 1.1118,
"step": 191
},
{
"epoch": 0.9820971867007673,
"grad_norm": 0.2786192370878433,
"learning_rate": 3.887682136104948e-05,
"loss": 1.0955,
"step": 192
},
{
"epoch": 0.9872122762148338,
"grad_norm": 0.37616961887523437,
"learning_rate": 3.88530291074717e-05,
"loss": 1.1309,
"step": 193
},
{
"epoch": 0.9923273657289002,
"grad_norm": 0.30458475511135424,
"learning_rate": 3.88289949286251e-05,
"loss": 1.1288,
"step": 194
},
{
"epoch": 0.9974424552429667,
"grad_norm": 0.27006171785266825,
"learning_rate": 3.880471913292035e-05,
"loss": 1.1067,
"step": 195
},
{
"epoch": 1.0025575447570332,
"grad_norm": 0.641130344717003,
"learning_rate": 3.878020203186858e-05,
"loss": 1.6856,
"step": 196
},
{
"epoch": 1.0076726342710998,
"grad_norm": 1.12901892252244,
"learning_rate": 3.875544394007739e-05,
"loss": 1.1049,
"step": 197
},
{
"epoch": 1.0127877237851663,
"grad_norm": 0.8810192018583497,
"learning_rate": 3.8730445175246815e-05,
"loss": 1.0976,
"step": 198
},
{
"epoch": 1.0179028132992327,
"grad_norm": 0.5060443892157782,
"learning_rate": 3.8705206058165244e-05,
"loss": 1.1224,
"step": 199
},
{
"epoch": 1.0230179028132993,
"grad_norm": 0.45555273554867565,
"learning_rate": 3.8679726912705315e-05,
"loss": 1.1104,
"step": 200
},
{
"epoch": 1.0281329923273657,
"grad_norm": 0.5987124980388637,
"learning_rate": 3.865400806581975e-05,
"loss": 1.1236,
"step": 201
},
{
"epoch": 1.0332480818414322,
"grad_norm": 0.6102266868676671,
"learning_rate": 3.862804984753714e-05,
"loss": 1.0929,
"step": 202
},
{
"epoch": 1.0383631713554988,
"grad_norm": 0.4715043183631868,
"learning_rate": 3.8601852590957766e-05,
"loss": 1.1314,
"step": 203
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.35678120553758946,
"learning_rate": 3.857541663224926e-05,
"loss": 1.0817,
"step": 204
},
{
"epoch": 1.0485933503836318,
"grad_norm": 0.5779239076939061,
"learning_rate": 3.8548742310642334e-05,
"loss": 1.0744,
"step": 205
},
{
"epoch": 1.0537084398976981,
"grad_norm": 0.4640287441279096,
"learning_rate": 3.852182996842641e-05,
"loss": 1.0778,
"step": 206
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.42150823261105713,
"learning_rate": 3.849467995094522e-05,
"loss": 1.1283,
"step": 207
},
{
"epoch": 1.0639386189258313,
"grad_norm": 0.5821601085737635,
"learning_rate": 3.846729260659241e-05,
"loss": 1.0972,
"step": 208
},
{
"epoch": 1.0690537084398977,
"grad_norm": 0.4704548470212048,
"learning_rate": 3.843966828680702e-05,
"loss": 1.0747,
"step": 209
},
{
"epoch": 1.0741687979539642,
"grad_norm": 0.38693655375739405,
"learning_rate": 3.841180734606902e-05,
"loss": 1.1187,
"step": 210
},
{
"epoch": 1.0792838874680306,
"grad_norm": 0.3630491063461679,
"learning_rate": 3.838371014189472e-05,
"loss": 1.0892,
"step": 211
},
{
"epoch": 1.0843989769820972,
"grad_norm": 0.4405195407898948,
"learning_rate": 3.835537703483221e-05,
"loss": 1.1067,
"step": 212
},
{
"epoch": 1.0895140664961638,
"grad_norm": 0.44452544270846567,
"learning_rate": 3.832680838845674e-05,
"loss": 1.0756,
"step": 213
},
{
"epoch": 1.0946291560102301,
"grad_norm": 0.5401710772090864,
"learning_rate": 3.8298004569366016e-05,
"loss": 1.0847,
"step": 214
},
{
"epoch": 1.0997442455242967,
"grad_norm": 0.7049028690595734,
"learning_rate": 3.8268965947175545e-05,
"loss": 1.1182,
"step": 215
},
{
"epoch": 1.104859335038363,
"grad_norm": 0.7116280073130989,
"learning_rate": 3.823969289451384e-05,
"loss": 1.126,
"step": 216
},
{
"epoch": 1.1099744245524297,
"grad_norm": 0.6441754688478455,
"learning_rate": 3.821018578701769e-05,
"loss": 1.1012,
"step": 217
},
{
"epoch": 1.1150895140664963,
"grad_norm": 0.5733417711636994,
"learning_rate": 3.8180445003327296e-05,
"loss": 1.0919,
"step": 218
},
{
"epoch": 1.1202046035805626,
"grad_norm": 0.5168592814231329,
"learning_rate": 3.815047092508146e-05,
"loss": 1.0708,
"step": 219
},
{
"epoch": 1.1253196930946292,
"grad_norm": 0.40791830250588856,
"learning_rate": 3.812026393691262e-05,
"loss": 1.1016,
"step": 220
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.3935626219257489,
"learning_rate": 3.808982442644199e-05,
"loss": 1.1062,
"step": 221
},
{
"epoch": 1.1355498721227621,
"grad_norm": 0.42991493583618057,
"learning_rate": 3.8059152784274526e-05,
"loss": 1.0787,
"step": 222
},
{
"epoch": 1.1406649616368287,
"grad_norm": 0.381545106483542,
"learning_rate": 3.802824940399395e-05,
"loss": 1.076,
"step": 223
},
{
"epoch": 1.145780051150895,
"grad_norm": 0.43466891058485496,
"learning_rate": 3.799711468215767e-05,
"loss": 1.0788,
"step": 224
},
{
"epoch": 1.1508951406649617,
"grad_norm": 0.5444427949474252,
"learning_rate": 3.796574901829173e-05,
"loss": 1.0916,
"step": 225
},
{
"epoch": 1.156010230179028,
"grad_norm": 0.597694612097652,
"learning_rate": 3.793415281488566e-05,
"loss": 1.1063,
"step": 226
},
{
"epoch": 1.1611253196930946,
"grad_norm": 0.5644442647832518,
"learning_rate": 3.790232647738728e-05,
"loss": 1.1242,
"step": 227
},
{
"epoch": 1.1662404092071612,
"grad_norm": 0.5088885436714243,
"learning_rate": 3.7870270414197566e-05,
"loss": 1.0651,
"step": 228
},
{
"epoch": 1.1713554987212276,
"grad_norm": 0.45669824317435515,
"learning_rate": 3.783798503666537e-05,
"loss": 1.0788,
"step": 229
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.5158511102992952,
"learning_rate": 3.780547075908213e-05,
"loss": 1.0897,
"step": 230
},
{
"epoch": 1.1815856777493605,
"grad_norm": 0.5509778938951233,
"learning_rate": 3.777272799867657e-05,
"loss": 1.1186,
"step": 231
},
{
"epoch": 1.186700767263427,
"grad_norm": 0.6199044676759673,
"learning_rate": 3.773975717560934e-05,
"loss": 1.0839,
"step": 232
},
{
"epoch": 1.1918158567774937,
"grad_norm": 0.5867928689399399,
"learning_rate": 3.7706558712967656e-05,
"loss": 1.0994,
"step": 233
},
{
"epoch": 1.19693094629156,
"grad_norm": 0.43551312787108826,
"learning_rate": 3.76731330367598e-05,
"loss": 1.1112,
"step": 234
},
{
"epoch": 1.2020460358056266,
"grad_norm": 0.3428798495589015,
"learning_rate": 3.763948057590975e-05,
"loss": 1.1025,
"step": 235
},
{
"epoch": 1.207161125319693,
"grad_norm": 0.3420589750674672,
"learning_rate": 3.760560176225157e-05,
"loss": 1.064,
"step": 236
},
{
"epoch": 1.2122762148337596,
"grad_norm": 0.47964929329378947,
"learning_rate": 3.757149703052395e-05,
"loss": 1.0982,
"step": 237
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.5164961726841073,
"learning_rate": 3.753716681836461e-05,
"loss": 1.0698,
"step": 238
},
{
"epoch": 1.2225063938618925,
"grad_norm": 0.5004726718437401,
"learning_rate": 3.750261156630465e-05,
"loss": 1.1286,
"step": 239
},
{
"epoch": 1.227621483375959,
"grad_norm": 0.47794124988866565,
"learning_rate": 3.7467831717762925e-05,
"loss": 1.0846,
"step": 240
},
{
"epoch": 1.2327365728900257,
"grad_norm": 0.44111411888891566,
"learning_rate": 3.743282771904035e-05,
"loss": 1.0834,
"step": 241
},
{
"epoch": 1.237851662404092,
"grad_norm": 0.5079015451764834,
"learning_rate": 3.739760001931419e-05,
"loss": 1.104,
"step": 242
},
{
"epoch": 1.2429667519181586,
"grad_norm": 0.5075602497339351,
"learning_rate": 3.7362149070632255e-05,
"loss": 1.0931,
"step": 243
},
{
"epoch": 1.248081841432225,
"grad_norm": 0.3971531368129775,
"learning_rate": 3.732647532790713e-05,
"loss": 1.0792,
"step": 244
},
{
"epoch": 1.2531969309462916,
"grad_norm": 0.33746211887589067,
"learning_rate": 3.729057924891035e-05,
"loss": 1.1235,
"step": 245
},
{
"epoch": 1.258312020460358,
"grad_norm": 0.3650974996204564,
"learning_rate": 3.7254461294266483e-05,
"loss": 1.0974,
"step": 246
},
{
"epoch": 1.2634271099744245,
"grad_norm": 0.377421756851408,
"learning_rate": 3.721812192744725e-05,
"loss": 1.0831,
"step": 247
},
{
"epoch": 1.2685421994884911,
"grad_norm": 0.3795307978882048,
"learning_rate": 3.718156161476558e-05,
"loss": 1.0957,
"step": 248
},
{
"epoch": 1.2736572890025575,
"grad_norm": 0.3800374146414857,
"learning_rate": 3.7144780825369615e-05,
"loss": 1.0709,
"step": 249
},
{
"epoch": 1.278772378516624,
"grad_norm": 0.4007139140880588,
"learning_rate": 3.710778003123667e-05,
"loss": 1.0833,
"step": 250
},
{
"epoch": 1.2838874680306906,
"grad_norm": 0.40407343707960797,
"learning_rate": 3.707055970716722e-05,
"loss": 1.0888,
"step": 251
},
{
"epoch": 1.289002557544757,
"grad_norm": 0.2994034629230688,
"learning_rate": 3.703312033077878e-05,
"loss": 1.1002,
"step": 252
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.3254270151573406,
"learning_rate": 3.699546238249979e-05,
"loss": 1.0737,
"step": 253
},
{
"epoch": 1.29923273657289,
"grad_norm": 0.3911289495044072,
"learning_rate": 3.6957586345563417e-05,
"loss": 1.0991,
"step": 254
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.38966577888960535,
"learning_rate": 3.691949270600141e-05,
"loss": 1.0791,
"step": 255
},
{
"epoch": 1.309462915601023,
"grad_norm": 0.39562876092393817,
"learning_rate": 3.68811819526378e-05,
"loss": 1.1031,
"step": 256
},
{
"epoch": 1.3145780051150895,
"grad_norm": 0.4269451879598667,
"learning_rate": 3.6842654577082686e-05,
"loss": 1.075,
"step": 257
},
{
"epoch": 1.319693094629156,
"grad_norm": 0.4819004002486693,
"learning_rate": 3.6803911073725895e-05,
"loss": 1.0701,
"step": 258
},
{
"epoch": 1.3248081841432224,
"grad_norm": 0.504125026080002,
"learning_rate": 3.6764951939730624e-05,
"loss": 1.0864,
"step": 259
},
{
"epoch": 1.329923273657289,
"grad_norm": 0.4870192901488172,
"learning_rate": 3.6725777675027095e-05,
"loss": 1.1046,
"step": 260
},
{
"epoch": 1.3350383631713556,
"grad_norm": 0.43270225897597664,
"learning_rate": 3.668638878230613e-05,
"loss": 1.0962,
"step": 261
},
{
"epoch": 1.340153452685422,
"grad_norm": 0.3494948322409856,
"learning_rate": 3.664678576701267e-05,
"loss": 1.1036,
"step": 262
},
{
"epoch": 1.3452685421994885,
"grad_norm": 0.29197512485904487,
"learning_rate": 3.660696913733934e-05,
"loss": 1.0955,
"step": 263
},
{
"epoch": 1.350383631713555,
"grad_norm": 0.26565447956578164,
"learning_rate": 3.6566939404219874e-05,
"loss": 1.1045,
"step": 264
},
{
"epoch": 1.3554987212276215,
"grad_norm": 0.29428244421319627,
"learning_rate": 3.652669708132261e-05,
"loss": 1.0592,
"step": 265
},
{
"epoch": 1.3606138107416879,
"grad_norm": 0.2791238009074844,
"learning_rate": 3.648624268504387e-05,
"loss": 1.0825,
"step": 266
},
{
"epoch": 1.3657289002557544,
"grad_norm": 0.2836207345495186,
"learning_rate": 3.644557673450133e-05,
"loss": 1.1182,
"step": 267
},
{
"epoch": 1.370843989769821,
"grad_norm": 0.2642712963707173,
"learning_rate": 3.6404699751527365e-05,
"loss": 1.1036,
"step": 268
},
{
"epoch": 1.3759590792838874,
"grad_norm": 0.2569721937955015,
"learning_rate": 3.6363612260662346e-05,
"loss": 1.1037,
"step": 269
},
{
"epoch": 1.381074168797954,
"grad_norm": 0.2626548247202908,
"learning_rate": 3.632231478914794e-05,
"loss": 1.0752,
"step": 270
},
{
"epoch": 1.3861892583120206,
"grad_norm": 0.33628877791383244,
"learning_rate": 3.628080786692032e-05,
"loss": 1.1241,
"step": 271
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.39928469448042375,
"learning_rate": 3.623909202660333e-05,
"loss": 1.0692,
"step": 272
},
{
"epoch": 1.3964194373401535,
"grad_norm": 0.41762818228894244,
"learning_rate": 3.619716780350174e-05,
"loss": 1.0635,
"step": 273
},
{
"epoch": 1.40153452685422,
"grad_norm": 0.5007186791076494,
"learning_rate": 3.615503573559426e-05,
"loss": 1.0869,
"step": 274
},
{
"epoch": 1.4066496163682864,
"grad_norm": 0.6087223158210029,
"learning_rate": 3.6112696363526774e-05,
"loss": 1.0802,
"step": 275
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.8238974925650274,
"learning_rate": 3.6070150230605264e-05,
"loss": 1.0911,
"step": 276
},
{
"epoch": 1.4168797953964194,
"grad_norm": 1.0295392330906121,
"learning_rate": 3.6027397882788944e-05,
"loss": 1.0882,
"step": 277
},
{
"epoch": 1.421994884910486,
"grad_norm": 0.9094341485583135,
"learning_rate": 3.5984439868683185e-05,
"loss": 1.0985,
"step": 278
},
{
"epoch": 1.4271099744245523,
"grad_norm": 0.6431552527387483,
"learning_rate": 3.594127673953251e-05,
"loss": 1.0917,
"step": 279
},
{
"epoch": 1.432225063938619,
"grad_norm": 0.39730871502755355,
"learning_rate": 3.589790904921353e-05,
"loss": 1.0649,
"step": 280
},
{
"epoch": 1.4373401534526855,
"grad_norm": 0.25033607318319917,
"learning_rate": 3.585433735422779e-05,
"loss": 1.0704,
"step": 281
},
{
"epoch": 1.4424552429667519,
"grad_norm": 0.2688599706833758,
"learning_rate": 3.581056221369469e-05,
"loss": 1.1132,
"step": 282
},
{
"epoch": 1.4475703324808185,
"grad_norm": 0.36659477814922325,
"learning_rate": 3.5766584189344255e-05,
"loss": 1.0987,
"step": 283
},
{
"epoch": 1.452685421994885,
"grad_norm": 0.4117135879547577,
"learning_rate": 3.572240384550996e-05,
"loss": 1.0782,
"step": 284
},
{
"epoch": 1.4578005115089514,
"grad_norm": 0.3858803623667727,
"learning_rate": 3.5678021749121465e-05,
"loss": 1.0792,
"step": 285
},
{
"epoch": 1.4629156010230178,
"grad_norm": 0.3491502416990134,
"learning_rate": 3.563343846969738e-05,
"loss": 1.0868,
"step": 286
},
{
"epoch": 1.4680306905370843,
"grad_norm": 0.2782819121589572,
"learning_rate": 3.558865457933789e-05,
"loss": 1.0874,
"step": 287
},
{
"epoch": 1.473145780051151,
"grad_norm": 0.2855530481636196,
"learning_rate": 3.5543670652717485e-05,
"loss": 1.1035,
"step": 288
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.2506736083535957,
"learning_rate": 3.5498487267077556e-05,
"loss": 1.0889,
"step": 289
},
{
"epoch": 1.4833759590792839,
"grad_norm": 0.3217152717517136,
"learning_rate": 3.545310500221896e-05,
"loss": 1.0743,
"step": 290
},
{
"epoch": 1.4884910485933505,
"grad_norm": 0.36398355801272403,
"learning_rate": 3.540752444049463e-05,
"loss": 1.1007,
"step": 291
},
{
"epoch": 1.4936061381074168,
"grad_norm": 0.4534774518127883,
"learning_rate": 3.536174616680206e-05,
"loss": 1.0998,
"step": 292
},
{
"epoch": 1.4987212276214834,
"grad_norm": 0.5782489918794124,
"learning_rate": 3.531577076857584e-05,
"loss": 1.0707,
"step": 293
},
{
"epoch": 1.50383631713555,
"grad_norm": 0.6360084025806246,
"learning_rate": 3.5269598835780074e-05,
"loss": 1.0882,
"step": 294
},
{
"epoch": 1.5089514066496164,
"grad_norm": 0.6564145997881213,
"learning_rate": 3.522323096090083e-05,
"loss": 1.0822,
"step": 295
},
{
"epoch": 1.5140664961636827,
"grad_norm": 0.6579103014127573,
"learning_rate": 3.517666773893856e-05,
"loss": 1.0659,
"step": 296
},
{
"epoch": 1.5191815856777495,
"grad_norm": 0.588797087024321,
"learning_rate": 3.512990976740043e-05,
"loss": 1.0701,
"step": 297
},
{
"epoch": 1.5242966751918159,
"grad_norm": 0.507691256755524,
"learning_rate": 3.5082957646292656e-05,
"loss": 1.0703,
"step": 298
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.46225068995130947,
"learning_rate": 3.503581197811283e-05,
"loss": 1.0998,
"step": 299
},
{
"epoch": 1.5345268542199488,
"grad_norm": 0.46321293283271464,
"learning_rate": 3.498847336784217e-05,
"loss": 1.1118,
"step": 300
},
{
"epoch": 1.5396419437340154,
"grad_norm": 0.47673226500293,
"learning_rate": 3.4940942422937745e-05,
"loss": 1.062,
"step": 301
},
{
"epoch": 1.5447570332480818,
"grad_norm": 0.4938233436299125,
"learning_rate": 3.4893219753324715e-05,
"loss": 1.0796,
"step": 302
},
{
"epoch": 1.5498721227621484,
"grad_norm": 0.454989222644801,
"learning_rate": 3.4845305971388474e-05,
"loss": 1.0599,
"step": 303
},
{
"epoch": 1.554987212276215,
"grad_norm": 0.42196538938547673,
"learning_rate": 3.4797201691966804e-05,
"loss": 1.0976,
"step": 304
},
{
"epoch": 1.5601023017902813,
"grad_norm": 0.438633061741324,
"learning_rate": 3.4748907532341974e-05,
"loss": 1.069,
"step": 305
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.35425686678327734,
"learning_rate": 3.470042411223284e-05,
"loss": 1.0876,
"step": 306
},
{
"epoch": 1.5703324808184145,
"grad_norm": 0.333738264976303,
"learning_rate": 3.465175205378688e-05,
"loss": 1.0832,
"step": 307
},
{
"epoch": 1.5754475703324808,
"grad_norm": 0.3167972686191401,
"learning_rate": 3.46028919815722e-05,
"loss": 1.0822,
"step": 308
},
{
"epoch": 1.5805626598465472,
"grad_norm": 0.30610984275864866,
"learning_rate": 3.4553844522569545e-05,
"loss": 1.0812,
"step": 309
},
{
"epoch": 1.5856777493606138,
"grad_norm": 0.3880104515073711,
"learning_rate": 3.4504610306164235e-05,
"loss": 1.0556,
"step": 310
},
{
"epoch": 1.5907928388746804,
"grad_norm": 0.5025355791144405,
"learning_rate": 3.4455189964138076e-05,
"loss": 1.0977,
"step": 311
},
{
"epoch": 1.5959079283887467,
"grad_norm": 0.4731775995897406,
"learning_rate": 3.4405584130661294e-05,
"loss": 1.0999,
"step": 312
},
{
"epoch": 1.6010230179028133,
"grad_norm": 0.3302367508833335,
"learning_rate": 3.435579344228436e-05,
"loss": 1.0947,
"step": 313
},
{
"epoch": 1.60613810741688,
"grad_norm": 0.3079090812550143,
"learning_rate": 3.430581853792983e-05,
"loss": 1.0855,
"step": 314
},
{
"epoch": 1.6112531969309463,
"grad_norm": 0.3346134543184726,
"learning_rate": 3.425566005888418e-05,
"loss": 1.0705,
"step": 315
},
{
"epoch": 1.6163682864450126,
"grad_norm": 0.306822833635453,
"learning_rate": 3.42053186487895e-05,
"loss": 1.0677,
"step": 316
},
{
"epoch": 1.6214833759590794,
"grad_norm": 0.2893288371945462,
"learning_rate": 3.4154794953635314e-05,
"loss": 1.0867,
"step": 317
},
{
"epoch": 1.6265984654731458,
"grad_norm": 0.3179300501326337,
"learning_rate": 3.410408962175026e-05,
"loss": 1.0751,
"step": 318
},
{
"epoch": 1.6317135549872122,
"grad_norm": 0.4094677547905539,
"learning_rate": 3.405320330379374e-05,
"loss": 1.0974,
"step": 319
},
{
"epoch": 1.6368286445012787,
"grad_norm": 0.44398101804683016,
"learning_rate": 3.4002136652747654e-05,
"loss": 1.068,
"step": 320
},
{
"epoch": 1.6419437340153453,
"grad_norm": 0.3672002423818593,
"learning_rate": 3.3950890323907906e-05,
"loss": 1.0595,
"step": 321
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.34831887181846766,
"learning_rate": 3.3899464974876095e-05,
"loss": 1.1265,
"step": 322
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.33785733953151414,
"learning_rate": 3.3847861265551034e-05,
"loss": 1.0635,
"step": 323
},
{
"epoch": 1.6572890025575449,
"grad_norm": 0.300108569750646,
"learning_rate": 3.379607985812026e-05,
"loss": 1.0663,
"step": 324
},
{
"epoch": 1.6624040920716112,
"grad_norm": 0.3168223492831874,
"learning_rate": 3.37441214170516e-05,
"loss": 1.0703,
"step": 325
},
{
"epoch": 1.6675191815856778,
"grad_norm": 0.30519376911285095,
"learning_rate": 3.369198660908457e-05,
"loss": 1.0989,
"step": 326
},
{
"epoch": 1.6726342710997444,
"grad_norm": 0.27340255530098717,
"learning_rate": 3.3639676103221885e-05,
"loss": 1.083,
"step": 327
},
{
"epoch": 1.6777493606138107,
"grad_norm": 0.24237557891013498,
"learning_rate": 3.358719057072082e-05,
"loss": 1.0888,
"step": 328
},
{
"epoch": 1.682864450127877,
"grad_norm": 0.33187922158308414,
"learning_rate": 3.353453068508465e-05,
"loss": 1.078,
"step": 329
},
{
"epoch": 1.6879795396419437,
"grad_norm": 0.33136779692393137,
"learning_rate": 3.348169712205396e-05,
"loss": 1.0955,
"step": 330
},
{
"epoch": 1.6930946291560103,
"grad_norm": 0.3896824184636105,
"learning_rate": 3.342869055959799e-05,
"loss": 1.0919,
"step": 331
},
{
"epoch": 1.6982097186700766,
"grad_norm": 0.4690457383285477,
"learning_rate": 3.337551167790594e-05,
"loss": 1.0834,
"step": 332
},
{
"epoch": 1.7033248081841432,
"grad_norm": 0.46902429127424217,
"learning_rate": 3.3322161159378266e-05,
"loss": 1.078,
"step": 333
},
{
"epoch": 1.7084398976982098,
"grad_norm": 0.5077584309818212,
"learning_rate": 3.326863968861785e-05,
"loss": 1.0797,
"step": 334
},
{
"epoch": 1.7135549872122762,
"grad_norm": 0.5218252773609796,
"learning_rate": 3.32149479524213e-05,
"loss": 1.0946,
"step": 335
},
{
"epoch": 1.7186700767263428,
"grad_norm": 0.4069002876418554,
"learning_rate": 3.3161086639770096e-05,
"loss": 1.0765,
"step": 336
},
{
"epoch": 1.7237851662404093,
"grad_norm": 0.31535846997146344,
"learning_rate": 3.310705644182172e-05,
"loss": 1.1063,
"step": 337
},
{
"epoch": 1.7289002557544757,
"grad_norm": 0.33692698144202765,
"learning_rate": 3.3052858051900855e-05,
"loss": 1.0702,
"step": 338
},
{
"epoch": 1.734015345268542,
"grad_norm": 0.38776356076458385,
"learning_rate": 3.299849216549043e-05,
"loss": 1.0671,
"step": 339
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.4685217600838493,
"learning_rate": 3.294395948022274e-05,
"loss": 1.0741,
"step": 340
},
{
"epoch": 1.7442455242966752,
"grad_norm": 0.5146785167418015,
"learning_rate": 3.288926069587043e-05,
"loss": 1.0872,
"step": 341
},
{
"epoch": 1.7493606138107416,
"grad_norm": 0.4361055840814472,
"learning_rate": 3.283439651433761e-05,
"loss": 1.0854,
"step": 342
},
{
"epoch": 1.7544757033248082,
"grad_norm": 0.3656787232466876,
"learning_rate": 3.277936763965076e-05,
"loss": 1.1032,
"step": 343
},
{
"epoch": 1.7595907928388748,
"grad_norm": 0.28217873995066556,
"learning_rate": 3.272417477794973e-05,
"loss": 1.0793,
"step": 344
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.3204034304733446,
"learning_rate": 3.266881863747869e-05,
"loss": 1.075,
"step": 345
},
{
"epoch": 1.7698209718670077,
"grad_norm": 0.3079284299575741,
"learning_rate": 3.261329992857703e-05,
"loss": 1.072,
"step": 346
},
{
"epoch": 1.7749360613810743,
"grad_norm": 0.38759228082284236,
"learning_rate": 3.255761936367025e-05,
"loss": 1.0739,
"step": 347
},
{
"epoch": 1.7800511508951407,
"grad_norm": 0.43446165090981037,
"learning_rate": 3.25017776572608e-05,
"loss": 1.0796,
"step": 348
},
{
"epoch": 1.785166240409207,
"grad_norm": 0.3249536808260686,
"learning_rate": 3.2445775525918934e-05,
"loss": 1.0863,
"step": 349
},
{
"epoch": 1.7902813299232738,
"grad_norm": 0.2899437031105078,
"learning_rate": 3.238961368827351e-05,
"loss": 1.0835,
"step": 350
},
{
"epoch": 1.7953964194373402,
"grad_norm": 0.317836365773142,
"learning_rate": 3.2333292865002754e-05,
"loss": 1.0826,
"step": 351
},
{
"epoch": 1.8005115089514065,
"grad_norm": 0.2916579319687349,
"learning_rate": 3.227681377882503e-05,
"loss": 1.0751,
"step": 352
},
{
"epoch": 1.8056265984654731,
"grad_norm": 0.21860087493949837,
"learning_rate": 3.2220177154489544e-05,
"loss": 1.0675,
"step": 353
},
{
"epoch": 1.8107416879795397,
"grad_norm": 0.3359559031535743,
"learning_rate": 3.216338371876709e-05,
"loss": 1.0534,
"step": 354
},
{
"epoch": 1.815856777493606,
"grad_norm": 0.39008315063327503,
"learning_rate": 3.2106434200440665e-05,
"loss": 1.0529,
"step": 355
},
{
"epoch": 1.8209718670076727,
"grad_norm": 0.32916920973912117,
"learning_rate": 3.204932933029615e-05,
"loss": 1.1093,
"step": 356
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.26659714069662344,
"learning_rate": 3.1992069841112936e-05,
"loss": 1.0707,
"step": 357
},
{
"epoch": 1.8312020460358056,
"grad_norm": 0.2408662731393292,
"learning_rate": 3.19346564676545e-05,
"loss": 1.0664,
"step": 358
},
{
"epoch": 1.836317135549872,
"grad_norm": 0.3149309446405476,
"learning_rate": 3.187708994665899e-05,
"loss": 1.0627,
"step": 359
},
{
"epoch": 1.8414322250639388,
"grad_norm": 0.2922367857765737,
"learning_rate": 3.181937101682977e-05,
"loss": 1.0955,
"step": 360
},
{
"epoch": 1.8465473145780051,
"grad_norm": 0.2732975527492678,
"learning_rate": 3.1761500418825955e-05,
"loss": 1.085,
"step": 361
},
{
"epoch": 1.8516624040920715,
"grad_norm": 0.32951032286545007,
"learning_rate": 3.170347889525287e-05,
"loss": 1.0667,
"step": 362
},
{
"epoch": 1.856777493606138,
"grad_norm": 0.2718568672274621,
"learning_rate": 3.1645307190652553e-05,
"loss": 1.0692,
"step": 363
},
{
"epoch": 1.8618925831202047,
"grad_norm": 0.3038697100173248,
"learning_rate": 3.1586986051494185e-05,
"loss": 1.0531,
"step": 364
},
{
"epoch": 1.867007672634271,
"grad_norm": 0.3482315834294184,
"learning_rate": 3.152851622616453e-05,
"loss": 1.0961,
"step": 365
},
{
"epoch": 1.8721227621483376,
"grad_norm": 0.3635382014306177,
"learning_rate": 3.146989846495831e-05,
"loss": 1.0594,
"step": 366
},
{
"epoch": 1.8772378516624042,
"grad_norm": 0.3801528737374951,
"learning_rate": 3.1411133520068565e-05,
"loss": 1.0719,
"step": 367
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.35767074568834234,
"learning_rate": 3.135222214557706e-05,
"loss": 1.0978,
"step": 368
},
{
"epoch": 1.887468030690537,
"grad_norm": 0.3177969921797143,
"learning_rate": 3.1293165097444545e-05,
"loss": 1.0824,
"step": 369
},
{
"epoch": 1.8925831202046037,
"grad_norm": 0.23887001529666604,
"learning_rate": 3.123396313350108e-05,
"loss": 1.0584,
"step": 370
},
{
"epoch": 1.89769820971867,
"grad_norm": 0.2180214891603909,
"learning_rate": 3.11746170134363e-05,
"loss": 1.0453,
"step": 371
},
{
"epoch": 1.9028132992327365,
"grad_norm": 0.2767381523731611,
"learning_rate": 3.111512749878972e-05,
"loss": 1.0809,
"step": 372
},
{
"epoch": 1.907928388746803,
"grad_norm": 0.2440349939808095,
"learning_rate": 3.105549535294086e-05,
"loss": 1.0767,
"step": 373
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.22664308140303832,
"learning_rate": 3.099572134109955e-05,
"loss": 1.0689,
"step": 374
},
{
"epoch": 1.918158567774936,
"grad_norm": 0.215228570214006,
"learning_rate": 3.093580623029605e-05,
"loss": 1.0757,
"step": 375
},
{
"epoch": 1.9232736572890026,
"grad_norm": 0.238491708219821,
"learning_rate": 3.087575078937121e-05,
"loss": 1.0821,
"step": 376
},
{
"epoch": 1.9283887468030692,
"grad_norm": 0.25677725654049566,
"learning_rate": 3.081555578896667e-05,
"loss": 1.076,
"step": 377
},
{
"epoch": 1.9335038363171355,
"grad_norm": 0.27990137738084897,
"learning_rate": 3.075522200151488e-05,
"loss": 1.056,
"step": 378
},
{
"epoch": 1.938618925831202,
"grad_norm": 0.2747809542407332,
"learning_rate": 3.069475020122923e-05,
"loss": 1.0657,
"step": 379
},
{
"epoch": 1.9437340153452687,
"grad_norm": 0.24205044950668198,
"learning_rate": 3.063414116409413e-05,
"loss": 1.1051,
"step": 380
},
{
"epoch": 1.948849104859335,
"grad_norm": 0.2286831747173763,
"learning_rate": 3.057339566785502e-05,
"loss": 1.1069,
"step": 381
},
{
"epoch": 1.9539641943734014,
"grad_norm": 0.21926358704649387,
"learning_rate": 3.0512514492008437e-05,
"loss": 1.068,
"step": 382
},
{
"epoch": 1.959079283887468,
"grad_norm": 0.23417329353885752,
"learning_rate": 3.045149841779194e-05,
"loss": 1.0575,
"step": 383
},
{
"epoch": 1.9641943734015346,
"grad_norm": 0.21864816336590975,
"learning_rate": 3.039034822817416e-05,
"loss": 1.0624,
"step": 384
},
{
"epoch": 1.969309462915601,
"grad_norm": 0.23037762810155873,
"learning_rate": 3.03290647078447e-05,
"loss": 1.0841,
"step": 385
},
{
"epoch": 1.9744245524296675,
"grad_norm": 0.24730861325346853,
"learning_rate": 3.0267648643204093e-05,
"loss": 1.0614,
"step": 386
},
{
"epoch": 1.979539641943734,
"grad_norm": 0.22748550621912642,
"learning_rate": 3.020610082235371e-05,
"loss": 1.0807,
"step": 387
},
{
"epoch": 1.9846547314578005,
"grad_norm": 0.2406930461729291,
"learning_rate": 3.0144422035085625e-05,
"loss": 1.0678,
"step": 388
},
{
"epoch": 1.989769820971867,
"grad_norm": 0.1947510056879471,
"learning_rate": 3.0082613072872512e-05,
"loss": 1.0706,
"step": 389
},
{
"epoch": 1.9948849104859336,
"grad_norm": 0.24668811547574934,
"learning_rate": 3.0020674728857446e-05,
"loss": 1.0882,
"step": 390
},
{
"epoch": 2.0,
"grad_norm": 0.3944045454129894,
"learning_rate": 2.9958607797843783e-05,
"loss": 1.5854,
"step": 391
},
{
"epoch": 2.0051150895140664,
"grad_norm": 0.571211331740568,
"learning_rate": 2.9896413076284915e-05,
"loss": 1.0541,
"step": 392
},
{
"epoch": 2.010230179028133,
"grad_norm": 0.6496305605729329,
"learning_rate": 2.9834091362274055e-05,
"loss": 1.0948,
"step": 393
},
{
"epoch": 2.0153452685421995,
"grad_norm": 0.5024579541598875,
"learning_rate": 2.9771643455534013e-05,
"loss": 1.0733,
"step": 394
},
{
"epoch": 2.020460358056266,
"grad_norm": 0.337503688669548,
"learning_rate": 2.9709070157406932e-05,
"loss": 1.0575,
"step": 395
},
{
"epoch": 2.0255754475703327,
"grad_norm": 0.4068958428270923,
"learning_rate": 2.9646372270843987e-05,
"loss": 1.0526,
"step": 396
},
{
"epoch": 2.030690537084399,
"grad_norm": 0.511369275269955,
"learning_rate": 2.9583550600395106e-05,
"loss": 1.0499,
"step": 397
},
{
"epoch": 2.0358056265984654,
"grad_norm": 0.4759521455282091,
"learning_rate": 2.952060595219861e-05,
"loss": 1.0576,
"step": 398
},
{
"epoch": 2.040920716112532,
"grad_norm": 0.30686443389558704,
"learning_rate": 2.9457539133970923e-05,
"loss": 1.0703,
"step": 399
},
{
"epoch": 2.0460358056265986,
"grad_norm": 0.48650529610865,
"learning_rate": 2.9394350954996147e-05,
"loss": 1.0805,
"step": 400
},
{
"epoch": 2.051150895140665,
"grad_norm": 0.536898019942215,
"learning_rate": 2.9331042226115722e-05,
"loss": 1.0595,
"step": 401
},
{
"epoch": 2.0562659846547313,
"grad_norm": 0.2894393958463196,
"learning_rate": 2.9267613759718002e-05,
"loss": 1.0819,
"step": 402
},
{
"epoch": 2.061381074168798,
"grad_norm": 0.35736846132665373,
"learning_rate": 2.9204066369727826e-05,
"loss": 1.0516,
"step": 403
},
{
"epoch": 2.0664961636828645,
"grad_norm": 0.5391010352210259,
"learning_rate": 2.914040087159609e-05,
"loss": 1.0873,
"step": 404
},
{
"epoch": 2.071611253196931,
"grad_norm": 0.4261141488201902,
"learning_rate": 2.9076618082289272e-05,
"loss": 1.0517,
"step": 405
},
{
"epoch": 2.0767263427109977,
"grad_norm": 0.32238839872169717,
"learning_rate": 2.901271882027894e-05,
"loss": 1.0624,
"step": 406
},
{
"epoch": 2.081841432225064,
"grad_norm": 0.4015293185545378,
"learning_rate": 2.894870390553128e-05,
"loss": 1.0844,
"step": 407
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.38973769415241016,
"learning_rate": 2.8884574159496524e-05,
"loss": 1.0455,
"step": 408
},
{
"epoch": 2.0920716112531967,
"grad_norm": 0.428656757682565,
"learning_rate": 2.882033040509848e-05,
"loss": 1.0522,
"step": 409
},
{
"epoch": 2.0971867007672635,
"grad_norm": 0.4067836857911701,
"learning_rate": 2.875597346672388e-05,
"loss": 1.0446,
"step": 410
},
{
"epoch": 2.10230179028133,
"grad_norm": 0.2658806981254451,
"learning_rate": 2.8691504170211896e-05,
"loss": 1.052,
"step": 411
},
{
"epoch": 2.1074168797953963,
"grad_norm": 0.29345767005827766,
"learning_rate": 2.862692334284347e-05,
"loss": 1.071,
"step": 412
},
{
"epoch": 2.112531969309463,
"grad_norm": 0.31643534871624796,
"learning_rate": 2.856223181333075e-05,
"loss": 1.0678,
"step": 413
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.30277925647085663,
"learning_rate": 2.849743041180641e-05,
"loss": 1.0596,
"step": 414
},
{
"epoch": 2.122762148337596,
"grad_norm": 0.28360578956825416,
"learning_rate": 2.8432519969813044e-05,
"loss": 1.0829,
"step": 415
},
{
"epoch": 2.1278772378516626,
"grad_norm": 0.25623419269068504,
"learning_rate": 2.836750132029244e-05,
"loss": 1.0863,
"step": 416
},
{
"epoch": 2.132992327365729,
"grad_norm": 0.3028046129485344,
"learning_rate": 2.8302375297574963e-05,
"loss": 1.0641,
"step": 417
},
{
"epoch": 2.1381074168797953,
"grad_norm": 0.243941002386985,
"learning_rate": 2.8237142737368767e-05,
"loss": 1.0815,
"step": 418
},
{
"epoch": 2.1432225063938617,
"grad_norm": 0.2439932508429276,
"learning_rate": 2.817180447674915e-05,
"loss": 1.0609,
"step": 419
},
{
"epoch": 2.1483375959079285,
"grad_norm": 0.33126365061709473,
"learning_rate": 2.8106361354147754e-05,
"loss": 1.0782,
"step": 420
},
{
"epoch": 2.153452685421995,
"grad_norm": 0.2315933256251212,
"learning_rate": 2.8040814209341834e-05,
"loss": 1.0732,
"step": 421
},
{
"epoch": 2.1585677749360612,
"grad_norm": 0.2531413000703873,
"learning_rate": 2.797516388344348e-05,
"loss": 1.0776,
"step": 422
},
{
"epoch": 2.163682864450128,
"grad_norm": 0.24520307389872711,
"learning_rate": 2.7909411218888805e-05,
"loss": 1.0575,
"step": 423
},
{
"epoch": 2.1687979539641944,
"grad_norm": 0.21837227419529545,
"learning_rate": 2.7843557059427165e-05,
"loss": 1.0648,
"step": 424
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.23257820290766587,
"learning_rate": 2.7777602250110312e-05,
"loss": 1.0698,
"step": 425
},
{
"epoch": 2.1790281329923276,
"grad_norm": 0.24175028544297208,
"learning_rate": 2.7711547637281547e-05,
"loss": 1.0827,
"step": 426
},
{
"epoch": 2.184143222506394,
"grad_norm": 0.24548626955618205,
"learning_rate": 2.764539406856487e-05,
"loss": 1.0174,
"step": 427
},
{
"epoch": 2.1892583120204603,
"grad_norm": 0.2400998129886217,
"learning_rate": 2.7579142392854108e-05,
"loss": 1.0736,
"step": 428
},
{
"epoch": 2.1943734015345266,
"grad_norm": 0.2556650098002111,
"learning_rate": 2.7512793460301996e-05,
"loss": 1.0969,
"step": 429
},
{
"epoch": 2.1994884910485935,
"grad_norm": 0.2657024393852246,
"learning_rate": 2.7446348122309304e-05,
"loss": 1.0871,
"step": 430
},
{
"epoch": 2.20460358056266,
"grad_norm": 0.30061197720631916,
"learning_rate": 2.7379807231513882e-05,
"loss": 1.0635,
"step": 431
},
{
"epoch": 2.209718670076726,
"grad_norm": 0.24434115350001423,
"learning_rate": 2.7313171641779736e-05,
"loss": 1.0718,
"step": 432
},
{
"epoch": 2.214833759590793,
"grad_norm": 0.1943553020243117,
"learning_rate": 2.724644220818605e-05,
"loss": 1.0464,
"step": 433
},
{
"epoch": 2.2199488491048593,
"grad_norm": 0.2409184010564603,
"learning_rate": 2.7179619787016257e-05,
"loss": 1.0745,
"step": 434
},
{
"epoch": 2.2250639386189257,
"grad_norm": 0.2256656815374472,
"learning_rate": 2.7112705235746985e-05,
"loss": 1.0747,
"step": 435
},
{
"epoch": 2.2301790281329925,
"grad_norm": 0.23084899035972747,
"learning_rate": 2.7045699413037133e-05,
"loss": 1.057,
"step": 436
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.21422250948749663,
"learning_rate": 2.697860317871677e-05,
"loss": 1.0787,
"step": 437
},
{
"epoch": 2.2404092071611252,
"grad_norm": 0.22055197289125814,
"learning_rate": 2.6911417393776172e-05,
"loss": 1.05,
"step": 438
},
{
"epoch": 2.2455242966751916,
"grad_norm": 0.2090525921429115,
"learning_rate": 2.6844142920354722e-05,
"loss": 1.0525,
"step": 439
},
{
"epoch": 2.2506393861892584,
"grad_norm": 0.23505089353994232,
"learning_rate": 2.677678062172989e-05,
"loss": 1.0435,
"step": 440
},
{
"epoch": 2.2557544757033248,
"grad_norm": 0.24040274933787345,
"learning_rate": 2.6709331362306122e-05,
"loss": 1.0345,
"step": 441
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.22951772137968907,
"learning_rate": 2.6641796007603756e-05,
"loss": 1.0554,
"step": 442
},
{
"epoch": 2.265984654731458,
"grad_norm": 0.1954752873279388,
"learning_rate": 2.6574175424247926e-05,
"loss": 1.0778,
"step": 443
},
{
"epoch": 2.2710997442455243,
"grad_norm": 0.20768996296631603,
"learning_rate": 2.6506470479957432e-05,
"loss": 1.0384,
"step": 444
},
{
"epoch": 2.2762148337595907,
"grad_norm": 0.2050178399434897,
"learning_rate": 2.6438682043533606e-05,
"loss": 1.0819,
"step": 445
},
{
"epoch": 2.2813299232736575,
"grad_norm": 0.22084478986907694,
"learning_rate": 2.637081098484918e-05,
"loss": 1.0681,
"step": 446
},
{
"epoch": 2.286445012787724,
"grad_norm": 0.2525194084168267,
"learning_rate": 2.6302858174837084e-05,
"loss": 1.0766,
"step": 447
},
{
"epoch": 2.29156010230179,
"grad_norm": 0.20434937850015836,
"learning_rate": 2.623482448547931e-05,
"loss": 1.0674,
"step": 448
},
{
"epoch": 2.296675191815857,
"grad_norm": 0.18111384656350762,
"learning_rate": 2.6166710789795704e-05,
"loss": 1.049,
"step": 449
},
{
"epoch": 2.3017902813299234,
"grad_norm": 0.19742247992531406,
"learning_rate": 2.6098517961832773e-05,
"loss": 1.0636,
"step": 450
},
{
"epoch": 2.3069053708439897,
"grad_norm": 0.23785950833253988,
"learning_rate": 2.6030246876652445e-05,
"loss": 1.0504,
"step": 451
},
{
"epoch": 2.312020460358056,
"grad_norm": 0.2097971952126859,
"learning_rate": 2.5961898410320894e-05,
"loss": 1.0361,
"step": 452
},
{
"epoch": 2.317135549872123,
"grad_norm": 0.2276417405483114,
"learning_rate": 2.5893473439897215e-05,
"loss": 1.0411,
"step": 453
},
{
"epoch": 2.3222506393861893,
"grad_norm": 0.2537489564697595,
"learning_rate": 2.5824972843422257e-05,
"loss": 1.0714,
"step": 454
},
{
"epoch": 2.3273657289002556,
"grad_norm": 0.25653394100856325,
"learning_rate": 2.5756397499907283e-05,
"loss": 1.0661,
"step": 455
},
{
"epoch": 2.3324808184143224,
"grad_norm": 0.22816918192865004,
"learning_rate": 2.5687748289322744e-05,
"loss": 1.0596,
"step": 456
},
{
"epoch": 2.337595907928389,
"grad_norm": 0.21376463186605166,
"learning_rate": 2.561902609258697e-05,
"loss": 1.0725,
"step": 457
},
{
"epoch": 2.342710997442455,
"grad_norm": 0.2242763029015445,
"learning_rate": 2.5550231791554833e-05,
"loss": 1.0815,
"step": 458
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.17279605910571388,
"learning_rate": 2.5481366269006497e-05,
"loss": 1.045,
"step": 459
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.21321751878224862,
"learning_rate": 2.541243040863602e-05,
"loss": 1.0449,
"step": 460
},
{
"epoch": 2.3580562659846547,
"grad_norm": 0.2068004135775521,
"learning_rate": 2.5343425095040065e-05,
"loss": 1.0484,
"step": 461
},
{
"epoch": 2.363171355498721,
"grad_norm": 0.1767285038930778,
"learning_rate": 2.527435121370653e-05,
"loss": 1.0554,
"step": 462
},
{
"epoch": 2.368286445012788,
"grad_norm": 0.18469974343909754,
"learning_rate": 2.5205209651003176e-05,
"loss": 1.0338,
"step": 463
},
{
"epoch": 2.373401534526854,
"grad_norm": 0.1692343543578853,
"learning_rate": 2.5136001294166263e-05,
"loss": 1.0831,
"step": 464
},
{
"epoch": 2.3785166240409206,
"grad_norm": 0.19527583181910754,
"learning_rate": 2.506672703128919e-05,
"loss": 1.0679,
"step": 465
},
{
"epoch": 2.3836317135549874,
"grad_norm": 0.21170860178119313,
"learning_rate": 2.4997387751311035e-05,
"loss": 1.0581,
"step": 466
},
{
"epoch": 2.3887468030690537,
"grad_norm": 0.21354376020274823,
"learning_rate": 2.4927984344005212e-05,
"loss": 1.0488,
"step": 467
},
{
"epoch": 2.39386189258312,
"grad_norm": 0.16943920612627475,
"learning_rate": 2.4858517699968027e-05,
"loss": 1.0498,
"step": 468
},
{
"epoch": 2.398976982097187,
"grad_norm": 0.2263431725640194,
"learning_rate": 2.4788988710607232e-05,
"loss": 1.0649,
"step": 469
},
{
"epoch": 2.4040920716112533,
"grad_norm": 0.2500711707765182,
"learning_rate": 2.471939826813063e-05,
"loss": 1.0549,
"step": 470
},
{
"epoch": 2.4092071611253196,
"grad_norm": 0.28197002741407323,
"learning_rate": 2.4649747265534584e-05,
"loss": 1.0568,
"step": 471
},
{
"epoch": 2.414322250639386,
"grad_norm": 0.2307560418077442,
"learning_rate": 2.458003659659257e-05,
"loss": 1.0643,
"step": 472
},
{
"epoch": 2.419437340153453,
"grad_norm": 0.2275883404782231,
"learning_rate": 2.451026715584374e-05,
"loss": 1.0263,
"step": 473
},
{
"epoch": 2.424552429667519,
"grad_norm": 0.1837213662498589,
"learning_rate": 2.4440439838581375e-05,
"loss": 1.0663,
"step": 474
},
{
"epoch": 2.4296675191815855,
"grad_norm": 0.2191908919218784,
"learning_rate": 2.4370555540841477e-05,
"loss": 1.0508,
"step": 475
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.2906689646420887,
"learning_rate": 2.4300615159391204e-05,
"loss": 1.0401,
"step": 476
},
{
"epoch": 2.4398976982097187,
"grad_norm": 0.2883194193305295,
"learning_rate": 2.423061959171741e-05,
"loss": 1.0767,
"step": 477
},
{
"epoch": 2.445012787723785,
"grad_norm": 0.17758442728560847,
"learning_rate": 2.4160569736015082e-05,
"loss": 1.0577,
"step": 478
},
{
"epoch": 2.4501278772378514,
"grad_norm": 0.16717169630312334,
"learning_rate": 2.4090466491175876e-05,
"loss": 1.0293,
"step": 479
},
{
"epoch": 2.455242966751918,
"grad_norm": 0.24782575757267775,
"learning_rate": 2.4020310756776506e-05,
"loss": 1.0336,
"step": 480
},
{
"epoch": 2.4603580562659846,
"grad_norm": 0.2636655812446855,
"learning_rate": 2.3950103433067273e-05,
"loss": 1.0809,
"step": 481
},
{
"epoch": 2.4654731457800514,
"grad_norm": 0.16902242053452235,
"learning_rate": 2.3879845420960458e-05,
"loss": 1.0785,
"step": 482
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.21078452585148946,
"learning_rate": 2.3809537622018812e-05,
"loss": 1.0398,
"step": 483
},
{
"epoch": 2.475703324808184,
"grad_norm": 0.21973365788168056,
"learning_rate": 2.373918093844393e-05,
"loss": 1.0591,
"step": 484
},
{
"epoch": 2.4808184143222505,
"grad_norm": 0.20412106095476204,
"learning_rate": 2.3668776273064717e-05,
"loss": 1.0452,
"step": 485
},
{
"epoch": 2.4859335038363173,
"grad_norm": 0.20054437938777525,
"learning_rate": 2.3598324529325783e-05,
"loss": 1.0632,
"step": 486
},
{
"epoch": 2.4910485933503836,
"grad_norm": 0.18477265021437198,
"learning_rate": 2.3527826611275865e-05,
"loss": 1.0536,
"step": 487
},
{
"epoch": 2.49616368286445,
"grad_norm": 0.20666512725520833,
"learning_rate": 2.3457283423556206e-05,
"loss": 1.0484,
"step": 488
},
{
"epoch": 2.501278772378517,
"grad_norm": 0.18951044504687325,
"learning_rate": 2.338669587138897e-05,
"loss": 1.0577,
"step": 489
},
{
"epoch": 2.506393861892583,
"grad_norm": 0.16411315959112188,
"learning_rate": 2.33160648605656e-05,
"loss": 1.0482,
"step": 490
},
{
"epoch": 2.5115089514066495,
"grad_norm": 0.20692388287261934,
"learning_rate": 2.3245391297435208e-05,
"loss": 1.0296,
"step": 491
},
{
"epoch": 2.516624040920716,
"grad_norm": 0.22236042948249146,
"learning_rate": 2.3174676088892955e-05,
"loss": 1.0461,
"step": 492
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.21800230689438133,
"learning_rate": 2.3103920142368392e-05,
"loss": 1.0433,
"step": 493
},
{
"epoch": 2.526854219948849,
"grad_norm": 0.18373456124142468,
"learning_rate": 2.3033124365813845e-05,
"loss": 1.0569,
"step": 494
},
{
"epoch": 2.531969309462916,
"grad_norm": 0.18720140331921853,
"learning_rate": 2.2962289667692717e-05,
"loss": 1.0729,
"step": 495
},
{
"epoch": 2.5370843989769822,
"grad_norm": 0.25688260624393866,
"learning_rate": 2.2891416956967883e-05,
"loss": 1.0662,
"step": 496
},
{
"epoch": 2.5421994884910486,
"grad_norm": 0.22685524650641428,
"learning_rate": 2.2820507143089986e-05,
"loss": 1.0482,
"step": 497
},
{
"epoch": 2.547314578005115,
"grad_norm": 0.22006061149849276,
"learning_rate": 2.27495611359858e-05,
"loss": 1.0413,
"step": 498
},
{
"epoch": 2.5524296675191813,
"grad_norm": 0.2511924102749484,
"learning_rate": 2.2678579846046526e-05,
"loss": 1.0525,
"step": 499
},
{
"epoch": 2.557544757033248,
"grad_norm": 0.2700011728698647,
"learning_rate": 2.2607564184116125e-05,
"loss": 1.0477,
"step": 500
},
{
"epoch": 2.5626598465473145,
"grad_norm": 0.1824818058130156,
"learning_rate": 2.2536515061479607e-05,
"loss": 1.0464,
"step": 501
},
{
"epoch": 2.5677749360613813,
"grad_norm": 0.1815390402019666,
"learning_rate": 2.2465433389851387e-05,
"loss": 1.0728,
"step": 502
},
{
"epoch": 2.5728900255754477,
"grad_norm": 0.20838139814968445,
"learning_rate": 2.2394320081363527e-05,
"loss": 1.037,
"step": 503
},
{
"epoch": 2.578005115089514,
"grad_norm": 0.21820814237765307,
"learning_rate": 2.2323176048554074e-05,
"loss": 1.0783,
"step": 504
},
{
"epoch": 2.5831202046035804,
"grad_norm": 0.18364164718719542,
"learning_rate": 2.2252002204355333e-05,
"loss": 1.0616,
"step": 505
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.1589951601627371,
"learning_rate": 2.2180799462082145e-05,
"loss": 1.0597,
"step": 506
},
{
"epoch": 2.5933503836317136,
"grad_norm": 0.16399196023280369,
"learning_rate": 2.2109568735420183e-05,
"loss": 1.0541,
"step": 507
},
{
"epoch": 2.59846547314578,
"grad_norm": 0.1797478143219866,
"learning_rate": 2.203831093841422e-05,
"loss": 1.042,
"step": 508
},
{
"epoch": 2.6035805626598467,
"grad_norm": 0.17837990860903477,
"learning_rate": 2.19670269854564e-05,
"loss": 1.057,
"step": 509
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.1824720167101557,
"learning_rate": 2.189571779127451e-05,
"loss": 1.0342,
"step": 510
},
{
"epoch": 2.6138107416879794,
"grad_norm": 0.196252683041784,
"learning_rate": 2.182438427092022e-05,
"loss": 1.0468,
"step": 511
},
{
"epoch": 2.618925831202046,
"grad_norm": 0.18345210192186157,
"learning_rate": 2.1753027339757367e-05,
"loss": 1.0555,
"step": 512
},
{
"epoch": 2.6240409207161126,
"grad_norm": 0.210072150573065,
"learning_rate": 2.1681647913450208e-05,
"loss": 1.0474,
"step": 513
},
{
"epoch": 2.629156010230179,
"grad_norm": 0.1649906960552718,
"learning_rate": 2.161024690795166e-05,
"loss": 1.0615,
"step": 514
},
{
"epoch": 2.634271099744246,
"grad_norm": 0.19273203824222748,
"learning_rate": 2.1538825239491525e-05,
"loss": 1.0619,
"step": 515
},
{
"epoch": 2.639386189258312,
"grad_norm": 0.17231795376570638,
"learning_rate": 2.1467383824564793e-05,
"loss": 1.0427,
"step": 516
},
{
"epoch": 2.6445012787723785,
"grad_norm": 0.2028517262474422,
"learning_rate": 2.1395923579919805e-05,
"loss": 1.0736,
"step": 517
},
{
"epoch": 2.649616368286445,
"grad_norm": 0.18318105559288345,
"learning_rate": 2.1324445422546562e-05,
"loss": 1.0541,
"step": 518
},
{
"epoch": 2.6547314578005117,
"grad_norm": 0.19128403854869414,
"learning_rate": 2.1252950269664897e-05,
"loss": 1.0483,
"step": 519
},
{
"epoch": 2.659846547314578,
"grad_norm": 0.18383821827497523,
"learning_rate": 2.1181439038712747e-05,
"loss": 1.0645,
"step": 520
},
{
"epoch": 2.6649616368286444,
"grad_norm": 0.18884844033700063,
"learning_rate": 2.1109912647334375e-05,
"loss": 1.062,
"step": 521
},
{
"epoch": 2.670076726342711,
"grad_norm": 0.20868590019750766,
"learning_rate": 2.1038372013368553e-05,
"loss": 1.0354,
"step": 522
},
{
"epoch": 2.6751918158567776,
"grad_norm": 0.19337083461260082,
"learning_rate": 2.0966818054836852e-05,
"loss": 1.0344,
"step": 523
},
{
"epoch": 2.680306905370844,
"grad_norm": 0.18868476730256678,
"learning_rate": 2.08952516899318e-05,
"loss": 1.0588,
"step": 524
},
{
"epoch": 2.6854219948849103,
"grad_norm": 0.22485066460796876,
"learning_rate": 2.0823673837005146e-05,
"loss": 1.0471,
"step": 525
},
{
"epoch": 2.690537084398977,
"grad_norm": 0.17863420314473608,
"learning_rate": 2.075208541455604e-05,
"loss": 1.0526,
"step": 526
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.16122683418094333,
"learning_rate": 2.068048734121927e-05,
"loss": 1.0419,
"step": 527
},
{
"epoch": 2.70076726342711,
"grad_norm": 0.19335277655052013,
"learning_rate": 2.0608880535753456e-05,
"loss": 1.0498,
"step": 528
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.17128814463154046,
"learning_rate": 2.0537265917029282e-05,
"loss": 1.0549,
"step": 529
},
{
"epoch": 2.710997442455243,
"grad_norm": 0.19141186780590191,
"learning_rate": 2.046564440401769e-05,
"loss": 1.0535,
"step": 530
},
{
"epoch": 2.7161125319693094,
"grad_norm": 0.20405582061795946,
"learning_rate": 2.039401691577809e-05,
"loss": 1.0692,
"step": 531
},
{
"epoch": 2.7212276214833757,
"grad_norm": 0.17731656359333237,
"learning_rate": 2.0322384371446563e-05,
"loss": 1.0752,
"step": 532
},
{
"epoch": 2.7263427109974425,
"grad_norm": 0.21842059415138476,
"learning_rate": 2.025074769022407e-05,
"loss": 1.0364,
"step": 533
},
{
"epoch": 2.731457800511509,
"grad_norm": 0.20651080785780135,
"learning_rate": 2.0179107791364662e-05,
"loss": 1.0626,
"step": 534
},
{
"epoch": 2.7365728900255757,
"grad_norm": 0.17155059878215104,
"learning_rate": 2.0107465594163686e-05,
"loss": 1.0715,
"step": 535
},
{
"epoch": 2.741687979539642,
"grad_norm": 0.191605422088092,
"learning_rate": 2.0035822017945964e-05,
"loss": 1.0586,
"step": 536
},
{
"epoch": 2.7468030690537084,
"grad_norm": 0.20091866570259082,
"learning_rate": 1.996417798205404e-05,
"loss": 1.0481,
"step": 537
},
{
"epoch": 2.7519181585677748,
"grad_norm": 0.1631767258894682,
"learning_rate": 1.9892534405836314e-05,
"loss": 1.0727,
"step": 538
},
{
"epoch": 2.7570332480818416,
"grad_norm": 0.17582792226780383,
"learning_rate": 1.982089220863534e-05,
"loss": 1.0516,
"step": 539
},
{
"epoch": 2.762148337595908,
"grad_norm": 0.21550165289844242,
"learning_rate": 1.974925230977594e-05,
"loss": 1.038,
"step": 540
},
{
"epoch": 2.7672634271099743,
"grad_norm": 0.1535322128634846,
"learning_rate": 1.9677615628553447e-05,
"loss": 1.0406,
"step": 541
},
{
"epoch": 2.772378516624041,
"grad_norm": 0.19027107621098235,
"learning_rate": 1.9605983084221918e-05,
"loss": 1.0602,
"step": 542
},
{
"epoch": 2.7774936061381075,
"grad_norm": 0.18202864426113524,
"learning_rate": 1.953435559598231e-05,
"loss": 1.0615,
"step": 543
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.174647540829116,
"learning_rate": 1.946273408297072e-05,
"loss": 1.0261,
"step": 544
},
{
"epoch": 2.78772378516624,
"grad_norm": 0.1792789344961753,
"learning_rate": 1.939111946424655e-05,
"loss": 1.0438,
"step": 545
},
{
"epoch": 2.792838874680307,
"grad_norm": 0.16977791807638054,
"learning_rate": 1.9319512658780735e-05,
"loss": 1.0342,
"step": 546
},
{
"epoch": 2.7979539641943734,
"grad_norm": 0.184556004403152,
"learning_rate": 1.9247914585443963e-05,
"loss": 1.0583,
"step": 547
},
{
"epoch": 2.80306905370844,
"grad_norm": 0.1657344560842826,
"learning_rate": 1.9176326162994854e-05,
"loss": 1.0709,
"step": 548
},
{
"epoch": 2.8081841432225065,
"grad_norm": 0.22030489192290154,
"learning_rate": 1.9104748310068203e-05,
"loss": 1.0708,
"step": 549
},
{
"epoch": 2.813299232736573,
"grad_norm": 0.16653582773895698,
"learning_rate": 1.9033181945163158e-05,
"loss": 1.0528,
"step": 550
},
{
"epoch": 2.8184143222506393,
"grad_norm": 0.19617601692800635,
"learning_rate": 1.8961627986631453e-05,
"loss": 1.0601,
"step": 551
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.19528571595370786,
"learning_rate": 1.889008735266564e-05,
"loss": 1.0538,
"step": 552
},
{
"epoch": 2.8286445012787724,
"grad_norm": 0.18171426693241638,
"learning_rate": 1.8818560961287257e-05,
"loss": 1.047,
"step": 553
},
{
"epoch": 2.833759590792839,
"grad_norm": 0.20058111644859636,
"learning_rate": 1.8747049730335113e-05,
"loss": 1.0645,
"step": 554
},
{
"epoch": 2.8388746803069056,
"grad_norm": 0.16553935567855677,
"learning_rate": 1.8675554577453445e-05,
"loss": 1.064,
"step": 555
},
{
"epoch": 2.843989769820972,
"grad_norm": 0.1967672535744814,
"learning_rate": 1.8604076420080198e-05,
"loss": 1.0577,
"step": 556
},
{
"epoch": 2.8491048593350383,
"grad_norm": 0.1756479122957693,
"learning_rate": 1.8532616175435218e-05,
"loss": 1.0505,
"step": 557
},
{
"epoch": 2.8542199488491047,
"grad_norm": 0.17168091235870145,
"learning_rate": 1.8461174760508475e-05,
"loss": 1.0519,
"step": 558
},
{
"epoch": 2.8593350383631715,
"grad_norm": 0.15321315491437887,
"learning_rate": 1.8389753092048347e-05,
"loss": 1.0486,
"step": 559
},
{
"epoch": 2.864450127877238,
"grad_norm": 0.18225430033662582,
"learning_rate": 1.8318352086549792e-05,
"loss": 1.066,
"step": 560
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.16006236619371164,
"learning_rate": 1.8246972660242636e-05,
"loss": 1.0515,
"step": 561
},
{
"epoch": 2.874680306905371,
"grad_norm": 0.16253091851156742,
"learning_rate": 1.8175615729079795e-05,
"loss": 1.0541,
"step": 562
},
{
"epoch": 2.8797953964194374,
"grad_norm": 0.16423315718989012,
"learning_rate": 1.8104282208725496e-05,
"loss": 1.0667,
"step": 563
},
{
"epoch": 2.8849104859335037,
"grad_norm": 0.16926338455307693,
"learning_rate": 1.8032973014543608e-05,
"loss": 1.0313,
"step": 564
},
{
"epoch": 2.89002557544757,
"grad_norm": 0.17421414000562616,
"learning_rate": 1.7961689061585778e-05,
"loss": 1.0536,
"step": 565
},
{
"epoch": 2.895140664961637,
"grad_norm": 0.17451819177181235,
"learning_rate": 1.7890431264579823e-05,
"loss": 1.0467,
"step": 566
},
{
"epoch": 2.9002557544757033,
"grad_norm": 0.15316592607524337,
"learning_rate": 1.7819200537917865e-05,
"loss": 1.0362,
"step": 567
},
{
"epoch": 2.90537084398977,
"grad_norm": 0.15871802556233625,
"learning_rate": 1.7747997795644673e-05,
"loss": 1.0464,
"step": 568
},
{
"epoch": 2.9104859335038364,
"grad_norm": 0.16383928906479925,
"learning_rate": 1.7676823951445932e-05,
"loss": 1.0643,
"step": 569
},
{
"epoch": 2.915601023017903,
"grad_norm": 0.1564037056868544,
"learning_rate": 1.7605679918636477e-05,
"loss": 1.0468,
"step": 570
},
{
"epoch": 2.920716112531969,
"grad_norm": 0.16292314246842024,
"learning_rate": 1.753456661014862e-05,
"loss": 1.0627,
"step": 571
},
{
"epoch": 2.9258312020460355,
"grad_norm": 0.17007157486665242,
"learning_rate": 1.7463484938520403e-05,
"loss": 1.0543,
"step": 572
},
{
"epoch": 2.9309462915601023,
"grad_norm": 0.18572662827402042,
"learning_rate": 1.7392435815883882e-05,
"loss": 1.0414,
"step": 573
},
{
"epoch": 2.9360613810741687,
"grad_norm": 0.17257165847671296,
"learning_rate": 1.732142015395348e-05,
"loss": 1.0529,
"step": 574
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.20412881132319738,
"learning_rate": 1.72504388640142e-05,
"loss": 1.0625,
"step": 575
},
{
"epoch": 2.946291560102302,
"grad_norm": 0.20481457378652648,
"learning_rate": 1.717949285691002e-05,
"loss": 1.0555,
"step": 576
},
{
"epoch": 2.9514066496163682,
"grad_norm": 0.20710495242836005,
"learning_rate": 1.7108583043032128e-05,
"loss": 1.0676,
"step": 577
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.18506419413597094,
"learning_rate": 1.703771033230729e-05,
"loss": 1.073,
"step": 578
},
{
"epoch": 2.9616368286445014,
"grad_norm": 0.21026685381302263,
"learning_rate": 1.6966875634186165e-05,
"loss": 1.0489,
"step": 579
},
{
"epoch": 2.9667519181585678,
"grad_norm": 0.1426296083677054,
"learning_rate": 1.6896079857631608e-05,
"loss": 1.0551,
"step": 580
},
{
"epoch": 2.971867007672634,
"grad_norm": 0.21547970346674888,
"learning_rate": 1.682532391110705e-05,
"loss": 1.0405,
"step": 581
},
{
"epoch": 2.976982097186701,
"grad_norm": 0.1789073483963138,
"learning_rate": 1.67546087025648e-05,
"loss": 1.0597,
"step": 582
},
{
"epoch": 2.9820971867007673,
"grad_norm": 0.17706811680146436,
"learning_rate": 1.6683935139434407e-05,
"loss": 1.0613,
"step": 583
},
{
"epoch": 2.9872122762148337,
"grad_norm": 0.15139119384163008,
"learning_rate": 1.6613304128611033e-05,
"loss": 1.0619,
"step": 584
},
{
"epoch": 2.9923273657289,
"grad_norm": 0.14879029530531915,
"learning_rate": 1.6542716576443794e-05,
"loss": 1.0425,
"step": 585
},
{
"epoch": 2.997442455242967,
"grad_norm": 0.17723730809616817,
"learning_rate": 1.647217338872414e-05,
"loss": 1.0587,
"step": 586
},
{
"epoch": 3.002557544757033,
"grad_norm": 0.3071736955013866,
"learning_rate": 1.6401675470674227e-05,
"loss": 1.6121,
"step": 587
},
{
"epoch": 3.0076726342710995,
"grad_norm": 0.26492500317946227,
"learning_rate": 1.633122372693529e-05,
"loss": 1.0672,
"step": 588
},
{
"epoch": 3.0127877237851663,
"grad_norm": 0.15890708426469805,
"learning_rate": 1.626081906155608e-05,
"loss": 1.0691,
"step": 589
},
{
"epoch": 3.0179028132992327,
"grad_norm": 0.2443938647122444,
"learning_rate": 1.6190462377981195e-05,
"loss": 1.0177,
"step": 590
},
{
"epoch": 3.023017902813299,
"grad_norm": 0.219676833544469,
"learning_rate": 1.6120154579039545e-05,
"loss": 1.059,
"step": 591
},
{
"epoch": 3.028132992327366,
"grad_norm": 0.1914974178554955,
"learning_rate": 1.6049896566932734e-05,
"loss": 1.0517,
"step": 592
},
{
"epoch": 3.0332480818414322,
"grad_norm": 0.2531309353890888,
"learning_rate": 1.59796892432235e-05,
"loss": 1.0297,
"step": 593
},
{
"epoch": 3.0383631713554986,
"grad_norm": 0.1792627598925816,
"learning_rate": 1.5909533508824134e-05,
"loss": 1.0276,
"step": 594
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.2125713178736253,
"learning_rate": 1.5839430263984918e-05,
"loss": 1.04,
"step": 595
},
{
"epoch": 3.0485933503836318,
"grad_norm": 0.25332731139582426,
"learning_rate": 1.5769380408282597e-05,
"loss": 1.0311,
"step": 596
},
{
"epoch": 3.053708439897698,
"grad_norm": 0.14333190077941838,
"learning_rate": 1.5699384840608796e-05,
"loss": 1.0431,
"step": 597
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.22250304106653512,
"learning_rate": 1.5629444459158526e-05,
"loss": 1.0448,
"step": 598
},
{
"epoch": 3.0639386189258313,
"grad_norm": 0.15763841427027347,
"learning_rate": 1.5559560161418635e-05,
"loss": 1.0447,
"step": 599
},
{
"epoch": 3.0690537084398977,
"grad_norm": 0.18154306752668684,
"learning_rate": 1.5489732844156267e-05,
"loss": 1.058,
"step": 600
},
{
"epoch": 3.074168797953964,
"grad_norm": 0.17175626290889617,
"learning_rate": 1.5419963403407437e-05,
"loss": 1.0456,
"step": 601
},
{
"epoch": 3.079283887468031,
"grad_norm": 0.15750320891130676,
"learning_rate": 1.535025273446542e-05,
"loss": 1.0565,
"step": 602
},
{
"epoch": 3.084398976982097,
"grad_norm": 0.1587964508315697,
"learning_rate": 1.5280601731869375e-05,
"loss": 1.0545,
"step": 603
},
{
"epoch": 3.0895140664961636,
"grad_norm": 0.15466633117165798,
"learning_rate": 1.5211011289392775e-05,
"loss": 1.0793,
"step": 604
},
{
"epoch": 3.0946291560102304,
"grad_norm": 0.16479484418980225,
"learning_rate": 1.514148230003198e-05,
"loss": 1.0527,
"step": 605
},
{
"epoch": 3.0997442455242967,
"grad_norm": 0.14980835645414117,
"learning_rate": 1.5072015655994793e-05,
"loss": 1.0261,
"step": 606
},
{
"epoch": 3.104859335038363,
"grad_norm": 0.14149499986126193,
"learning_rate": 1.500261224868897e-05,
"loss": 1.0639,
"step": 607
},
{
"epoch": 3.10997442455243,
"grad_norm": 0.14663927333662252,
"learning_rate": 1.4933272968710819e-05,
"loss": 1.0533,
"step": 608
},
{
"epoch": 3.1150895140664963,
"grad_norm": 0.12780353135571323,
"learning_rate": 1.486399870583374e-05,
"loss": 1.0507,
"step": 609
},
{
"epoch": 3.1202046035805626,
"grad_norm": 0.1568073693632928,
"learning_rate": 1.4794790348996833e-05,
"loss": 1.0581,
"step": 610
},
{
"epoch": 3.125319693094629,
"grad_norm": 0.1384187110673024,
"learning_rate": 1.4725648786293478e-05,
"loss": 1.0005,
"step": 611
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.15518779732930923,
"learning_rate": 1.4656574904959937e-05,
"loss": 1.0461,
"step": 612
},
{
"epoch": 3.135549872122762,
"grad_norm": 0.1377191810212374,
"learning_rate": 1.4587569591363988e-05,
"loss": 1.039,
"step": 613
},
{
"epoch": 3.1406649616368285,
"grad_norm": 0.14717408978482713,
"learning_rate": 1.4518633730993515e-05,
"loss": 1.0585,
"step": 614
},
{
"epoch": 3.1457800511508953,
"grad_norm": 0.14545458947244394,
"learning_rate": 1.444976820844517e-05,
"loss": 1.0589,
"step": 615
},
{
"epoch": 3.1508951406649617,
"grad_norm": 0.15779651560328295,
"learning_rate": 1.438097390741304e-05,
"loss": 1.0507,
"step": 616
},
{
"epoch": 3.156010230179028,
"grad_norm": 0.16768466110672925,
"learning_rate": 1.431225171067726e-05,
"loss": 1.0798,
"step": 617
},
{
"epoch": 3.1611253196930944,
"grad_norm": 0.1360648364823117,
"learning_rate": 1.4243602500092725e-05,
"loss": 1.0532,
"step": 618
},
{
"epoch": 3.166240409207161,
"grad_norm": 0.16595760551607416,
"learning_rate": 1.4175027156577757e-05,
"loss": 1.0399,
"step": 619
},
{
"epoch": 3.1713554987212276,
"grad_norm": 0.1698846080059336,
"learning_rate": 1.4106526560102788e-05,
"loss": 1.0342,
"step": 620
},
{
"epoch": 3.176470588235294,
"grad_norm": 0.14590952827196066,
"learning_rate": 1.4038101589679115e-05,
"loss": 1.0259,
"step": 621
},
{
"epoch": 3.1815856777493607,
"grad_norm": 0.14992604919353308,
"learning_rate": 1.3969753123347553e-05,
"loss": 1.0317,
"step": 622
},
{
"epoch": 3.186700767263427,
"grad_norm": 0.15939032054557312,
"learning_rate": 1.3901482038167235e-05,
"loss": 1.0294,
"step": 623
},
{
"epoch": 3.1918158567774935,
"grad_norm": 0.14501109411782664,
"learning_rate": 1.3833289210204299e-05,
"loss": 1.0515,
"step": 624
},
{
"epoch": 3.1969309462915603,
"grad_norm": 0.1539538622034391,
"learning_rate": 1.3765175514520697e-05,
"loss": 1.0351,
"step": 625
},
{
"epoch": 3.2020460358056266,
"grad_norm": 0.13029762347002058,
"learning_rate": 1.3697141825162928e-05,
"loss": 1.0483,
"step": 626
},
{
"epoch": 3.207161125319693,
"grad_norm": 0.14812456235087262,
"learning_rate": 1.3629189015150824e-05,
"loss": 1.0208,
"step": 627
},
{
"epoch": 3.21227621483376,
"grad_norm": 0.13396106035236727,
"learning_rate": 1.3561317956466397e-05,
"loss": 1.0326,
"step": 628
},
{
"epoch": 3.217391304347826,
"grad_norm": 0.1341030709959724,
"learning_rate": 1.3493529520042574e-05,
"loss": 1.0398,
"step": 629
},
{
"epoch": 3.2225063938618925,
"grad_norm": 0.13625306065057746,
"learning_rate": 1.3425824575752082e-05,
"loss": 1.0564,
"step": 630
},
{
"epoch": 3.227621483375959,
"grad_norm": 0.14453871577407487,
"learning_rate": 1.3358203992396253e-05,
"loss": 1.0316,
"step": 631
},
{
"epoch": 3.2327365728900257,
"grad_norm": 0.12896562231375275,
"learning_rate": 1.3290668637693883e-05,
"loss": 1.0368,
"step": 632
},
{
"epoch": 3.237851662404092,
"grad_norm": 0.138959236048212,
"learning_rate": 1.3223219378270114e-05,
"loss": 1.0375,
"step": 633
},
{
"epoch": 3.2429667519181584,
"grad_norm": 0.1395052519380716,
"learning_rate": 1.315585707964528e-05,
"loss": 1.0195,
"step": 634
},
{
"epoch": 3.2480818414322252,
"grad_norm": 0.1374425398825904,
"learning_rate": 1.3088582606223836e-05,
"loss": 1.0337,
"step": 635
},
{
"epoch": 3.2531969309462916,
"grad_norm": 0.15730060552663264,
"learning_rate": 1.3021396821283242e-05,
"loss": 1.045,
"step": 636
},
{
"epoch": 3.258312020460358,
"grad_norm": 0.13746345154314507,
"learning_rate": 1.295430058696287e-05,
"loss": 1.0303,
"step": 637
},
{
"epoch": 3.2634271099744243,
"grad_norm": 0.15421183051065532,
"learning_rate": 1.288729476425302e-05,
"loss": 1.0418,
"step": 638
},
{
"epoch": 3.268542199488491,
"grad_norm": 0.1598425847647116,
"learning_rate": 1.2820380212983748e-05,
"loss": 1.0481,
"step": 639
},
{
"epoch": 3.2736572890025575,
"grad_norm": 0.1390236215037552,
"learning_rate": 1.2753557791813953e-05,
"loss": 1.0649,
"step": 640
},
{
"epoch": 3.2787723785166243,
"grad_norm": 0.19941059675590572,
"learning_rate": 1.2686828358220273e-05,
"loss": 1.0573,
"step": 641
},
{
"epoch": 3.2838874680306906,
"grad_norm": 0.14878037647197712,
"learning_rate": 1.2620192768486121e-05,
"loss": 1.0395,
"step": 642
},
{
"epoch": 3.289002557544757,
"grad_norm": 0.15545058295048259,
"learning_rate": 1.25536518776907e-05,
"loss": 1.0382,
"step": 643
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.14131847135345135,
"learning_rate": 1.2487206539698007e-05,
"loss": 1.0397,
"step": 644
},
{
"epoch": 3.29923273657289,
"grad_norm": 0.16100758922061306,
"learning_rate": 1.2420857607145897e-05,
"loss": 1.0117,
"step": 645
},
{
"epoch": 3.3043478260869565,
"grad_norm": 0.17269827345207395,
"learning_rate": 1.2354605931435133e-05,
"loss": 1.0463,
"step": 646
},
{
"epoch": 3.309462915601023,
"grad_norm": 0.1569003313816037,
"learning_rate": 1.2288452362718454e-05,
"loss": 1.0166,
"step": 647
},
{
"epoch": 3.3145780051150897,
"grad_norm": 0.16537040330310332,
"learning_rate": 1.2222397749889691e-05,
"loss": 1.0399,
"step": 648
},
{
"epoch": 3.319693094629156,
"grad_norm": 0.13784997627726225,
"learning_rate": 1.2156442940572835e-05,
"loss": 1.0235,
"step": 649
},
{
"epoch": 3.3248081841432224,
"grad_norm": 0.1545373822751002,
"learning_rate": 1.2090588781111197e-05,
"loss": 1.0643,
"step": 650
},
{
"epoch": 3.329923273657289,
"grad_norm": 0.14181454877401659,
"learning_rate": 1.202483611655653e-05,
"loss": 1.0407,
"step": 651
},
{
"epoch": 3.3350383631713556,
"grad_norm": 0.15648818999987013,
"learning_rate": 1.195918579065817e-05,
"loss": 1.0333,
"step": 652
},
{
"epoch": 3.340153452685422,
"grad_norm": 0.13765084151466453,
"learning_rate": 1.1893638645852254e-05,
"loss": 1.0534,
"step": 653
},
{
"epoch": 3.3452685421994883,
"grad_norm": 0.12576483675459751,
"learning_rate": 1.1828195523250857e-05,
"loss": 1.0149,
"step": 654
},
{
"epoch": 3.350383631713555,
"grad_norm": 0.13956835034375759,
"learning_rate": 1.176285726263124e-05,
"loss": 1.052,
"step": 655
},
{
"epoch": 3.3554987212276215,
"grad_norm": 0.1335543619596467,
"learning_rate": 1.1697624702425045e-05,
"loss": 1.035,
"step": 656
},
{
"epoch": 3.360613810741688,
"grad_norm": 0.1572699150312706,
"learning_rate": 1.1632498679707562e-05,
"loss": 1.0416,
"step": 657
},
{
"epoch": 3.3657289002557547,
"grad_norm": 0.14309357126008015,
"learning_rate": 1.1567480030186968e-05,
"loss": 1.0237,
"step": 658
},
{
"epoch": 3.370843989769821,
"grad_norm": 0.14894097005492454,
"learning_rate": 1.1502569588193586e-05,
"loss": 1.0584,
"step": 659
},
{
"epoch": 3.3759590792838874,
"grad_norm": 0.12923588532019384,
"learning_rate": 1.1437768186669253e-05,
"loss": 1.0327,
"step": 660
},
{
"epoch": 3.381074168797954,
"grad_norm": 0.16813432468192543,
"learning_rate": 1.1373076657156532e-05,
"loss": 1.0162,
"step": 661
},
{
"epoch": 3.3861892583120206,
"grad_norm": 0.13954296082267167,
"learning_rate": 1.1308495829788115e-05,
"loss": 1.0317,
"step": 662
},
{
"epoch": 3.391304347826087,
"grad_norm": 0.16248894290277624,
"learning_rate": 1.1244026533276127e-05,
"loss": 1.0551,
"step": 663
},
{
"epoch": 3.3964194373401533,
"grad_norm": 0.16453679177936717,
"learning_rate": 1.1179669594901528e-05,
"loss": 1.0159,
"step": 664
},
{
"epoch": 3.40153452685422,
"grad_norm": 0.14385678092390486,
"learning_rate": 1.1115425840503482e-05,
"loss": 1.065,
"step": 665
},
{
"epoch": 3.4066496163682864,
"grad_norm": 0.1465819532723495,
"learning_rate": 1.1051296094468729e-05,
"loss": 1.047,
"step": 666
},
{
"epoch": 3.411764705882353,
"grad_norm": 0.1695383670185565,
"learning_rate": 1.098728117972106e-05,
"loss": 1.0356,
"step": 667
},
{
"epoch": 3.4168797953964196,
"grad_norm": 0.13657609259702944,
"learning_rate": 1.0923381917710736e-05,
"loss": 1.054,
"step": 668
},
{
"epoch": 3.421994884910486,
"grad_norm": 0.15659973454341425,
"learning_rate": 1.0859599128403912e-05,
"loss": 1.0221,
"step": 669
},
{
"epoch": 3.4271099744245523,
"grad_norm": 0.10542430698183479,
"learning_rate": 1.0795933630272181e-05,
"loss": 1.0594,
"step": 670
},
{
"epoch": 3.4322250639386187,
"grad_norm": 0.17862668891732322,
"learning_rate": 1.0732386240281998e-05,
"loss": 1.0591,
"step": 671
},
{
"epoch": 3.4373401534526855,
"grad_norm": 0.12133606037305075,
"learning_rate": 1.0668957773884281e-05,
"loss": 1.0303,
"step": 672
},
{
"epoch": 3.442455242966752,
"grad_norm": 0.15936735682917638,
"learning_rate": 1.0605649045003861e-05,
"loss": 1.0516,
"step": 673
},
{
"epoch": 3.4475703324808182,
"grad_norm": 0.13145770125675021,
"learning_rate": 1.0542460866029086e-05,
"loss": 1.0402,
"step": 674
},
{
"epoch": 3.452685421994885,
"grad_norm": 0.16728115574775487,
"learning_rate": 1.0479394047801392e-05,
"loss": 1.0857,
"step": 675
},
{
"epoch": 3.4578005115089514,
"grad_norm": 0.13370080173440338,
"learning_rate": 1.0416449399604898e-05,
"loss": 1.0455,
"step": 676
},
{
"epoch": 3.4629156010230178,
"grad_norm": 0.1600795388469388,
"learning_rate": 1.035362772915602e-05,
"loss": 1.027,
"step": 677
},
{
"epoch": 3.4680306905370846,
"grad_norm": 0.15111783496094752,
"learning_rate": 1.0290929842593074e-05,
"loss": 1.0377,
"step": 678
},
{
"epoch": 3.473145780051151,
"grad_norm": 0.1523541044075201,
"learning_rate": 1.022835654446599e-05,
"loss": 1.0403,
"step": 679
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.14954178374830374,
"learning_rate": 1.0165908637725957e-05,
"loss": 1.0701,
"step": 680
},
{
"epoch": 3.483375959079284,
"grad_norm": 0.1524360548398074,
"learning_rate": 1.0103586923715092e-05,
"loss": 1.0332,
"step": 681
},
{
"epoch": 3.4884910485933505,
"grad_norm": 0.13627756259558588,
"learning_rate": 1.0041392202156217e-05,
"loss": 1.0426,
"step": 682
},
{
"epoch": 3.493606138107417,
"grad_norm": 0.13374904883755645,
"learning_rate": 9.979325271142561e-06,
"loss": 1.0804,
"step": 683
},
{
"epoch": 3.498721227621483,
"grad_norm": 0.1335636067760998,
"learning_rate": 9.917386927127498e-06,
"loss": 1.0181,
"step": 684
},
{
"epoch": 3.50383631713555,
"grad_norm": 0.13186547128081622,
"learning_rate": 9.855577964914385e-06,
"loss": 1.0611,
"step": 685
},
{
"epoch": 3.5089514066496164,
"grad_norm": 0.14160556585225306,
"learning_rate": 9.793899177646297e-06,
"loss": 1.0724,
"step": 686
},
{
"epoch": 3.5140664961636827,
"grad_norm": 0.14343913251962634,
"learning_rate": 9.73235135679591e-06,
"loss": 1.0421,
"step": 687
},
{
"epoch": 3.5191815856777495,
"grad_norm": 0.13744196044235218,
"learning_rate": 9.670935292155313e-06,
"loss": 1.0165,
"step": 688
},
{
"epoch": 3.524296675191816,
"grad_norm": 0.15599826782771067,
"learning_rate": 9.60965177182585e-06,
"loss": 1.0441,
"step": 689
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.134874002528585,
"learning_rate": 9.548501582208065e-06,
"loss": 1.0521,
"step": 690
},
{
"epoch": 3.5345268542199486,
"grad_norm": 0.1483611542190334,
"learning_rate": 9.48748550799157e-06,
"loss": 1.0797,
"step": 691
},
{
"epoch": 3.5396419437340154,
"grad_norm": 0.13453528613994495,
"learning_rate": 9.426604332144985e-06,
"loss": 1.0437,
"step": 692
},
{
"epoch": 3.544757033248082,
"grad_norm": 0.14159262869376904,
"learning_rate": 9.365858835905878e-06,
"loss": 1.0453,
"step": 693
},
{
"epoch": 3.5498721227621486,
"grad_norm": 0.12925051990957068,
"learning_rate": 9.305249798770774e-06,
"loss": 1.0373,
"step": 694
},
{
"epoch": 3.554987212276215,
"grad_norm": 0.12332769648279926,
"learning_rate": 9.244777998485129e-06,
"loss": 1.0673,
"step": 695
},
{
"epoch": 3.5601023017902813,
"grad_norm": 0.12304869088961108,
"learning_rate": 9.184444211033333e-06,
"loss": 1.0324,
"step": 696
},
{
"epoch": 3.5652173913043477,
"grad_norm": 0.11538632936434459,
"learning_rate": 9.124249210628795e-06,
"loss": 1.0584,
"step": 697
},
{
"epoch": 3.5703324808184145,
"grad_norm": 0.13689773750974635,
"learning_rate": 9.064193769703957e-06,
"loss": 1.0391,
"step": 698
},
{
"epoch": 3.575447570332481,
"grad_norm": 0.11857750937375874,
"learning_rate": 9.004278658900456e-06,
"loss": 1.0401,
"step": 699
},
{
"epoch": 3.580562659846547,
"grad_norm": 0.1492037904190607,
"learning_rate": 8.94450464705915e-06,
"loss": 1.0467,
"step": 700
},
{
"epoch": 3.585677749360614,
"grad_norm": 0.12369105867375763,
"learning_rate": 8.884872501210288e-06,
"loss": 1.0372,
"step": 701
},
{
"epoch": 3.5907928388746804,
"grad_norm": 0.13306031841676783,
"learning_rate": 8.8253829865637e-06,
"loss": 1.039,
"step": 702
},
{
"epoch": 3.5959079283887467,
"grad_norm": 0.14095435133920914,
"learning_rate": 8.766036866498929e-06,
"loss": 1.0441,
"step": 703
},
{
"epoch": 3.601023017902813,
"grad_norm": 0.12562296469541392,
"learning_rate": 8.706834902555465e-06,
"loss": 1.0566,
"step": 704
},
{
"epoch": 3.60613810741688,
"grad_norm": 0.13493082025934391,
"learning_rate": 8.647777854422945e-06,
"loss": 1.014,
"step": 705
},
{
"epoch": 3.6112531969309463,
"grad_norm": 0.12574671572281473,
"learning_rate": 8.588866479931436e-06,
"loss": 1.0562,
"step": 706
},
{
"epoch": 3.6163682864450126,
"grad_norm": 0.12741627369712372,
"learning_rate": 8.530101535041701e-06,
"loss": 1.0359,
"step": 707
},
{
"epoch": 3.6214833759590794,
"grad_norm": 0.13342253434879386,
"learning_rate": 8.471483773835472e-06,
"loss": 1.0348,
"step": 708
},
{
"epoch": 3.626598465473146,
"grad_norm": 0.13078373294734275,
"learning_rate": 8.413013948505822e-06,
"loss": 1.0244,
"step": 709
},
{
"epoch": 3.631713554987212,
"grad_norm": 0.12343477770132286,
"learning_rate": 8.354692809347455e-06,
"loss": 1.0624,
"step": 710
},
{
"epoch": 3.6368286445012785,
"grad_norm": 0.1235318396130475,
"learning_rate": 8.296521104747135e-06,
"loss": 1.0422,
"step": 711
},
{
"epoch": 3.6419437340153453,
"grad_norm": 0.13132079552275194,
"learning_rate": 8.238499581174055e-06,
"loss": 1.0344,
"step": 712
},
{
"epoch": 3.6470588235294117,
"grad_norm": 0.12173789633241248,
"learning_rate": 8.180628983170235e-06,
"loss": 1.0552,
"step": 713
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.1348770641930468,
"learning_rate": 8.122910053341018e-06,
"loss": 1.0255,
"step": 714
},
{
"epoch": 3.657289002557545,
"grad_norm": 0.1174470741078736,
"learning_rate": 8.065343532345513e-06,
"loss": 1.0349,
"step": 715
},
{
"epoch": 3.662404092071611,
"grad_norm": 0.13327709149809897,
"learning_rate": 8.00793015888707e-06,
"loss": 1.061,
"step": 716
},
{
"epoch": 3.6675191815856776,
"grad_norm": 0.11675888468655334,
"learning_rate": 7.95067066970385e-06,
"loss": 1.0161,
"step": 717
},
{
"epoch": 3.6726342710997444,
"grad_norm": 0.12060066317225653,
"learning_rate": 7.893565799559335e-06,
"loss": 1.056,
"step": 718
},
{
"epoch": 3.6777493606138107,
"grad_norm": 0.12379438618187949,
"learning_rate": 7.836616281232913e-06,
"loss": 1.0606,
"step": 719
},
{
"epoch": 3.682864450127877,
"grad_norm": 0.11445856861073442,
"learning_rate": 7.779822845510463e-06,
"loss": 1.0239,
"step": 720
},
{
"epoch": 3.687979539641944,
"grad_norm": 0.11526838488380206,
"learning_rate": 7.723186221174976e-06,
"loss": 1.0248,
"step": 721
},
{
"epoch": 3.6930946291560103,
"grad_norm": 0.11128815380192054,
"learning_rate": 7.666707134997255e-06,
"loss": 1.0227,
"step": 722
},
{
"epoch": 3.6982097186700766,
"grad_norm": 0.12789433531914984,
"learning_rate": 7.610386311726494e-06,
"loss": 1.0391,
"step": 723
},
{
"epoch": 3.703324808184143,
"grad_norm": 0.12654463075348907,
"learning_rate": 7.554224474081073e-06,
"loss": 1.0569,
"step": 724
},
{
"epoch": 3.70843989769821,
"grad_norm": 0.12120187889801023,
"learning_rate": 7.498222342739205e-06,
"loss": 1.0316,
"step": 725
},
{
"epoch": 3.713554987212276,
"grad_norm": 0.12870715819071762,
"learning_rate": 7.442380636329754e-06,
"loss": 1.0272,
"step": 726
},
{
"epoch": 3.718670076726343,
"grad_norm": 0.12038936558963169,
"learning_rate": 7.386700071422977e-06,
"loss": 1.0442,
"step": 727
},
{
"epoch": 3.7237851662404093,
"grad_norm": 0.12005588437284867,
"learning_rate": 7.331181362521316e-06,
"loss": 1.0219,
"step": 728
},
{
"epoch": 3.7289002557544757,
"grad_norm": 0.13885672282036873,
"learning_rate": 7.2758252220502766e-06,
"loss": 1.0549,
"step": 729
},
{
"epoch": 3.734015345268542,
"grad_norm": 0.114685790777722,
"learning_rate": 7.220632360349245e-06,
"loss": 1.0531,
"step": 730
},
{
"epoch": 3.7391304347826084,
"grad_norm": 0.12796701045791628,
"learning_rate": 7.165603485662394e-06,
"loss": 1.0161,
"step": 731
},
{
"epoch": 3.7442455242966752,
"grad_norm": 0.12255668076944778,
"learning_rate": 7.110739304129575e-06,
"loss": 1.0506,
"step": 732
},
{
"epoch": 3.7493606138107416,
"grad_norm": 0.11147398883269082,
"learning_rate": 7.056040519777265e-06,
"loss": 1.0386,
"step": 733
},
{
"epoch": 3.7544757033248084,
"grad_norm": 0.11571505214353113,
"learning_rate": 7.001507834509573e-06,
"loss": 1.0268,
"step": 734
},
{
"epoch": 3.7595907928388748,
"grad_norm": 0.11338997067713637,
"learning_rate": 6.9471419480991495e-06,
"loss": 1.042,
"step": 735
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.10429396376138998,
"learning_rate": 6.892943558178289e-06,
"loss": 1.0491,
"step": 736
},
{
"epoch": 3.7698209718670075,
"grad_norm": 0.19807030388806532,
"learning_rate": 6.838913360229913e-06,
"loss": 1.0402,
"step": 737
},
{
"epoch": 3.7749360613810743,
"grad_norm": 0.10549982080612258,
"learning_rate": 6.785052047578697e-06,
"loss": 1.0358,
"step": 738
},
{
"epoch": 3.7800511508951407,
"grad_norm": 0.12088290265838489,
"learning_rate": 6.731360311382156e-06,
"loss": 1.0291,
"step": 739
},
{
"epoch": 3.785166240409207,
"grad_norm": 0.10642570829304446,
"learning_rate": 6.677838840621742e-06,
"loss": 1.0225,
"step": 740
},
{
"epoch": 3.790281329923274,
"grad_norm": 0.11324080154482034,
"learning_rate": 6.624488322094058e-06,
"loss": 1.0345,
"step": 741
},
{
"epoch": 3.79539641943734,
"grad_norm": 0.11373484571233722,
"learning_rate": 6.571309440402021e-06,
"loss": 1.0839,
"step": 742
},
{
"epoch": 3.8005115089514065,
"grad_norm": 0.11462945002712063,
"learning_rate": 6.518302877946048e-06,
"loss": 1.0669,
"step": 743
},
{
"epoch": 3.805626598465473,
"grad_norm": 0.11204861716772031,
"learning_rate": 6.465469314915352e-06,
"loss": 1.046,
"step": 744
},
{
"epoch": 3.8107416879795397,
"grad_norm": 0.12022178279314234,
"learning_rate": 6.412809429279179e-06,
"loss": 1.0512,
"step": 745
},
{
"epoch": 3.815856777493606,
"grad_norm": 0.12165382757174402,
"learning_rate": 6.3603238967781245e-06,
"loss": 1.0341,
"step": 746
},
{
"epoch": 3.820971867007673,
"grad_norm": 0.12397109577620231,
"learning_rate": 6.308013390915439e-06,
"loss": 1.0286,
"step": 747
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.12905831522454317,
"learning_rate": 6.255878582948409e-06,
"loss": 1.0732,
"step": 748
},
{
"epoch": 3.8312020460358056,
"grad_norm": 0.10775412647558373,
"learning_rate": 6.203920141879742e-06,
"loss": 1.031,
"step": 749
},
{
"epoch": 3.836317135549872,
"grad_norm": 0.12988530370192228,
"learning_rate": 6.1521387344489716e-06,
"loss": 1.0456,
"step": 750
},
{
"epoch": 3.8414322250639388,
"grad_norm": 0.1304249761399152,
"learning_rate": 6.100535025123908e-06,
"loss": 1.0363,
"step": 751
},
{
"epoch": 3.846547314578005,
"grad_norm": 0.10738021940802008,
"learning_rate": 6.049109676092097e-06,
"loss": 1.0397,
"step": 752
},
{
"epoch": 3.8516624040920715,
"grad_norm": 0.12589865567427963,
"learning_rate": 5.9978633472523505e-06,
"loss": 1.0501,
"step": 753
},
{
"epoch": 3.8567774936061383,
"grad_norm": 0.11070976622585119,
"learning_rate": 5.94679669620626e-06,
"loss": 1.0591,
"step": 754
},
{
"epoch": 3.8618925831202047,
"grad_norm": 0.1117884773522583,
"learning_rate": 5.895910378249749e-06,
"loss": 1.04,
"step": 755
},
{
"epoch": 3.867007672634271,
"grad_norm": 0.11784667783075373,
"learning_rate": 5.845205046364688e-06,
"loss": 1.0224,
"step": 756
},
{
"epoch": 3.8721227621483374,
"grad_norm": 0.11631341575223461,
"learning_rate": 5.7946813512105025e-06,
"loss": 1.0242,
"step": 757
},
{
"epoch": 3.877237851662404,
"grad_norm": 0.11957689310265991,
"learning_rate": 5.744339941115826e-06,
"loss": 1.0501,
"step": 758
},
{
"epoch": 3.8823529411764706,
"grad_norm": 0.11842983649303031,
"learning_rate": 5.694181462070172e-06,
"loss": 1.0725,
"step": 759
},
{
"epoch": 3.887468030690537,
"grad_norm": 0.11952338302619617,
"learning_rate": 5.644206557715641e-06,
"loss": 1.0664,
"step": 760
},
{
"epoch": 3.8925831202046037,
"grad_norm": 0.11892536713687875,
"learning_rate": 5.5944158693387116e-06,
"loss": 0.998,
"step": 761
},
{
"epoch": 3.89769820971867,
"grad_norm": 0.12406906776609274,
"learning_rate": 5.54481003586193e-06,
"loss": 1.0383,
"step": 762
},
{
"epoch": 3.9028132992327365,
"grad_norm": 0.12004615203676204,
"learning_rate": 5.495389693835777e-06,
"loss": 1.0641,
"step": 763
},
{
"epoch": 3.907928388746803,
"grad_norm": 0.11520566020799254,
"learning_rate": 5.446155477430459e-06,
"loss": 1.0434,
"step": 764
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.1120035353943794,
"learning_rate": 5.397108018427804e-06,
"loss": 1.0196,
"step": 765
},
{
"epoch": 3.918158567774936,
"grad_norm": 0.13347354730054195,
"learning_rate": 5.3482479462131295e-06,
"loss": 1.0531,
"step": 766
},
{
"epoch": 3.923273657289003,
"grad_norm": 0.11377182994995544,
"learning_rate": 5.299575887767166e-06,
"loss": 1.0447,
"step": 767
},
{
"epoch": 3.928388746803069,
"grad_norm": 0.10277267322916264,
"learning_rate": 5.251092467658032e-06,
"loss": 1.0431,
"step": 768
},
{
"epoch": 3.9335038363171355,
"grad_norm": 0.11344118476671607,
"learning_rate": 5.202798308033206e-06,
"loss": 1.0364,
"step": 769
},
{
"epoch": 3.938618925831202,
"grad_norm": 0.12174309556410545,
"learning_rate": 5.1546940286115314e-06,
"loss": 1.027,
"step": 770
},
{
"epoch": 3.9437340153452687,
"grad_norm": 0.1131547612504401,
"learning_rate": 5.106780246675293e-06,
"loss": 1.0181,
"step": 771
},
{
"epoch": 3.948849104859335,
"grad_norm": 0.11462221970998479,
"learning_rate": 5.059057577062256e-06,
"loss": 1.0391,
"step": 772
},
{
"epoch": 3.9539641943734014,
"grad_norm": 0.10731649265711139,
"learning_rate": 5.011526632157837e-06,
"loss": 1.0385,
"step": 773
},
{
"epoch": 3.959079283887468,
"grad_norm": 0.1068334719582534,
"learning_rate": 4.9641880218871775e-06,
"loss": 1.026,
"step": 774
},
{
"epoch": 3.9641943734015346,
"grad_norm": 0.1086853971132405,
"learning_rate": 4.917042353707351e-06,
"loss": 1.0907,
"step": 775
},
{
"epoch": 3.969309462915601,
"grad_norm": 0.10897664106001581,
"learning_rate": 4.870090232599576e-06,
"loss": 1.0469,
"step": 776
},
{
"epoch": 3.9744245524296673,
"grad_norm": 0.10850930161429445,
"learning_rate": 4.823332261061442e-06,
"loss": 1.0442,
"step": 777
},
{
"epoch": 3.979539641943734,
"grad_norm": 0.10718500232069637,
"learning_rate": 4.776769039099176e-06,
"loss": 1.0612,
"step": 778
},
{
"epoch": 3.9846547314578005,
"grad_norm": 0.10897845999718717,
"learning_rate": 4.7304011642199355e-06,
"loss": 1.0457,
"step": 779
},
{
"epoch": 3.9897698209718673,
"grad_norm": 0.10338741515429783,
"learning_rate": 4.6842292314241626e-06,
"loss": 1.0423,
"step": 780
},
{
"epoch": 3.9948849104859336,
"grad_norm": 0.1046539074162833,
"learning_rate": 4.638253833197943e-06,
"loss": 1.0668,
"step": 781
},
{
"epoch": 4.0,
"grad_norm": 0.16987244243701544,
"learning_rate": 4.592475559505374e-06,
"loss": 1.5661,
"step": 782
},
{
"epoch": 4.005115089514066,
"grad_norm": 0.10825655007416356,
"learning_rate": 4.5468949977810415e-06,
"loss": 1.0295,
"step": 783
},
{
"epoch": 4.010230179028133,
"grad_norm": 0.09709211523460262,
"learning_rate": 4.50151273292245e-06,
"loss": 1.0199,
"step": 784
},
{
"epoch": 4.015345268542199,
"grad_norm": 0.1106600074433777,
"learning_rate": 4.456329347282515e-06,
"loss": 1.044,
"step": 785
},
{
"epoch": 4.020460358056266,
"grad_norm": 0.11786657875371895,
"learning_rate": 4.4113454206621185e-06,
"loss": 1.0322,
"step": 786
},
{
"epoch": 4.025575447570333,
"grad_norm": 0.10523548089343922,
"learning_rate": 4.366561530302631e-06,
"loss": 1.0252,
"step": 787
},
{
"epoch": 4.030690537084399,
"grad_norm": 0.10903480563934011,
"learning_rate": 4.321978250878536e-06,
"loss": 1.0361,
"step": 788
},
{
"epoch": 4.035805626598465,
"grad_norm": 0.11279131671447776,
"learning_rate": 4.277596154490047e-06,
"loss": 1.0259,
"step": 789
},
{
"epoch": 4.040920716112532,
"grad_norm": 0.10911611333237252,
"learning_rate": 4.233415810655748e-06,
"loss": 1.0512,
"step": 790
},
{
"epoch": 4.046035805626598,
"grad_norm": 0.10899009342627193,
"learning_rate": 4.189437786305313e-06,
"loss": 1.0265,
"step": 791
},
{
"epoch": 4.051150895140665,
"grad_norm": 0.1208890385053231,
"learning_rate": 4.14566264577221e-06,
"loss": 1.0196,
"step": 792
},
{
"epoch": 4.056265984654732,
"grad_norm": 0.10117429151986053,
"learning_rate": 4.102090950786479e-06,
"loss": 1.0488,
"step": 793
},
{
"epoch": 4.061381074168798,
"grad_norm": 0.10061697427381379,
"learning_rate": 4.058723260467494e-06,
"loss": 1.0075,
"step": 794
},
{
"epoch": 4.0664961636828645,
"grad_norm": 0.1215862753853224,
"learning_rate": 4.0155601313168204e-06,
"loss": 1.0446,
"step": 795
},
{
"epoch": 4.071611253196931,
"grad_norm": 0.1097589288716949,
"learning_rate": 3.972602117211062e-06,
"loss": 1.0603,
"step": 796
},
{
"epoch": 4.076726342710997,
"grad_norm": 0.1021864573752271,
"learning_rate": 3.929849769394733e-06,
"loss": 1.0668,
"step": 797
},
{
"epoch": 4.081841432225064,
"grad_norm": 0.10449100664299484,
"learning_rate": 3.887303636473232e-06,
"loss": 1.0411,
"step": 798
},
{
"epoch": 4.086956521739131,
"grad_norm": 0.10879423779843234,
"learning_rate": 3.844964264405735e-06,
"loss": 1.0223,
"step": 799
},
{
"epoch": 4.092071611253197,
"grad_norm": 0.09703196013660684,
"learning_rate": 3.802832196498272e-06,
"loss": 1.0285,
"step": 800
},
{
"epoch": 4.0971867007672635,
"grad_norm": 0.09959222058834097,
"learning_rate": 3.760907973396677e-06,
"loss": 1.0309,
"step": 801
},
{
"epoch": 4.10230179028133,
"grad_norm": 0.10177075731564252,
"learning_rate": 3.719192133079692e-06,
"loss": 1.0318,
"step": 802
},
{
"epoch": 4.107416879795396,
"grad_norm": 0.10333452571953791,
"learning_rate": 3.677685210852062e-06,
"loss": 1.0406,
"step": 803
},
{
"epoch": 4.112531969309463,
"grad_norm": 0.09844038300469107,
"learning_rate": 3.636387739337659e-06,
"loss": 1.0219,
"step": 804
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.09831929728264319,
"learning_rate": 3.5953002484726484e-06,
"loss": 1.0431,
"step": 805
},
{
"epoch": 4.122762148337596,
"grad_norm": 0.09989497009088748,
"learning_rate": 3.5544232654986744e-06,
"loss": 1.0276,
"step": 806
},
{
"epoch": 4.127877237851663,
"grad_norm": 0.09990437586089519,
"learning_rate": 3.513757314956128e-06,
"loss": 1.0428,
"step": 807
},
{
"epoch": 4.132992327365729,
"grad_norm": 0.09654332882972133,
"learning_rate": 3.4733029186773905e-06,
"loss": 1.0392,
"step": 808
},
{
"epoch": 4.138107416879795,
"grad_norm": 0.09700482096232281,
"learning_rate": 3.433060595780131e-06,
"loss": 1.0324,
"step": 809
},
{
"epoch": 4.143222506393862,
"grad_norm": 0.09733789290361096,
"learning_rate": 3.3930308626606733e-06,
"loss": 1.0337,
"step": 810
},
{
"epoch": 4.148337595907928,
"grad_norm": 0.09679936208456312,
"learning_rate": 3.3532142329873362e-06,
"loss": 1.0427,
"step": 811
},
{
"epoch": 4.153452685421995,
"grad_norm": 0.09850838614448618,
"learning_rate": 3.3136112176938774e-06,
"loss": 1.039,
"step": 812
},
{
"epoch": 4.158567774936062,
"grad_norm": 0.10014402864349467,
"learning_rate": 3.274222324972909e-06,
"loss": 1.0397,
"step": 813
},
{
"epoch": 4.163682864450128,
"grad_norm": 0.09846775188414943,
"learning_rate": 3.2350480602693813e-06,
"loss": 1.0438,
"step": 814
},
{
"epoch": 4.168797953964194,
"grad_norm": 0.09849530091060531,
"learning_rate": 3.196088926274108e-06,
"loss": 1.0487,
"step": 815
},
{
"epoch": 4.173913043478261,
"grad_norm": 0.09894112470378255,
"learning_rate": 3.1573454229173173e-06,
"loss": 1.0511,
"step": 816
},
{
"epoch": 4.179028132992327,
"grad_norm": 0.09585569782795421,
"learning_rate": 3.1188180473622045e-06,
"loss": 1.033,
"step": 817
},
{
"epoch": 4.1841432225063935,
"grad_norm": 0.0907063248106673,
"learning_rate": 3.080507293998598e-06,
"loss": 1.0641,
"step": 818
},
{
"epoch": 4.189258312020461,
"grad_norm": 0.10068952997359636,
"learning_rate": 3.0424136544365846e-06,
"loss": 1.041,
"step": 819
},
{
"epoch": 4.194373401534527,
"grad_norm": 0.09500920205889321,
"learning_rate": 3.0045376175002185e-06,
"loss": 1.0282,
"step": 820
},
{
"epoch": 4.1994884910485935,
"grad_norm": 0.09153412621935421,
"learning_rate": 2.9668796692212253e-06,
"loss": 1.0279,
"step": 821
},
{
"epoch": 4.20460358056266,
"grad_norm": 0.09475771204466948,
"learning_rate": 2.9294402928327815e-06,
"loss": 1.029,
"step": 822
},
{
"epoch": 4.209718670076726,
"grad_norm": 0.09846697389243914,
"learning_rate": 2.892219968763337e-06,
"loss": 1.0286,
"step": 823
},
{
"epoch": 4.2148337595907925,
"grad_norm": 0.09305701318883604,
"learning_rate": 2.8552191746303904e-06,
"loss": 1.0262,
"step": 824
},
{
"epoch": 4.21994884910486,
"grad_norm": 0.09124936813337096,
"learning_rate": 2.8184383852344212e-06,
"loss": 1.0185,
"step": 825
},
{
"epoch": 4.225063938618926,
"grad_norm": 0.10077071629270502,
"learning_rate": 2.7818780725527505e-06,
"loss": 1.0366,
"step": 826
},
{
"epoch": 4.2301790281329925,
"grad_norm": 0.09979949490236956,
"learning_rate": 2.745538705733519e-06,
"loss": 1.0383,
"step": 827
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.09761402809078823,
"learning_rate": 2.7094207510896574e-06,
"loss": 1.0508,
"step": 828
},
{
"epoch": 4.240409207161125,
"grad_norm": 0.0935296858882418,
"learning_rate": 2.673524672092873e-06,
"loss": 1.0405,
"step": 829
},
{
"epoch": 4.245524296675192,
"grad_norm": 0.09509323461150619,
"learning_rate": 2.6378509293677533e-06,
"loss": 1.0426,
"step": 830
},
{
"epoch": 4.250639386189258,
"grad_norm": 0.09606218354815171,
"learning_rate": 2.602399980685815e-06,
"loss": 1.0488,
"step": 831
},
{
"epoch": 4.255754475703325,
"grad_norm": 0.0985764043857404,
"learning_rate": 2.567172280959653e-06,
"loss": 1.0257,
"step": 832
},
{
"epoch": 4.260869565217392,
"grad_norm": 0.09298961777037151,
"learning_rate": 2.532168282237084e-06,
"loss": 1.0301,
"step": 833
},
{
"epoch": 4.265984654731458,
"grad_norm": 0.09501389616813173,
"learning_rate": 2.4973884336953512e-06,
"loss": 1.0158,
"step": 834
},
{
"epoch": 4.271099744245524,
"grad_norm": 0.10060659630523315,
"learning_rate": 2.462833181635391e-06,
"loss": 1.0402,
"step": 835
},
{
"epoch": 4.276214833759591,
"grad_norm": 0.08985813903795348,
"learning_rate": 2.4285029694760475e-06,
"loss": 1.0278,
"step": 836
},
{
"epoch": 4.281329923273657,
"grad_norm": 0.09047049245073066,
"learning_rate": 2.3943982377484364e-06,
"loss": 1.007,
"step": 837
},
{
"epoch": 4.286445012787723,
"grad_norm": 0.09542213115984054,
"learning_rate": 2.3605194240902575e-06,
"loss": 1.0349,
"step": 838
},
{
"epoch": 4.291560102301791,
"grad_norm": 0.09592894134973562,
"learning_rate": 2.3268669632401997e-06,
"loss": 1.0461,
"step": 839
},
{
"epoch": 4.296675191815857,
"grad_norm": 0.09276563388354862,
"learning_rate": 2.293441287032354e-06,
"loss": 1.0409,
"step": 840
},
{
"epoch": 4.301790281329923,
"grad_norm": 0.08621024713151011,
"learning_rate": 2.2602428243906638e-06,
"loss": 1.0458,
"step": 841
},
{
"epoch": 4.30690537084399,
"grad_norm": 0.0947853691754098,
"learning_rate": 2.2272720013234372e-06,
"loss": 1.0597,
"step": 842
},
{
"epoch": 4.312020460358056,
"grad_norm": 0.09738935216253661,
"learning_rate": 2.1945292409178755e-06,
"loss": 1.0354,
"step": 843
},
{
"epoch": 4.3171355498721224,
"grad_norm": 0.08975381339730756,
"learning_rate": 2.162014963334631e-06,
"loss": 1.0121,
"step": 844
},
{
"epoch": 4.322250639386189,
"grad_norm": 0.08820100067894239,
"learning_rate": 2.1297295858024313e-06,
"loss": 1.0514,
"step": 845
},
{
"epoch": 4.327365728900256,
"grad_norm": 0.09617468579113231,
"learning_rate": 2.097673522612722e-06,
"loss": 1.0285,
"step": 846
},
{
"epoch": 4.332480818414322,
"grad_norm": 0.09593751503684586,
"learning_rate": 2.0658471851143513e-06,
"loss": 1.039,
"step": 847
},
{
"epoch": 4.337595907928389,
"grad_norm": 0.09444286356238825,
"learning_rate": 2.0342509817082747e-06,
"loss": 1.0414,
"step": 848
},
{
"epoch": 4.342710997442455,
"grad_norm": 0.09101012732573761,
"learning_rate": 2.0028853178423356e-06,
"loss": 1.0177,
"step": 849
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.09325815402802133,
"learning_rate": 1.971750596006059e-06,
"loss": 1.0409,
"step": 850
},
{
"epoch": 4.352941176470588,
"grad_norm": 0.08872400279590349,
"learning_rate": 1.9408472157254765e-06,
"loss": 1.0268,
"step": 851
},
{
"epoch": 4.358056265984655,
"grad_norm": 0.0885901270770776,
"learning_rate": 1.9101755735580128e-06,
"loss": 1.044,
"step": 852
},
{
"epoch": 4.3631713554987215,
"grad_norm": 0.0949449985202502,
"learning_rate": 1.8797360630873806e-06,
"loss": 1.0452,
"step": 853
},
{
"epoch": 4.368286445012788,
"grad_norm": 0.08410055531702589,
"learning_rate": 1.8495290749185435e-06,
"loss": 1.0325,
"step": 854
},
{
"epoch": 4.373401534526854,
"grad_norm": 0.09545532227569593,
"learning_rate": 1.8195549966727054e-06,
"loss": 1.0481,
"step": 855
},
{
"epoch": 4.378516624040921,
"grad_norm": 0.08476459696114018,
"learning_rate": 1.7898142129823171e-06,
"loss": 1.0639,
"step": 856
},
{
"epoch": 4.383631713554987,
"grad_norm": 0.08949442544964742,
"learning_rate": 1.7603071054861653e-06,
"loss": 1.0371,
"step": 857
},
{
"epoch": 4.388746803069053,
"grad_norm": 0.08879874215911474,
"learning_rate": 1.7310340528244607e-06,
"loss": 1.0405,
"step": 858
},
{
"epoch": 4.3938618925831205,
"grad_norm": 0.08840226801940593,
"learning_rate": 1.701995430633987e-06,
"loss": 1.0422,
"step": 859
},
{
"epoch": 4.398976982097187,
"grad_norm": 0.08626387602238281,
"learning_rate": 1.6731916115432678e-06,
"loss": 1.0362,
"step": 860
},
{
"epoch": 4.404092071611253,
"grad_norm": 0.0883006081970065,
"learning_rate": 1.6446229651677903e-06,
"loss": 1.0594,
"step": 861
},
{
"epoch": 4.40920716112532,
"grad_norm": 0.08763882002142795,
"learning_rate": 1.6162898581052866e-06,
"loss": 1.0402,
"step": 862
},
{
"epoch": 4.414322250639386,
"grad_norm": 0.08473283566109852,
"learning_rate": 1.5881926539309845e-06,
"loss": 1.0369,
"step": 863
},
{
"epoch": 4.419437340153452,
"grad_norm": 0.08882688131234197,
"learning_rate": 1.560331713192984e-06,
"loss": 1.0268,
"step": 864
},
{
"epoch": 4.42455242966752,
"grad_norm": 0.0919916289952921,
"learning_rate": 1.5327073934075954e-06,
"loss": 1.0548,
"step": 865
},
{
"epoch": 4.429667519181586,
"grad_norm": 0.09199096614028365,
"learning_rate": 1.5053200490547838e-06,
"loss": 1.0138,
"step": 866
},
{
"epoch": 4.434782608695652,
"grad_norm": 0.0871055821541935,
"learning_rate": 1.4781700315736002e-06,
"loss": 1.0355,
"step": 867
},
{
"epoch": 4.439897698209719,
"grad_norm": 0.08976011928625406,
"learning_rate": 1.4512576893576725e-06,
"loss": 1.0591,
"step": 868
},
{
"epoch": 4.445012787723785,
"grad_norm": 0.08644290864743312,
"learning_rate": 1.4245833677507448e-06,
"loss": 1.0304,
"step": 869
},
{
"epoch": 4.450127877237851,
"grad_norm": 0.08435192281249178,
"learning_rate": 1.3981474090422408e-06,
"loss": 1.0279,
"step": 870
},
{
"epoch": 4.455242966751918,
"grad_norm": 0.0846195173709492,
"learning_rate": 1.3719501524628643e-06,
"loss": 1.0472,
"step": 871
},
{
"epoch": 4.460358056265985,
"grad_norm": 0.08696234341868504,
"learning_rate": 1.3459919341802618e-06,
"loss": 1.0293,
"step": 872
},
{
"epoch": 4.465473145780051,
"grad_norm": 0.08749456032856028,
"learning_rate": 1.3202730872946878e-06,
"loss": 1.0424,
"step": 873
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.08679996505251306,
"learning_rate": 1.2947939418347599e-06,
"loss": 1.0337,
"step": 874
},
{
"epoch": 4.475703324808184,
"grad_norm": 0.08763262493448441,
"learning_rate": 1.269554824753192e-06,
"loss": 1.053,
"step": 875
},
{
"epoch": 4.4808184143222505,
"grad_norm": 0.08632279952905321,
"learning_rate": 1.2445560599226148e-06,
"loss": 1.0353,
"step": 876
},
{
"epoch": 4.485933503836317,
"grad_norm": 0.08740638922400544,
"learning_rate": 1.219797968131422e-06,
"loss": 1.0573,
"step": 877
},
{
"epoch": 4.491048593350383,
"grad_norm": 0.08830677643420132,
"learning_rate": 1.1952808670796511e-06,
"loss": 1.0372,
"step": 878
},
{
"epoch": 4.4961636828644505,
"grad_norm": 0.08336534008550968,
"learning_rate": 1.1710050713749067e-06,
"loss": 1.0335,
"step": 879
},
{
"epoch": 4.501278772378517,
"grad_norm": 0.089103955480962,
"learning_rate": 1.1469708925283095e-06,
"loss": 1.0326,
"step": 880
},
{
"epoch": 4.506393861892583,
"grad_norm": 0.08776482694311091,
"learning_rate": 1.123178638950526e-06,
"loss": 1.0575,
"step": 881
},
{
"epoch": 4.5115089514066495,
"grad_norm": 0.08866769991349784,
"learning_rate": 1.0996286159477943e-06,
"loss": 1.0354,
"step": 882
},
{
"epoch": 4.516624040920716,
"grad_norm": 0.08687727068648551,
"learning_rate": 1.0763211257180007e-06,
"loss": 1.0327,
"step": 883
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.0885626504483757,
"learning_rate": 1.053256467346817e-06,
"loss": 1.0566,
"step": 884
},
{
"epoch": 4.526854219948849,
"grad_norm": 0.08688698614964542,
"learning_rate": 1.030434936803857e-06,
"loss": 1.0262,
"step": 885
},
{
"epoch": 4.531969309462916,
"grad_norm": 0.08874324934826401,
"learning_rate": 1.0078568269388666e-06,
"loss": 1.0355,
"step": 886
},
{
"epoch": 4.537084398976982,
"grad_norm": 0.09066610304373438,
"learning_rate": 9.855224274779894e-07,
"loss": 1.0242,
"step": 887
},
{
"epoch": 4.542199488491049,
"grad_norm": 0.0899703471767722,
"learning_rate": 9.634320250200213e-07,
"loss": 1.0222,
"step": 888
},
{
"epoch": 4.547314578005115,
"grad_norm": 0.08777620122531764,
"learning_rate": 9.415859030327667e-07,
"loss": 1.049,
"step": 889
},
{
"epoch": 4.552429667519181,
"grad_norm": 0.08862100817321585,
"learning_rate": 9.199843418493625e-07,
"loss": 1.033,
"step": 890
},
{
"epoch": 4.557544757033249,
"grad_norm": 0.0924502407146251,
"learning_rate": 8.986276186647092e-07,
"loss": 1.0474,
"step": 891
},
{
"epoch": 4.562659846547315,
"grad_norm": 0.08475994721795894,
"learning_rate": 8.775160075319001e-07,
"loss": 1.0504,
"step": 892
},
{
"epoch": 4.567774936061381,
"grad_norm": 0.08839104158168999,
"learning_rate": 8.566497793587158e-07,
"loss": 1.0164,
"step": 893
},
{
"epoch": 4.572890025575448,
"grad_norm": 0.08678786116042993,
"learning_rate": 8.360292019041405e-07,
"loss": 1.0253,
"step": 894
},
{
"epoch": 4.578005115089514,
"grad_norm": 0.09067943165736797,
"learning_rate": 8.156545397749127e-07,
"loss": 1.0578,
"step": 895
},
{
"epoch": 4.58312020460358,
"grad_norm": 0.09064381529233775,
"learning_rate": 7.955260544221621e-07,
"loss": 1.0363,
"step": 896
},
{
"epoch": 4.588235294117647,
"grad_norm": 0.08444296552020542,
"learning_rate": 7.756440041380297e-07,
"loss": 1.0339,
"step": 897
},
{
"epoch": 4.593350383631714,
"grad_norm": 0.0890656704511778,
"learning_rate": 7.560086440523528e-07,
"loss": 1.0268,
"step": 898
},
{
"epoch": 4.59846547314578,
"grad_norm": 0.08823096230866823,
"learning_rate": 7.366202261294098e-07,
"loss": 1.0295,
"step": 899
},
{
"epoch": 4.603580562659847,
"grad_norm": 0.08543654183534846,
"learning_rate": 7.174789991646602e-07,
"loss": 1.0403,
"step": 900
},
{
"epoch": 4.608695652173913,
"grad_norm": 0.08651006361043005,
"learning_rate": 6.985852087815903e-07,
"loss": 1.0107,
"step": 901
},
{
"epoch": 4.6138107416879794,
"grad_norm": 0.0893115824376349,
"learning_rate": 6.799390974285169e-07,
"loss": 1.0148,
"step": 902
},
{
"epoch": 4.618925831202046,
"grad_norm": 0.0905211238552103,
"learning_rate": 6.615409043755039e-07,
"loss": 1.0235,
"step": 903
},
{
"epoch": 4.624040920716112,
"grad_norm": 0.08373868267565032,
"learning_rate": 6.433908657112886e-07,
"loss": 1.0583,
"step": 904
},
{
"epoch": 4.629156010230179,
"grad_norm": 0.08517234012511272,
"learning_rate": 6.254892143402469e-07,
"loss": 1.0428,
"step": 905
},
{
"epoch": 4.634271099744246,
"grad_norm": 0.0829556502483681,
"learning_rate": 6.078361799794086e-07,
"loss": 1.0495,
"step": 906
},
{
"epoch": 4.639386189258312,
"grad_norm": 0.08390852555662921,
"learning_rate": 5.904319891555021e-07,
"loss": 1.0353,
"step": 907
},
{
"epoch": 4.6445012787723785,
"grad_norm": 0.08446045440701189,
"learning_rate": 5.732768652020615e-07,
"loss": 1.0622,
"step": 908
},
{
"epoch": 4.649616368286445,
"grad_norm": 0.09204361483711933,
"learning_rate": 5.563710282565504e-07,
"loss": 1.0531,
"step": 909
},
{
"epoch": 4.654731457800511,
"grad_norm": 0.08394901633656762,
"learning_rate": 5.397146952575316e-07,
"loss": 1.0171,
"step": 910
},
{
"epoch": 4.659846547314578,
"grad_norm": 0.08343494614241122,
"learning_rate": 5.233080799418999e-07,
"loss": 1.0396,
"step": 911
},
{
"epoch": 4.664961636828645,
"grad_norm": 0.07955675153850743,
"learning_rate": 5.071513928421268e-07,
"loss": 1.0189,
"step": 912
},
{
"epoch": 4.670076726342711,
"grad_norm": 0.08101944215370342,
"learning_rate": 4.912448412835625e-07,
"loss": 1.0258,
"step": 913
},
{
"epoch": 4.675191815856778,
"grad_norm": 0.08331490147990027,
"learning_rate": 4.7558862938177796e-07,
"loss": 1.0217,
"step": 914
},
{
"epoch": 4.680306905370844,
"grad_norm": 0.08702803562906056,
"learning_rate": 4.601829580399364e-07,
"loss": 1.0396,
"step": 915
},
{
"epoch": 4.68542199488491,
"grad_norm": 0.0872058124086669,
"learning_rate": 4.4502802494623023e-07,
"loss": 1.0455,
"step": 916
},
{
"epoch": 4.690537084398977,
"grad_norm": 0.08338361238231247,
"learning_rate": 4.301240245713345e-07,
"loss": 1.0562,
"step": 917
},
{
"epoch": 4.695652173913043,
"grad_norm": 0.0851827498369095,
"learning_rate": 4.1547114816590684e-07,
"loss": 1.0537,
"step": 918
},
{
"epoch": 4.70076726342711,
"grad_norm": 0.08046194701441498,
"learning_rate": 4.010695837581446e-07,
"loss": 1.0238,
"step": 919
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.08281618992470802,
"learning_rate": 3.8691951615136946e-07,
"loss": 1.0356,
"step": 920
},
{
"epoch": 4.710997442455243,
"grad_norm": 0.08283580448405972,
"learning_rate": 3.730211269216488e-07,
"loss": 1.0239,
"step": 921
},
{
"epoch": 4.716112531969309,
"grad_norm": 0.08087364868453263,
"learning_rate": 3.593745944154692e-07,
"loss": 1.0363,
"step": 922
},
{
"epoch": 4.721227621483376,
"grad_norm": 0.08333792792360416,
"learning_rate": 3.459800937474533e-07,
"loss": 1.0377,
"step": 923
},
{
"epoch": 4.726342710997442,
"grad_norm": 0.08198078335380529,
"learning_rate": 3.328377967981089e-07,
"loss": 1.0525,
"step": 924
},
{
"epoch": 4.731457800511509,
"grad_norm": 0.08207405352725915,
"learning_rate": 3.1994787221161674e-07,
"loss": 1.0487,
"step": 925
},
{
"epoch": 4.736572890025576,
"grad_norm": 0.08127709502910237,
"learning_rate": 3.0731048539367924e-07,
"loss": 1.0317,
"step": 926
},
{
"epoch": 4.741687979539642,
"grad_norm": 0.08182420357301565,
"learning_rate": 2.949257985093845e-07,
"loss": 1.0645,
"step": 927
},
{
"epoch": 4.746803069053708,
"grad_norm": 0.08540342640125807,
"learning_rate": 2.827939704811433e-07,
"loss": 1.0225,
"step": 928
},
{
"epoch": 4.751918158567775,
"grad_norm": 0.08105962695785912,
"learning_rate": 2.7091515698662863e-07,
"loss": 1.0403,
"step": 929
},
{
"epoch": 4.757033248081841,
"grad_norm": 0.08316360123978976,
"learning_rate": 2.592895104567861e-07,
"loss": 1.0377,
"step": 930
},
{
"epoch": 4.762148337595908,
"grad_norm": 0.08916967768404561,
"learning_rate": 2.479171800738911e-07,
"loss": 1.0432,
"step": 931
},
{
"epoch": 4.767263427109975,
"grad_norm": 0.08337042713060398,
"learning_rate": 2.3679831176961487e-07,
"loss": 1.0586,
"step": 932
},
{
"epoch": 4.772378516624041,
"grad_norm": 0.08366883341035822,
"learning_rate": 2.2593304822316365e-07,
"loss": 1.0398,
"step": 933
},
{
"epoch": 4.7774936061381075,
"grad_norm": 0.08119050050082849,
"learning_rate": 2.153215288594379e-07,
"loss": 1.0269,
"step": 934
},
{
"epoch": 4.782608695652174,
"grad_norm": 0.0825222553646159,
"learning_rate": 2.0496388984726056e-07,
"loss": 1.0378,
"step": 935
},
{
"epoch": 4.78772378516624,
"grad_norm": 0.08164754045193766,
"learning_rate": 1.9486026409761162e-07,
"loss": 1.0507,
"step": 936
},
{
"epoch": 4.792838874680307,
"grad_norm": 0.08069756368018306,
"learning_rate": 1.8501078126193172e-07,
"loss": 1.0141,
"step": 937
},
{
"epoch": 4.797953964194374,
"grad_norm": 0.08172592463148753,
"learning_rate": 1.7541556773045255e-07,
"loss": 1.024,
"step": 938
},
{
"epoch": 4.80306905370844,
"grad_norm": 0.08172502645150909,
"learning_rate": 1.6607474663058677e-07,
"loss": 1.0431,
"step": 939
},
{
"epoch": 4.8081841432225065,
"grad_norm": 0.08077050134165903,
"learning_rate": 1.569884378253317e-07,
"loss": 1.0179,
"step": 940
},
{
"epoch": 4.813299232736573,
"grad_norm": 0.08523300001438115,
"learning_rate": 1.4815675791175043e-07,
"loss": 1.0318,
"step": 941
},
{
"epoch": 4.818414322250639,
"grad_norm": 0.08046827663058376,
"learning_rate": 1.3957982021945093e-07,
"loss": 1.0185,
"step": 942
},
{
"epoch": 4.823529411764706,
"grad_norm": 0.0826399800178752,
"learning_rate": 1.3125773480915592e-07,
"loss": 1.0364,
"step": 943
},
{
"epoch": 4.828644501278772,
"grad_norm": 0.08143311184926899,
"learning_rate": 1.2319060847127972e-07,
"loss": 1.0441,
"step": 944
},
{
"epoch": 4.833759590792839,
"grad_norm": 0.0812111566830204,
"learning_rate": 1.1537854472455368e-07,
"loss": 1.0194,
"step": 945
},
{
"epoch": 4.838874680306906,
"grad_norm": 0.08087505193021655,
"learning_rate": 1.0782164381470506e-07,
"loss": 1.0272,
"step": 946
},
{
"epoch": 4.843989769820972,
"grad_norm": 0.08202686030913739,
"learning_rate": 1.0052000271317142e-07,
"loss": 1.0137,
"step": 947
},
{
"epoch": 4.849104859335038,
"grad_norm": 0.08725113377075328,
"learning_rate": 9.347371511585046e-08,
"loss": 1.0444,
"step": 948
},
{
"epoch": 4.854219948849105,
"grad_norm": 0.08074935681692166,
"learning_rate": 8.66828714418988e-08,
"loss": 1.0367,
"step": 949
},
{
"epoch": 4.859335038363171,
"grad_norm": 0.0833488513811118,
"learning_rate": 8.014755883257508e-08,
"loss": 1.0484,
"step": 950
},
{
"epoch": 4.864450127877237,
"grad_norm": 0.0811386973621194,
"learning_rate": 7.386786115011868e-08,
"loss": 1.0298,
"step": 951
},
{
"epoch": 4.869565217391305,
"grad_norm": 0.08217471436638513,
"learning_rate": 6.784385897667723e-08,
"loss": 1.0481,
"step": 952
},
{
"epoch": 4.874680306905371,
"grad_norm": 0.08218495988946739,
"learning_rate": 6.207562961326963e-08,
"loss": 1.0419,
"step": 953
},
{
"epoch": 4.879795396419437,
"grad_norm": 0.08060090907067169,
"learning_rate": 5.656324707879357e-08,
"loss": 1.0535,
"step": 954
},
{
"epoch": 4.884910485933504,
"grad_norm": 0.08129135659128915,
"learning_rate": 5.130678210907514e-08,
"loss": 1.0504,
"step": 955
},
{
"epoch": 4.89002557544757,
"grad_norm": 0.08562145837052391,
"learning_rate": 4.630630215596732e-08,
"loss": 1.0332,
"step": 956
},
{
"epoch": 4.8951406649616365,
"grad_norm": 0.08242537780405768,
"learning_rate": 4.156187138647516e-08,
"loss": 1.0429,
"step": 957
},
{
"epoch": 4.900255754475703,
"grad_norm": 0.07982429118069245,
"learning_rate": 3.707355068194085e-08,
"loss": 1.0497,
"step": 958
},
{
"epoch": 4.90537084398977,
"grad_norm": 0.08337322755077417,
"learning_rate": 3.284139763725769e-08,
"loss": 1.0039,
"step": 959
},
{
"epoch": 4.910485933503836,
"grad_norm": 0.08052583915874226,
"learning_rate": 2.8865466560130673e-08,
"loss": 1.0475,
"step": 960
},
{
"epoch": 4.915601023017903,
"grad_norm": 0.08011637722476794,
"learning_rate": 2.5145808470383727e-08,
"loss": 1.0436,
"step": 961
},
{
"epoch": 4.920716112531969,
"grad_norm": 0.07954539736607873,
"learning_rate": 2.1682471099297996e-08,
"loss": 1.0232,
"step": 962
},
{
"epoch": 4.9258312020460355,
"grad_norm": 0.08203718970751432,
"learning_rate": 1.8475498889010125e-08,
"loss": 1.0611,
"step": 963
},
{
"epoch": 4.930946291560103,
"grad_norm": 0.08091342862475163,
"learning_rate": 1.5524932991928253e-08,
"loss": 1.0421,
"step": 964
},
{
"epoch": 4.936061381074169,
"grad_norm": 0.08209037571378622,
"learning_rate": 1.2830811270214682e-08,
"loss": 1.0559,
"step": 965
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.08253421197403137,
"learning_rate": 1.0393168295299571e-08,
"loss": 1.0556,
"step": 966
},
{
"epoch": 4.946291560102302,
"grad_norm": 0.08010862800679533,
"learning_rate": 8.212035347427983e-09,
"loss": 1.0281,
"step": 967
},
{
"epoch": 4.951406649616368,
"grad_norm": 0.08063925174776879,
"learning_rate": 6.287440415271295e-09,
"loss": 1.0279,
"step": 968
},
{
"epoch": 4.956521739130435,
"grad_norm": 0.08061660864354202,
"learning_rate": 4.619408195554176e-09,
"loss": 1.0224,
"step": 969
},
{
"epoch": 4.961636828644501,
"grad_norm": 2.467052842174403,
"learning_rate": 3.207960092752593e-09,
"loss": 1.0545,
"step": 970
},
{
"epoch": 4.966751918158568,
"grad_norm": 0.08327486179105495,
"learning_rate": 2.0531142188073837e-09,
"loss": 1.0395,
"step": 971
},
{
"epoch": 4.971867007672635,
"grad_norm": 0.07879283157932467,
"learning_rate": 1.154885392895544e-09,
"loss": 1.0258,
"step": 972
},
{
"epoch": 4.976982097186701,
"grad_norm": 0.08591637879884984,
"learning_rate": 5.132851412437135e-10,
"loss": 1.0512,
"step": 973
},
{
"epoch": 4.982097186700767,
"grad_norm": 0.07945597108084437,
"learning_rate": 1.283216969727441e-10,
"loss": 1.026,
"step": 974
},
{
"epoch": 4.987212276214834,
"grad_norm": 0.0809414158911861,
"learning_rate": 0.0,
"loss": 1.0639,
"step": 975
},
{
"epoch": 4.987212276214834,
"step": 975,
"total_flos": 7100358600687616.0,
"train_loss": 1.0860105241261995,
"train_runtime": 35157.9071,
"train_samples_per_second": 14.222,
"train_steps_per_second": 0.028
}
],
"logging_steps": 1.0,
"max_steps": 975,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7100358600687616.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}