Files
openthoughts3_100k_qwen25_1…/trainer_state.json
ModelHub XC 84f44cd484 初始化项目,由ModelHub XC社区提供模型
Model: mlfoundations-dev/openthoughts3_100k_qwen25_1b_bsz256_lr16e5_epochs5
Source: Original Platform
2026-06-16 07:03:13 +08:00

13728 lines
332 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025575447570332483,
"grad_norm": 2.9563186457899664,
"learning_rate": 8.163265306122449e-07,
"loss": 1.5213,
"step": 1
},
{
"epoch": 0.005115089514066497,
"grad_norm": 2.9570403686006705,
"learning_rate": 1.6326530612244897e-06,
"loss": 1.4742,
"step": 2
},
{
"epoch": 0.0076726342710997444,
"grad_norm": 3.010165733072805,
"learning_rate": 2.4489795918367347e-06,
"loss": 1.4946,
"step": 3
},
{
"epoch": 0.010230179028132993,
"grad_norm": 2.868210172096221,
"learning_rate": 3.2653061224489794e-06,
"loss": 1.482,
"step": 4
},
{
"epoch": 0.01278772378516624,
"grad_norm": 2.6518374719836477,
"learning_rate": 4.081632653061225e-06,
"loss": 1.4866,
"step": 5
},
{
"epoch": 0.015345268542199489,
"grad_norm": 2.0719187075670176,
"learning_rate": 4.897959183673469e-06,
"loss": 1.4844,
"step": 6
},
{
"epoch": 0.017902813299232736,
"grad_norm": 1.8691931463314044,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.4533,
"step": 7
},
{
"epoch": 0.020460358056265986,
"grad_norm": 1.8092896927352589,
"learning_rate": 6.530612244897959e-06,
"loss": 1.4433,
"step": 8
},
{
"epoch": 0.023017902813299233,
"grad_norm": 1.7543993002445608,
"learning_rate": 7.346938775510205e-06,
"loss": 1.4744,
"step": 9
},
{
"epoch": 0.02557544757033248,
"grad_norm": 1.6606628173638305,
"learning_rate": 8.16326530612245e-06,
"loss": 1.4586,
"step": 10
},
{
"epoch": 0.028132992327365727,
"grad_norm": 2.197985553952399,
"learning_rate": 8.979591836734695e-06,
"loss": 1.4315,
"step": 11
},
{
"epoch": 0.030690537084398978,
"grad_norm": 2.096672912966444,
"learning_rate": 9.795918367346939e-06,
"loss": 1.3907,
"step": 12
},
{
"epoch": 0.03324808184143223,
"grad_norm": 1.7669816182231157,
"learning_rate": 1.0612244897959186e-05,
"loss": 1.4234,
"step": 13
},
{
"epoch": 0.03580562659846547,
"grad_norm": 1.3020764177290665,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.3478,
"step": 14
},
{
"epoch": 0.03836317135549872,
"grad_norm": 1.2917276833945952,
"learning_rate": 1.2244897959183674e-05,
"loss": 1.378,
"step": 15
},
{
"epoch": 0.04092071611253197,
"grad_norm": 0.9647900041095249,
"learning_rate": 1.3061224489795918e-05,
"loss": 1.3273,
"step": 16
},
{
"epoch": 0.043478260869565216,
"grad_norm": 0.998986811649884,
"learning_rate": 1.3877551020408165e-05,
"loss": 1.3424,
"step": 17
},
{
"epoch": 0.04603580562659847,
"grad_norm": 0.8293785359427173,
"learning_rate": 1.469387755102041e-05,
"loss": 1.3354,
"step": 18
},
{
"epoch": 0.04859335038363171,
"grad_norm": 0.7442208693017255,
"learning_rate": 1.5510204081632655e-05,
"loss": 1.3216,
"step": 19
},
{
"epoch": 0.05115089514066496,
"grad_norm": 0.8334097235660463,
"learning_rate": 1.63265306122449e-05,
"loss": 1.305,
"step": 20
},
{
"epoch": 0.05370843989769821,
"grad_norm": 0.7133053870238929,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.2863,
"step": 21
},
{
"epoch": 0.056265984654731455,
"grad_norm": 0.5994613004850937,
"learning_rate": 1.795918367346939e-05,
"loss": 1.31,
"step": 22
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.6168319603979278,
"learning_rate": 1.8775510204081636e-05,
"loss": 1.2652,
"step": 23
},
{
"epoch": 0.061381074168797956,
"grad_norm": 0.5934674503101482,
"learning_rate": 1.9591836734693877e-05,
"loss": 1.2848,
"step": 24
},
{
"epoch": 0.0639386189258312,
"grad_norm": 0.5809141308410171,
"learning_rate": 2.0408163265306126e-05,
"loss": 1.2605,
"step": 25
},
{
"epoch": 0.06649616368286446,
"grad_norm": 0.5544963829723922,
"learning_rate": 2.122448979591837e-05,
"loss": 1.2663,
"step": 26
},
{
"epoch": 0.06905370843989769,
"grad_norm": 0.5109525040926751,
"learning_rate": 2.2040816326530613e-05,
"loss": 1.2493,
"step": 27
},
{
"epoch": 0.07161125319693094,
"grad_norm": 0.47071086900075043,
"learning_rate": 2.2857142857142858e-05,
"loss": 1.2725,
"step": 28
},
{
"epoch": 0.0741687979539642,
"grad_norm": 0.47760033429842697,
"learning_rate": 2.3673469387755103e-05,
"loss": 1.2493,
"step": 29
},
{
"epoch": 0.07672634271099744,
"grad_norm": 0.47942640683455684,
"learning_rate": 2.448979591836735e-05,
"loss": 1.2635,
"step": 30
},
{
"epoch": 0.0792838874680307,
"grad_norm": 0.3817784984018378,
"learning_rate": 2.5306122448979597e-05,
"loss": 1.2581,
"step": 31
},
{
"epoch": 0.08184143222506395,
"grad_norm": 0.41863028873772656,
"learning_rate": 2.6122448979591835e-05,
"loss": 1.2319,
"step": 32
},
{
"epoch": 0.08439897698209718,
"grad_norm": 0.4561646749370822,
"learning_rate": 2.6938775510204084e-05,
"loss": 1.2647,
"step": 33
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.32944852639889954,
"learning_rate": 2.775510204081633e-05,
"loss": 1.2828,
"step": 34
},
{
"epoch": 0.08951406649616368,
"grad_norm": 0.36090683632534276,
"learning_rate": 2.8571428571428574e-05,
"loss": 1.2245,
"step": 35
},
{
"epoch": 0.09207161125319693,
"grad_norm": 0.36952861081098753,
"learning_rate": 2.938775510204082e-05,
"loss": 1.2383,
"step": 36
},
{
"epoch": 0.09462915601023018,
"grad_norm": 0.39714992118388376,
"learning_rate": 3.020408163265306e-05,
"loss": 1.2524,
"step": 37
},
{
"epoch": 0.09718670076726342,
"grad_norm": 0.3567290279003148,
"learning_rate": 3.102040816326531e-05,
"loss": 1.229,
"step": 38
},
{
"epoch": 0.09974424552429667,
"grad_norm": 0.3806643799838351,
"learning_rate": 3.183673469387755e-05,
"loss": 1.2438,
"step": 39
},
{
"epoch": 0.10230179028132992,
"grad_norm": 0.407422548294049,
"learning_rate": 3.26530612244898e-05,
"loss": 1.1862,
"step": 40
},
{
"epoch": 0.10485933503836317,
"grad_norm": 0.34463209168828013,
"learning_rate": 3.346938775510204e-05,
"loss": 1.2127,
"step": 41
},
{
"epoch": 0.10741687979539642,
"grad_norm": 0.36477387999624367,
"learning_rate": 3.4285714285714284e-05,
"loss": 1.2118,
"step": 42
},
{
"epoch": 0.10997442455242967,
"grad_norm": 0.33681318596769666,
"learning_rate": 3.510204081632653e-05,
"loss": 1.1849,
"step": 43
},
{
"epoch": 0.11253196930946291,
"grad_norm": 0.3683055012446813,
"learning_rate": 3.591836734693878e-05,
"loss": 1.1965,
"step": 44
},
{
"epoch": 0.11508951406649616,
"grad_norm": 0.3236097196989051,
"learning_rate": 3.673469387755102e-05,
"loss": 1.1973,
"step": 45
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.45336744047964317,
"learning_rate": 3.755102040816327e-05,
"loss": 1.219,
"step": 46
},
{
"epoch": 0.12020460358056266,
"grad_norm": 0.6485049911187234,
"learning_rate": 3.836734693877551e-05,
"loss": 1.22,
"step": 47
},
{
"epoch": 0.12276214833759591,
"grad_norm": 0.7308887737693851,
"learning_rate": 3.9183673469387755e-05,
"loss": 1.1927,
"step": 48
},
{
"epoch": 0.12531969309462915,
"grad_norm": 0.7412779741523179,
"learning_rate": 4e-05,
"loss": 1.207,
"step": 49
},
{
"epoch": 0.1278772378516624,
"grad_norm": 0.61907561491782,
"learning_rate": 4.081632653061225e-05,
"loss": 1.1761,
"step": 50
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.5645180027937694,
"learning_rate": 4.1632653061224494e-05,
"loss": 1.1828,
"step": 51
},
{
"epoch": 0.1329923273657289,
"grad_norm": 0.6097938476878244,
"learning_rate": 4.244897959183674e-05,
"loss": 1.1645,
"step": 52
},
{
"epoch": 0.13554987212276215,
"grad_norm": 0.68105585214221,
"learning_rate": 4.3265306122448984e-05,
"loss": 1.1663,
"step": 53
},
{
"epoch": 0.13810741687979539,
"grad_norm": 0.5148592364190684,
"learning_rate": 4.4081632653061226e-05,
"loss": 1.2013,
"step": 54
},
{
"epoch": 0.14066496163682865,
"grad_norm": 0.6290537917728678,
"learning_rate": 4.489795918367347e-05,
"loss": 1.2142,
"step": 55
},
{
"epoch": 0.1432225063938619,
"grad_norm": 0.8770682994258979,
"learning_rate": 4.5714285714285716e-05,
"loss": 1.2031,
"step": 56
},
{
"epoch": 0.14578005115089515,
"grad_norm": 1.211521452597314,
"learning_rate": 4.6530612244897965e-05,
"loss": 1.1872,
"step": 57
},
{
"epoch": 0.1483375959079284,
"grad_norm": 1.1706192692433377,
"learning_rate": 4.7346938775510206e-05,
"loss": 1.2026,
"step": 58
},
{
"epoch": 0.15089514066496162,
"grad_norm": 1.0347528096815952,
"learning_rate": 4.8163265306122455e-05,
"loss": 1.1698,
"step": 59
},
{
"epoch": 0.1534526854219949,
"grad_norm": 0.8917967843832559,
"learning_rate": 4.89795918367347e-05,
"loss": 1.1935,
"step": 60
},
{
"epoch": 0.15601023017902813,
"grad_norm": 0.8447536110052303,
"learning_rate": 4.9795918367346945e-05,
"loss": 1.1816,
"step": 61
},
{
"epoch": 0.1585677749360614,
"grad_norm": 1.0808910383761972,
"learning_rate": 5.0612244897959194e-05,
"loss": 1.2148,
"step": 62
},
{
"epoch": 0.16112531969309463,
"grad_norm": 1.0232789536513451,
"learning_rate": 5.1428571428571436e-05,
"loss": 1.1974,
"step": 63
},
{
"epoch": 0.1636828644501279,
"grad_norm": 0.870169282881004,
"learning_rate": 5.224489795918367e-05,
"loss": 1.1914,
"step": 64
},
{
"epoch": 0.16624040920716113,
"grad_norm": 0.7292663989493176,
"learning_rate": 5.306122448979592e-05,
"loss": 1.183,
"step": 65
},
{
"epoch": 0.16879795396419436,
"grad_norm": 0.8315009268144099,
"learning_rate": 5.387755102040817e-05,
"loss": 1.1457,
"step": 66
},
{
"epoch": 0.17135549872122763,
"grad_norm": 1.04261775715331,
"learning_rate": 5.469387755102041e-05,
"loss": 1.1724,
"step": 67
},
{
"epoch": 0.17391304347826086,
"grad_norm": 1.0040970248925822,
"learning_rate": 5.551020408163266e-05,
"loss": 1.1469,
"step": 68
},
{
"epoch": 0.17647058823529413,
"grad_norm": 1.0399514999943609,
"learning_rate": 5.63265306122449e-05,
"loss": 1.1335,
"step": 69
},
{
"epoch": 0.17902813299232737,
"grad_norm": 0.9541534570834667,
"learning_rate": 5.714285714285715e-05,
"loss": 1.1475,
"step": 70
},
{
"epoch": 0.1815856777493606,
"grad_norm": 1.155886502502828,
"learning_rate": 5.79591836734694e-05,
"loss": 1.1595,
"step": 71
},
{
"epoch": 0.18414322250639387,
"grad_norm": 1.4920355778823207,
"learning_rate": 5.877551020408164e-05,
"loss": 1.1764,
"step": 72
},
{
"epoch": 0.1867007672634271,
"grad_norm": 0.8392580472572768,
"learning_rate": 5.959183673469389e-05,
"loss": 1.2046,
"step": 73
},
{
"epoch": 0.18925831202046037,
"grad_norm": 1.3327976055634758,
"learning_rate": 6.040816326530612e-05,
"loss": 1.1601,
"step": 74
},
{
"epoch": 0.1918158567774936,
"grad_norm": 1.2349989957797203,
"learning_rate": 6.122448979591836e-05,
"loss": 1.1524,
"step": 75
},
{
"epoch": 0.19437340153452684,
"grad_norm": 1.1978584662405511,
"learning_rate": 6.204081632653062e-05,
"loss": 1.1559,
"step": 76
},
{
"epoch": 0.1969309462915601,
"grad_norm": 1.0353931821191475,
"learning_rate": 6.285714285714286e-05,
"loss": 1.1657,
"step": 77
},
{
"epoch": 0.19948849104859334,
"grad_norm": 0.9094148187384907,
"learning_rate": 6.36734693877551e-05,
"loss": 1.1471,
"step": 78
},
{
"epoch": 0.2020460358056266,
"grad_norm": 1.187032715727395,
"learning_rate": 6.448979591836736e-05,
"loss": 1.1408,
"step": 79
},
{
"epoch": 0.20460358056265984,
"grad_norm": 1.0732720468700825,
"learning_rate": 6.53061224489796e-05,
"loss": 1.1565,
"step": 80
},
{
"epoch": 0.2071611253196931,
"grad_norm": 0.8103161887096414,
"learning_rate": 6.612244897959184e-05,
"loss": 1.1394,
"step": 81
},
{
"epoch": 0.20971867007672634,
"grad_norm": 1.0683800405517745,
"learning_rate": 6.693877551020408e-05,
"loss": 1.1366,
"step": 82
},
{
"epoch": 0.21227621483375958,
"grad_norm": 1.0570001635125388,
"learning_rate": 6.775510204081634e-05,
"loss": 1.1521,
"step": 83
},
{
"epoch": 0.21483375959079284,
"grad_norm": 1.0038253962694932,
"learning_rate": 6.857142857142857e-05,
"loss": 1.1485,
"step": 84
},
{
"epoch": 0.21739130434782608,
"grad_norm": 1.1691956641199828,
"learning_rate": 6.938775510204082e-05,
"loss": 1.199,
"step": 85
},
{
"epoch": 0.21994884910485935,
"grad_norm": 1.160205747766507,
"learning_rate": 7.020408163265306e-05,
"loss": 1.15,
"step": 86
},
{
"epoch": 0.22250639386189258,
"grad_norm": 1.0403901594667788,
"learning_rate": 7.10204081632653e-05,
"loss": 1.1547,
"step": 87
},
{
"epoch": 0.22506393861892582,
"grad_norm": 1.253302691517826,
"learning_rate": 7.183673469387756e-05,
"loss": 1.1808,
"step": 88
},
{
"epoch": 0.22762148337595908,
"grad_norm": 1.0181115029064822,
"learning_rate": 7.26530612244898e-05,
"loss": 1.1399,
"step": 89
},
{
"epoch": 0.23017902813299232,
"grad_norm": 1.179029120883534,
"learning_rate": 7.346938775510205e-05,
"loss": 1.1709,
"step": 90
},
{
"epoch": 0.23273657289002558,
"grad_norm": 0.8065046535786934,
"learning_rate": 7.42857142857143e-05,
"loss": 1.1649,
"step": 91
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.9920804997259105,
"learning_rate": 7.510204081632654e-05,
"loss": 1.1693,
"step": 92
},
{
"epoch": 0.23785166240409208,
"grad_norm": 1.4041632222361236,
"learning_rate": 7.591836734693878e-05,
"loss": 1.1525,
"step": 93
},
{
"epoch": 0.24040920716112532,
"grad_norm": 1.1202267325769892,
"learning_rate": 7.673469387755103e-05,
"loss": 1.1727,
"step": 94
},
{
"epoch": 0.24296675191815856,
"grad_norm": 0.9214700487119486,
"learning_rate": 7.755102040816327e-05,
"loss": 1.1193,
"step": 95
},
{
"epoch": 0.24552429667519182,
"grad_norm": 0.9731714014969046,
"learning_rate": 7.836734693877551e-05,
"loss": 1.1605,
"step": 96
},
{
"epoch": 0.24808184143222506,
"grad_norm": 1.2370098654676154,
"learning_rate": 7.918367346938776e-05,
"loss": 1.1663,
"step": 97
},
{
"epoch": 0.2506393861892583,
"grad_norm": 0.856182416906324,
"learning_rate": 8e-05,
"loss": 1.134,
"step": 98
},
{
"epoch": 0.2531969309462916,
"grad_norm": 1.0086218583881408,
"learning_rate": 8.081632653061225e-05,
"loss": 1.1307,
"step": 99
},
{
"epoch": 0.2557544757033248,
"grad_norm": 1.3360855997576195,
"learning_rate": 8.16326530612245e-05,
"loss": 1.1283,
"step": 100
},
{
"epoch": 0.25831202046035806,
"grad_norm": 1.1442169715816302,
"learning_rate": 8.244897959183673e-05,
"loss": 1.1486,
"step": 101
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.9528964485268216,
"learning_rate": 8.326530612244899e-05,
"loss": 1.1539,
"step": 102
},
{
"epoch": 0.26342710997442453,
"grad_norm": 1.2014260964822987,
"learning_rate": 8.408163265306123e-05,
"loss": 1.1246,
"step": 103
},
{
"epoch": 0.2659846547314578,
"grad_norm": 1.2896301281378582,
"learning_rate": 8.489795918367348e-05,
"loss": 1.1193,
"step": 104
},
{
"epoch": 0.26854219948849106,
"grad_norm": 1.2365104040466046,
"learning_rate": 8.571428571428571e-05,
"loss": 1.1257,
"step": 105
},
{
"epoch": 0.2710997442455243,
"grad_norm": 0.8909578987791607,
"learning_rate": 8.653061224489797e-05,
"loss": 1.1127,
"step": 106
},
{
"epoch": 0.27365728900255754,
"grad_norm": 1.170325095506981,
"learning_rate": 8.734693877551021e-05,
"loss": 1.1441,
"step": 107
},
{
"epoch": 0.27621483375959077,
"grad_norm": 0.8299590325351531,
"learning_rate": 8.816326530612245e-05,
"loss": 1.1199,
"step": 108
},
{
"epoch": 0.27877237851662406,
"grad_norm": 1.0039851893132474,
"learning_rate": 8.897959183673471e-05,
"loss": 1.1454,
"step": 109
},
{
"epoch": 0.2813299232736573,
"grad_norm": 1.309094467948393,
"learning_rate": 8.979591836734694e-05,
"loss": 1.1534,
"step": 110
},
{
"epoch": 0.28388746803069054,
"grad_norm": 1.030513843956652,
"learning_rate": 9.061224489795919e-05,
"loss": 1.1518,
"step": 111
},
{
"epoch": 0.2864450127877238,
"grad_norm": 1.1548472835092134,
"learning_rate": 9.142857142857143e-05,
"loss": 1.1422,
"step": 112
},
{
"epoch": 0.289002557544757,
"grad_norm": 1.0781950243182032,
"learning_rate": 9.224489795918369e-05,
"loss": 1.1125,
"step": 113
},
{
"epoch": 0.2915601023017903,
"grad_norm": 1.4696697800626741,
"learning_rate": 9.306122448979593e-05,
"loss": 1.147,
"step": 114
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.8932168550895682,
"learning_rate": 9.387755102040817e-05,
"loss": 1.1225,
"step": 115
},
{
"epoch": 0.2966751918158568,
"grad_norm": 1.4609624921794502,
"learning_rate": 9.469387755102041e-05,
"loss": 1.1402,
"step": 116
},
{
"epoch": 0.29923273657289,
"grad_norm": 1.1608303447004447,
"learning_rate": 9.551020408163267e-05,
"loss": 1.1268,
"step": 117
},
{
"epoch": 0.30179028132992325,
"grad_norm": 1.3699566135342083,
"learning_rate": 9.632653061224491e-05,
"loss": 1.1782,
"step": 118
},
{
"epoch": 0.30434782608695654,
"grad_norm": 1.0951988856065036,
"learning_rate": 9.714285714285714e-05,
"loss": 1.1383,
"step": 119
},
{
"epoch": 0.3069053708439898,
"grad_norm": 1.311071103466961,
"learning_rate": 9.79591836734694e-05,
"loss": 1.1485,
"step": 120
},
{
"epoch": 0.309462915601023,
"grad_norm": 0.8986951965704776,
"learning_rate": 9.877551020408164e-05,
"loss": 1.1604,
"step": 121
},
{
"epoch": 0.31202046035805625,
"grad_norm": 1.2243542530871734,
"learning_rate": 9.959183673469389e-05,
"loss": 1.1129,
"step": 122
},
{
"epoch": 0.3145780051150895,
"grad_norm": 1.3033780963392814,
"learning_rate": 0.00010040816326530613,
"loss": 1.1344,
"step": 123
},
{
"epoch": 0.3171355498721228,
"grad_norm": 1.1948786110977876,
"learning_rate": 0.00010122448979591839,
"loss": 1.1269,
"step": 124
},
{
"epoch": 0.319693094629156,
"grad_norm": 1.142953671137177,
"learning_rate": 0.00010204081632653062,
"loss": 1.1078,
"step": 125
},
{
"epoch": 0.32225063938618925,
"grad_norm": 1.1524456987304121,
"learning_rate": 0.00010285714285714287,
"loss": 1.1653,
"step": 126
},
{
"epoch": 0.3248081841432225,
"grad_norm": 1.1658601331325984,
"learning_rate": 0.00010367346938775511,
"loss": 1.1088,
"step": 127
},
{
"epoch": 0.3273657289002558,
"grad_norm": 1.72409589259486,
"learning_rate": 0.00010448979591836734,
"loss": 1.1289,
"step": 128
},
{
"epoch": 0.329923273657289,
"grad_norm": 0.7330929431707807,
"learning_rate": 0.0001053061224489796,
"loss": 1.1191,
"step": 129
},
{
"epoch": 0.33248081841432225,
"grad_norm": 1.2646590423606026,
"learning_rate": 0.00010612244897959184,
"loss": 1.1527,
"step": 130
},
{
"epoch": 0.3350383631713555,
"grad_norm": 1.723349785426215,
"learning_rate": 0.0001069387755102041,
"loss": 1.118,
"step": 131
},
{
"epoch": 0.3375959079283887,
"grad_norm": 0.8139452555798524,
"learning_rate": 0.00010775510204081634,
"loss": 1.1702,
"step": 132
},
{
"epoch": 0.340153452685422,
"grad_norm": 1.1603914477308341,
"learning_rate": 0.00010857142857142859,
"loss": 1.1467,
"step": 133
},
{
"epoch": 0.34271099744245526,
"grad_norm": 1.2110578835398869,
"learning_rate": 0.00010938775510204082,
"loss": 1.1174,
"step": 134
},
{
"epoch": 0.3452685421994885,
"grad_norm": 1.3576198261777483,
"learning_rate": 0.00011020408163265307,
"loss": 1.1746,
"step": 135
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.0573462588351146,
"learning_rate": 0.00011102040816326532,
"loss": 1.1333,
"step": 136
},
{
"epoch": 0.35038363171355497,
"grad_norm": 1.3340765255200633,
"learning_rate": 0.00011183673469387757,
"loss": 1.1482,
"step": 137
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.9284689786425085,
"learning_rate": 0.0001126530612244898,
"loss": 1.1304,
"step": 138
},
{
"epoch": 0.3554987212276215,
"grad_norm": 1.4254480759114776,
"learning_rate": 0.00011346938775510204,
"loss": 1.1106,
"step": 139
},
{
"epoch": 0.35805626598465473,
"grad_norm": 1.3594890583091455,
"learning_rate": 0.0001142857142857143,
"loss": 1.1664,
"step": 140
},
{
"epoch": 0.36061381074168797,
"grad_norm": 1.086678024627712,
"learning_rate": 0.00011510204081632654,
"loss": 1.0802,
"step": 141
},
{
"epoch": 0.3631713554987212,
"grad_norm": 1.533977830454029,
"learning_rate": 0.0001159183673469388,
"loss": 1.1332,
"step": 142
},
{
"epoch": 0.3657289002557545,
"grad_norm": 0.940287237501315,
"learning_rate": 0.00011673469387755102,
"loss": 1.1573,
"step": 143
},
{
"epoch": 0.36828644501278773,
"grad_norm": 1.2572408225100642,
"learning_rate": 0.00011755102040816328,
"loss": 1.1292,
"step": 144
},
{
"epoch": 0.37084398976982097,
"grad_norm": 0.9995509690787548,
"learning_rate": 0.00011836734693877552,
"loss": 1.1375,
"step": 145
},
{
"epoch": 0.3734015345268542,
"grad_norm": 1.6478855533912629,
"learning_rate": 0.00011918367346938777,
"loss": 1.1281,
"step": 146
},
{
"epoch": 0.37595907928388744,
"grad_norm": 0.9807964464883856,
"learning_rate": 0.00012000000000000002,
"loss": 1.1604,
"step": 147
},
{
"epoch": 0.37851662404092073,
"grad_norm": 1.3424151204814954,
"learning_rate": 0.00012081632653061224,
"loss": 1.1247,
"step": 148
},
{
"epoch": 0.38107416879795397,
"grad_norm": 1.1827965041877697,
"learning_rate": 0.0001216326530612245,
"loss": 1.1087,
"step": 149
},
{
"epoch": 0.3836317135549872,
"grad_norm": 1.374289317317436,
"learning_rate": 0.00012244897959183673,
"loss": 1.1174,
"step": 150
},
{
"epoch": 0.38618925831202044,
"grad_norm": 1.4462982798920152,
"learning_rate": 0.00012326530612244898,
"loss": 1.1243,
"step": 151
},
{
"epoch": 0.3887468030690537,
"grad_norm": 1.2338591594860693,
"learning_rate": 0.00012408163265306124,
"loss": 1.127,
"step": 152
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.9926991217212723,
"learning_rate": 0.0001248979591836735,
"loss": 1.1152,
"step": 153
},
{
"epoch": 0.3938618925831202,
"grad_norm": 1.6602432782777794,
"learning_rate": 0.00012571428571428572,
"loss": 1.129,
"step": 154
},
{
"epoch": 0.39641943734015345,
"grad_norm": 1.0710563936657969,
"learning_rate": 0.00012653061224489798,
"loss": 1.1347,
"step": 155
},
{
"epoch": 0.3989769820971867,
"grad_norm": 1.0203164310897854,
"learning_rate": 0.0001273469387755102,
"loss": 1.1398,
"step": 156
},
{
"epoch": 0.40153452685422,
"grad_norm": 1.4486120558817688,
"learning_rate": 0.00012816326530612246,
"loss": 1.1572,
"step": 157
},
{
"epoch": 0.4040920716112532,
"grad_norm": 1.0665461325193415,
"learning_rate": 0.00012897959183673472,
"loss": 1.1443,
"step": 158
},
{
"epoch": 0.40664961636828645,
"grad_norm": 1.6999184553867208,
"learning_rate": 0.00012979591836734695,
"loss": 1.1027,
"step": 159
},
{
"epoch": 0.4092071611253197,
"grad_norm": 1.0289801155197138,
"learning_rate": 0.0001306122448979592,
"loss": 1.1194,
"step": 160
},
{
"epoch": 0.4117647058823529,
"grad_norm": 1.5775539926551432,
"learning_rate": 0.00013142857142857143,
"loss": 1.1209,
"step": 161
},
{
"epoch": 0.4143222506393862,
"grad_norm": 0.9132827293227751,
"learning_rate": 0.00013224489795918368,
"loss": 1.1115,
"step": 162
},
{
"epoch": 0.41687979539641945,
"grad_norm": 1.8502610336806449,
"learning_rate": 0.00013306122448979594,
"loss": 1.1237,
"step": 163
},
{
"epoch": 0.4194373401534527,
"grad_norm": 1.3709322904356605,
"learning_rate": 0.00013387755102040817,
"loss": 1.1353,
"step": 164
},
{
"epoch": 0.4219948849104859,
"grad_norm": 1.1361849330749851,
"learning_rate": 0.00013469387755102042,
"loss": 1.1043,
"step": 165
},
{
"epoch": 0.42455242966751916,
"grad_norm": 1.1401579492886242,
"learning_rate": 0.00013551020408163268,
"loss": 1.1252,
"step": 166
},
{
"epoch": 0.42710997442455245,
"grad_norm": 1.171525164401231,
"learning_rate": 0.0001363265306122449,
"loss": 1.1226,
"step": 167
},
{
"epoch": 0.4296675191815857,
"grad_norm": 1.7103135890270424,
"learning_rate": 0.00013714285714285713,
"loss": 1.1323,
"step": 168
},
{
"epoch": 0.4322250639386189,
"grad_norm": 1.0590485560747558,
"learning_rate": 0.0001379591836734694,
"loss": 1.1318,
"step": 169
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.1381323068879685,
"learning_rate": 0.00013877551020408165,
"loss": 1.1093,
"step": 170
},
{
"epoch": 0.4373401534526854,
"grad_norm": 1.8095148756504853,
"learning_rate": 0.0001395918367346939,
"loss": 1.1297,
"step": 171
},
{
"epoch": 0.4398976982097187,
"grad_norm": 1.022630524722603,
"learning_rate": 0.00014040816326530613,
"loss": 1.1217,
"step": 172
},
{
"epoch": 0.4424552429667519,
"grad_norm": 1.3822427448836618,
"learning_rate": 0.00014122448979591838,
"loss": 1.145,
"step": 173
},
{
"epoch": 0.44501278772378516,
"grad_norm": 1.3577366143882001,
"learning_rate": 0.0001420408163265306,
"loss": 1.151,
"step": 174
},
{
"epoch": 0.4475703324808184,
"grad_norm": 1.1001324929653025,
"learning_rate": 0.00014285714285714287,
"loss": 1.1209,
"step": 175
},
{
"epoch": 0.45012787723785164,
"grad_norm": 1.7043306971887084,
"learning_rate": 0.00014367346938775512,
"loss": 1.155,
"step": 176
},
{
"epoch": 0.45268542199488493,
"grad_norm": 0.8908531162714106,
"learning_rate": 0.00014448979591836735,
"loss": 1.1264,
"step": 177
},
{
"epoch": 0.45524296675191817,
"grad_norm": 2.0064867394818577,
"learning_rate": 0.0001453061224489796,
"loss": 1.1339,
"step": 178
},
{
"epoch": 0.4578005115089514,
"grad_norm": 1.272017394425661,
"learning_rate": 0.00014612244897959183,
"loss": 1.1179,
"step": 179
},
{
"epoch": 0.46035805626598464,
"grad_norm": 1.6809099682132984,
"learning_rate": 0.0001469387755102041,
"loss": 1.1306,
"step": 180
},
{
"epoch": 0.4629156010230179,
"grad_norm": 1.2729546637062361,
"learning_rate": 0.00014775510204081635,
"loss": 1.1547,
"step": 181
},
{
"epoch": 0.46547314578005117,
"grad_norm": 1.2637405257695475,
"learning_rate": 0.0001485714285714286,
"loss": 1.1234,
"step": 182
},
{
"epoch": 0.4680306905370844,
"grad_norm": 1.3792667256601667,
"learning_rate": 0.00014938775510204083,
"loss": 1.1384,
"step": 183
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.0581158807496975,
"learning_rate": 0.00015020408163265308,
"loss": 1.1308,
"step": 184
},
{
"epoch": 0.4731457800511509,
"grad_norm": 1.2395276036732317,
"learning_rate": 0.0001510204081632653,
"loss": 1.142,
"step": 185
},
{
"epoch": 0.47570332480818417,
"grad_norm": 1.1474988241030795,
"learning_rate": 0.00015183673469387757,
"loss": 1.1399,
"step": 186
},
{
"epoch": 0.4782608695652174,
"grad_norm": 1.4488607840873033,
"learning_rate": 0.0001526530612244898,
"loss": 1.1247,
"step": 187
},
{
"epoch": 0.48081841432225064,
"grad_norm": 0.9895262383072666,
"learning_rate": 0.00015346938775510205,
"loss": 1.1439,
"step": 188
},
{
"epoch": 0.4833759590792839,
"grad_norm": 1.509540789570866,
"learning_rate": 0.0001542857142857143,
"loss": 1.1268,
"step": 189
},
{
"epoch": 0.4859335038363171,
"grad_norm": 1.2634220572499701,
"learning_rate": 0.00015510204081632654,
"loss": 1.1315,
"step": 190
},
{
"epoch": 0.4884910485933504,
"grad_norm": 2.03411519572473,
"learning_rate": 0.0001559183673469388,
"loss": 1.0859,
"step": 191
},
{
"epoch": 0.49104859335038364,
"grad_norm": 1.1783378998438716,
"learning_rate": 0.00015673469387755102,
"loss": 1.122,
"step": 192
},
{
"epoch": 0.4936061381074169,
"grad_norm": 1.869178693106169,
"learning_rate": 0.00015755102040816327,
"loss": 1.0953,
"step": 193
},
{
"epoch": 0.4961636828644501,
"grad_norm": 1.4133576585465655,
"learning_rate": 0.00015836734693877553,
"loss": 1.0973,
"step": 194
},
{
"epoch": 0.49872122762148335,
"grad_norm": 1.1007402607506083,
"learning_rate": 0.00015918367346938778,
"loss": 1.1666,
"step": 195
},
{
"epoch": 0.5012787723785166,
"grad_norm": 1.0455333445001125,
"learning_rate": 0.00016,
"loss": 1.1244,
"step": 196
},
{
"epoch": 0.5038363171355499,
"grad_norm": 1.1414091012657146,
"learning_rate": 0.00015999987240667874,
"loss": 1.118,
"step": 197
},
{
"epoch": 0.5063938618925832,
"grad_norm": 1.1934725533176622,
"learning_rate": 0.0001599994896271219,
"loss": 1.1489,
"step": 198
},
{
"epoch": 0.5089514066496164,
"grad_norm": 1.3418673611629677,
"learning_rate": 0.0001599988516625505,
"loss": 1.1172,
"step": 199
},
{
"epoch": 0.5115089514066496,
"grad_norm": 1.2281301450926736,
"learning_rate": 0.00015999795851499954,
"loss": 1.124,
"step": 200
},
{
"epoch": 0.5140664961636828,
"grad_norm": 1.4232277874832118,
"learning_rate": 0.000159996810187318,
"loss": 1.1087,
"step": 201
},
{
"epoch": 0.5166240409207161,
"grad_norm": 1.2445810609035501,
"learning_rate": 0.0001599954066831689,
"loss": 1.0977,
"step": 202
},
{
"epoch": 0.5191815856777494,
"grad_norm": 1.4902156849341144,
"learning_rate": 0.00015999374800702916,
"loss": 1.1278,
"step": 203
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.9117749926569193,
"learning_rate": 0.00015999183416418963,
"loss": 1.0978,
"step": 204
},
{
"epoch": 0.5242966751918159,
"grad_norm": 1.521914055307176,
"learning_rate": 0.0001599896651607552,
"loss": 1.1255,
"step": 205
},
{
"epoch": 0.5268542199488491,
"grad_norm": 1.675086821646465,
"learning_rate": 0.00015998724100364464,
"loss": 1.1117,
"step": 206
},
{
"epoch": 0.5294117647058824,
"grad_norm": 1.0370916213463357,
"learning_rate": 0.00015998456170059059,
"loss": 1.1269,
"step": 207
},
{
"epoch": 0.5319693094629157,
"grad_norm": 1.4543936507994073,
"learning_rate": 0.00015998162726013954,
"loss": 1.1159,
"step": 208
},
{
"epoch": 0.5345268542199488,
"grad_norm": 1.628168132567413,
"learning_rate": 0.00015997843769165193,
"loss": 1.1025,
"step": 209
},
{
"epoch": 0.5370843989769821,
"grad_norm": 1.114123127352084,
"learning_rate": 0.0001599749930053019,
"loss": 1.0962,
"step": 210
},
{
"epoch": 0.5396419437340153,
"grad_norm": 1.7051681399590384,
"learning_rate": 0.00015997129321207747,
"loss": 1.1216,
"step": 211
},
{
"epoch": 0.5421994884910486,
"grad_norm": 0.9137353240287979,
"learning_rate": 0.00015996733832378032,
"loss": 1.0845,
"step": 212
},
{
"epoch": 0.5447570332480819,
"grad_norm": 1.3585376285654678,
"learning_rate": 0.00015996312835302593,
"loss": 1.1337,
"step": 213
},
{
"epoch": 0.5473145780051151,
"grad_norm": 0.986649874454745,
"learning_rate": 0.00015995866331324334,
"loss": 1.0791,
"step": 214
},
{
"epoch": 0.5498721227621484,
"grad_norm": 1.4872086766761456,
"learning_rate": 0.00015995394321867534,
"loss": 1.0898,
"step": 215
},
{
"epoch": 0.5524296675191815,
"grad_norm": 1.3583123340693906,
"learning_rate": 0.0001599489680843782,
"loss": 1.1221,
"step": 216
},
{
"epoch": 0.5549872122762148,
"grad_norm": 1.1209846232833984,
"learning_rate": 0.00015994373792622182,
"loss": 1.0914,
"step": 217
},
{
"epoch": 0.5575447570332481,
"grad_norm": 1.1159100799958372,
"learning_rate": 0.0001599382527608895,
"loss": 1.0659,
"step": 218
},
{
"epoch": 0.5601023017902813,
"grad_norm": 1.014792737157986,
"learning_rate": 0.00015993251260587796,
"loss": 1.0895,
"step": 219
},
{
"epoch": 0.5626598465473146,
"grad_norm": 1.3514884114926682,
"learning_rate": 0.00015992651747949742,
"loss": 1.1447,
"step": 220
},
{
"epoch": 0.5652173913043478,
"grad_norm": 1.3662814180004041,
"learning_rate": 0.00015992026740087125,
"loss": 1.082,
"step": 221
},
{
"epoch": 0.5677749360613811,
"grad_norm": 1.1729073479593213,
"learning_rate": 0.00015991376238993623,
"loss": 1.0858,
"step": 222
},
{
"epoch": 0.5703324808184144,
"grad_norm": 1.098894416827083,
"learning_rate": 0.0001599070024674422,
"loss": 1.0903,
"step": 223
},
{
"epoch": 0.5728900255754475,
"grad_norm": 0.975594652798118,
"learning_rate": 0.0001598999876549522,
"loss": 1.1162,
"step": 224
},
{
"epoch": 0.5754475703324808,
"grad_norm": 1.0143269006614197,
"learning_rate": 0.00015989271797484236,
"loss": 1.1131,
"step": 225
},
{
"epoch": 0.578005115089514,
"grad_norm": 1.3483287924450105,
"learning_rate": 0.00015988519345030167,
"loss": 1.0896,
"step": 226
},
{
"epoch": 0.5805626598465473,
"grad_norm": 0.7520971748388883,
"learning_rate": 0.00015987741410533217,
"loss": 1.0953,
"step": 227
},
{
"epoch": 0.5831202046035806,
"grad_norm": 1.3201762056381772,
"learning_rate": 0.0001598693799647486,
"loss": 1.0837,
"step": 228
},
{
"epoch": 0.5856777493606138,
"grad_norm": 1.2193125892583727,
"learning_rate": 0.00015986109105417862,
"loss": 1.1026,
"step": 229
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.3892856581992825,
"learning_rate": 0.0001598525474000624,
"loss": 1.1069,
"step": 230
},
{
"epoch": 0.5907928388746803,
"grad_norm": 0.8831793540357707,
"learning_rate": 0.00015984374902965284,
"loss": 1.1079,
"step": 231
},
{
"epoch": 0.5933503836317136,
"grad_norm": 0.8405263869404558,
"learning_rate": 0.00015983469597101517,
"loss": 1.088,
"step": 232
},
{
"epoch": 0.5959079283887468,
"grad_norm": 0.8048081062282874,
"learning_rate": 0.0001598253882530272,
"loss": 1.0947,
"step": 233
},
{
"epoch": 0.59846547314578,
"grad_norm": 1.1026453527649267,
"learning_rate": 0.00015981582590537897,
"loss": 1.0527,
"step": 234
},
{
"epoch": 0.6010230179028133,
"grad_norm": 1.945124480668707,
"learning_rate": 0.0001598060089585728,
"loss": 1.0747,
"step": 235
},
{
"epoch": 0.6035805626598465,
"grad_norm": 0.6633926296437849,
"learning_rate": 0.00015979593744392312,
"loss": 1.1013,
"step": 236
},
{
"epoch": 0.6061381074168798,
"grad_norm": 1.9149178380903846,
"learning_rate": 0.00015978561139355635,
"loss": 1.0967,
"step": 237
},
{
"epoch": 0.6086956521739131,
"grad_norm": 1.3222885863625786,
"learning_rate": 0.00015977503084041087,
"loss": 1.0733,
"step": 238
},
{
"epoch": 0.6112531969309463,
"grad_norm": 1.0130031801765467,
"learning_rate": 0.00015976419581823688,
"loss": 1.1196,
"step": 239
},
{
"epoch": 0.6138107416879796,
"grad_norm": 1.5551163600364186,
"learning_rate": 0.00015975310636159632,
"loss": 1.088,
"step": 240
},
{
"epoch": 0.6163682864450127,
"grad_norm": 1.2158294095692619,
"learning_rate": 0.00015974176250586265,
"loss": 1.0768,
"step": 241
},
{
"epoch": 0.618925831202046,
"grad_norm": 1.0765542476008974,
"learning_rate": 0.00015973016428722094,
"loss": 1.106,
"step": 242
},
{
"epoch": 0.6214833759590793,
"grad_norm": 1.1132699812581053,
"learning_rate": 0.0001597183117426675,
"loss": 1.1002,
"step": 243
},
{
"epoch": 0.6240409207161125,
"grad_norm": 1.3600712766399181,
"learning_rate": 0.00015970620491001004,
"loss": 1.1445,
"step": 244
},
{
"epoch": 0.6265984654731458,
"grad_norm": 1.0416236386170334,
"learning_rate": 0.00015969384382786729,
"loss": 1.1019,
"step": 245
},
{
"epoch": 0.629156010230179,
"grad_norm": 1.3027622469497735,
"learning_rate": 0.00015968122853566905,
"loss": 1.1002,
"step": 246
},
{
"epoch": 0.6317135549872123,
"grad_norm": 0.8037304289524585,
"learning_rate": 0.000159668359073656,
"loss": 1.0892,
"step": 247
},
{
"epoch": 0.6342710997442456,
"grad_norm": 0.9188404876547497,
"learning_rate": 0.00015965523548287956,
"loss": 1.1395,
"step": 248
},
{
"epoch": 0.6368286445012787,
"grad_norm": 1.1903100937742757,
"learning_rate": 0.0001596418578052018,
"loss": 1.1157,
"step": 249
},
{
"epoch": 0.639386189258312,
"grad_norm": 1.134136870599723,
"learning_rate": 0.0001596282260832953,
"loss": 1.0961,
"step": 250
},
{
"epoch": 0.6419437340153452,
"grad_norm": 1.1666299453160198,
"learning_rate": 0.00015961434036064294,
"loss": 1.1019,
"step": 251
},
{
"epoch": 0.6445012787723785,
"grad_norm": 0.8723696508206527,
"learning_rate": 0.00015960020068153785,
"loss": 1.1053,
"step": 252
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.9568431382175138,
"learning_rate": 0.00015958580709108332,
"loss": 1.0848,
"step": 253
},
{
"epoch": 0.649616368286445,
"grad_norm": 1.1129808719393837,
"learning_rate": 0.00015957115963519244,
"loss": 1.136,
"step": 254
},
{
"epoch": 0.6521739130434783,
"grad_norm": 1.3963252311082919,
"learning_rate": 0.00015955625836058815,
"loss": 1.0952,
"step": 255
},
{
"epoch": 0.6547314578005116,
"grad_norm": 0.9298685363556572,
"learning_rate": 0.00015954110331480302,
"loss": 1.0809,
"step": 256
},
{
"epoch": 0.6572890025575447,
"grad_norm": 0.7001103257159264,
"learning_rate": 0.00015952569454617916,
"loss": 1.116,
"step": 257
},
{
"epoch": 0.659846547314578,
"grad_norm": 0.9441648189630093,
"learning_rate": 0.00015951003210386793,
"loss": 1.0784,
"step": 258
},
{
"epoch": 0.6624040920716112,
"grad_norm": 1.4002615649377306,
"learning_rate": 0.0001594941160378299,
"loss": 1.1071,
"step": 259
},
{
"epoch": 0.6649616368286445,
"grad_norm": 0.8178386113146091,
"learning_rate": 0.00015947794639883473,
"loss": 1.087,
"step": 260
},
{
"epoch": 0.6675191815856778,
"grad_norm": 1.452979203118016,
"learning_rate": 0.0001594615232384608,
"loss": 1.0604,
"step": 261
},
{
"epoch": 0.670076726342711,
"grad_norm": 0.6774046196617319,
"learning_rate": 0.00015944484660909523,
"loss": 1.076,
"step": 262
},
{
"epoch": 0.6726342710997443,
"grad_norm": 0.7670969521082094,
"learning_rate": 0.00015942791656393376,
"loss": 1.1204,
"step": 263
},
{
"epoch": 0.6751918158567775,
"grad_norm": 1.0850513811767653,
"learning_rate": 0.00015941073315698035,
"loss": 1.0986,
"step": 264
},
{
"epoch": 0.6777493606138107,
"grad_norm": 1.472017968872445,
"learning_rate": 0.00015939329644304724,
"loss": 1.1274,
"step": 265
},
{
"epoch": 0.680306905370844,
"grad_norm": 0.9702787550395545,
"learning_rate": 0.0001593756064777546,
"loss": 1.0934,
"step": 266
},
{
"epoch": 0.6828644501278772,
"grad_norm": 1.0584827946044062,
"learning_rate": 0.00015935766331753049,
"loss": 1.0471,
"step": 267
},
{
"epoch": 0.6854219948849105,
"grad_norm": 0.8089889110807604,
"learning_rate": 0.00015933946701961055,
"loss": 1.0887,
"step": 268
},
{
"epoch": 0.6879795396419437,
"grad_norm": 1.0320882417148256,
"learning_rate": 0.000159321017642038,
"loss": 1.0667,
"step": 269
},
{
"epoch": 0.690537084398977,
"grad_norm": 1.4674982303373638,
"learning_rate": 0.00015930231524366326,
"loss": 1.1073,
"step": 270
},
{
"epoch": 0.6930946291560103,
"grad_norm": 0.7320918729382444,
"learning_rate": 0.0001592833598841438,
"loss": 1.1053,
"step": 271
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.8289503109780553,
"learning_rate": 0.00015926415162394414,
"loss": 1.0707,
"step": 272
},
{
"epoch": 0.6982097186700768,
"grad_norm": 1.130825151382903,
"learning_rate": 0.00015924469052433534,
"loss": 1.0878,
"step": 273
},
{
"epoch": 0.7007672634271099,
"grad_norm": 0.9816938036576663,
"learning_rate": 0.00015922497664739508,
"loss": 1.1036,
"step": 274
},
{
"epoch": 0.7033248081841432,
"grad_norm": 1.1744231549177595,
"learning_rate": 0.0001592050100560074,
"loss": 1.0826,
"step": 275
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.1244228971801966,
"learning_rate": 0.0001591847908138623,
"loss": 1.0992,
"step": 276
},
{
"epoch": 0.7084398976982097,
"grad_norm": 1.0273673884618308,
"learning_rate": 0.00015916431898545583,
"loss": 1.1122,
"step": 277
},
{
"epoch": 0.710997442455243,
"grad_norm": 1.3019719478481941,
"learning_rate": 0.0001591435946360897,
"loss": 1.0797,
"step": 278
},
{
"epoch": 0.7135549872122762,
"grad_norm": 0.9179007336169464,
"learning_rate": 0.00015912261783187113,
"loss": 1.1083,
"step": 279
},
{
"epoch": 0.7161125319693095,
"grad_norm": 1.3938652199122237,
"learning_rate": 0.00015910138863971265,
"loss": 1.0768,
"step": 280
},
{
"epoch": 0.7186700767263428,
"grad_norm": 0.8460589876687793,
"learning_rate": 0.00015907990712733176,
"loss": 1.0675,
"step": 281
},
{
"epoch": 0.7212276214833759,
"grad_norm": 1.2311027949600852,
"learning_rate": 0.00015905817336325098,
"loss": 1.095,
"step": 282
},
{
"epoch": 0.7237851662404092,
"grad_norm": 0.5637046057878358,
"learning_rate": 0.00015903618741679735,
"loss": 1.0227,
"step": 283
},
{
"epoch": 0.7263427109974424,
"grad_norm": 0.8864195638565602,
"learning_rate": 0.00015901394935810236,
"loss": 1.0894,
"step": 284
},
{
"epoch": 0.7289002557544757,
"grad_norm": 1.118154448385255,
"learning_rate": 0.00015899145925810172,
"loss": 1.0708,
"step": 285
},
{
"epoch": 0.731457800511509,
"grad_norm": 0.8797417608904688,
"learning_rate": 0.0001589687171885351,
"loss": 1.0973,
"step": 286
},
{
"epoch": 0.7340153452685422,
"grad_norm": 1.2417892204976435,
"learning_rate": 0.0001589457232219459,
"loss": 1.0959,
"step": 287
},
{
"epoch": 0.7365728900255755,
"grad_norm": 1.3823792436001885,
"learning_rate": 0.000158922477431681,
"loss": 1.0588,
"step": 288
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.5914973374896305,
"learning_rate": 0.00015889897989189065,
"loss": 1.0877,
"step": 289
},
{
"epoch": 0.7416879795396419,
"grad_norm": 0.6894697219091279,
"learning_rate": 0.00015887523067752805,
"loss": 1.0987,
"step": 290
},
{
"epoch": 0.7442455242966752,
"grad_norm": 0.9378104999898202,
"learning_rate": 0.0001588512298643492,
"loss": 1.0813,
"step": 291
},
{
"epoch": 0.7468030690537084,
"grad_norm": 1.5924222953617497,
"learning_rate": 0.00015882697752891273,
"loss": 1.0493,
"step": 292
},
{
"epoch": 0.7493606138107417,
"grad_norm": 0.8644236985398326,
"learning_rate": 0.0001588024737485795,
"loss": 1.0745,
"step": 293
},
{
"epoch": 0.7519181585677749,
"grad_norm": 1.2617771174370838,
"learning_rate": 0.00015877771860151255,
"loss": 1.0756,
"step": 294
},
{
"epoch": 0.7544757033248082,
"grad_norm": 0.6053221801377883,
"learning_rate": 0.00015875271216667658,
"loss": 1.0624,
"step": 295
},
{
"epoch": 0.7570332480818415,
"grad_norm": 0.8733719684486176,
"learning_rate": 0.00015872745452383797,
"loss": 1.0713,
"step": 296
},
{
"epoch": 0.7595907928388747,
"grad_norm": 1.0570673007983702,
"learning_rate": 0.00015870194575356444,
"loss": 1.1115,
"step": 297
},
{
"epoch": 0.7621483375959079,
"grad_norm": 0.7325728255149376,
"learning_rate": 0.00015867618593722464,
"loss": 1.0871,
"step": 298
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.7340524897043603,
"learning_rate": 0.00015865017515698807,
"loss": 1.0979,
"step": 299
},
{
"epoch": 0.7672634271099744,
"grad_norm": 1.1656279626023016,
"learning_rate": 0.00015862391349582484,
"loss": 1.0597,
"step": 300
},
{
"epoch": 0.7698209718670077,
"grad_norm": 0.9978239568565908,
"learning_rate": 0.00015859740103750522,
"loss": 1.0932,
"step": 301
},
{
"epoch": 0.7723785166240409,
"grad_norm": 1.878442480743071,
"learning_rate": 0.00015857063786659954,
"loss": 1.0938,
"step": 302
},
{
"epoch": 0.7749360613810742,
"grad_norm": 0.6117011045915516,
"learning_rate": 0.00015854362406847786,
"loss": 1.0623,
"step": 303
},
{
"epoch": 0.7774936061381074,
"grad_norm": 1.8420720325784072,
"learning_rate": 0.00015851635972930967,
"loss": 1.0699,
"step": 304
},
{
"epoch": 0.7800511508951407,
"grad_norm": 1.002131752478182,
"learning_rate": 0.00015848884493606367,
"loss": 1.0826,
"step": 305
},
{
"epoch": 0.782608695652174,
"grad_norm": 1.2471718061674597,
"learning_rate": 0.00015846107977650743,
"loss": 1.0755,
"step": 306
},
{
"epoch": 0.7851662404092071,
"grad_norm": 0.9634733361160541,
"learning_rate": 0.0001584330643392072,
"loss": 1.0416,
"step": 307
},
{
"epoch": 0.7877237851662404,
"grad_norm": 1.790526532103535,
"learning_rate": 0.00015840479871352754,
"loss": 1.0754,
"step": 308
},
{
"epoch": 0.7902813299232737,
"grad_norm": 0.8667875735812341,
"learning_rate": 0.00015837628298963105,
"loss": 1.0934,
"step": 309
},
{
"epoch": 0.7928388746803069,
"grad_norm": 1.4536288271279978,
"learning_rate": 0.00015834751725847816,
"loss": 1.0632,
"step": 310
},
{
"epoch": 0.7953964194373402,
"grad_norm": 1.3777516183353187,
"learning_rate": 0.00015831850161182677,
"loss": 1.0956,
"step": 311
},
{
"epoch": 0.7979539641943734,
"grad_norm": 0.7721449298753891,
"learning_rate": 0.0001582892361422319,
"loss": 1.1069,
"step": 312
},
{
"epoch": 0.8005115089514067,
"grad_norm": 1.174156872017157,
"learning_rate": 0.00015825972094304555,
"loss": 1.0728,
"step": 313
},
{
"epoch": 0.80306905370844,
"grad_norm": 1.2588808228888746,
"learning_rate": 0.00015822995610841623,
"loss": 1.0772,
"step": 314
},
{
"epoch": 0.8056265984654731,
"grad_norm": 0.8720000426242472,
"learning_rate": 0.00015819994173328885,
"loss": 1.0654,
"step": 315
},
{
"epoch": 0.8081841432225064,
"grad_norm": 0.923631788770043,
"learning_rate": 0.00015816967791340417,
"loss": 1.0668,
"step": 316
},
{
"epoch": 0.8107416879795396,
"grad_norm": 1.1357229877804957,
"learning_rate": 0.00015813916474529885,
"loss": 1.0911,
"step": 317
},
{
"epoch": 0.8132992327365729,
"grad_norm": 0.8907121901474587,
"learning_rate": 0.0001581084023263047,
"loss": 1.0826,
"step": 318
},
{
"epoch": 0.8158567774936062,
"grad_norm": 1.0350783431396418,
"learning_rate": 0.00015807739075454874,
"loss": 1.0426,
"step": 319
},
{
"epoch": 0.8184143222506394,
"grad_norm": 1.2795269410097496,
"learning_rate": 0.00015804613012895268,
"loss": 1.0731,
"step": 320
},
{
"epoch": 0.8209718670076727,
"grad_norm": 0.8440033467786482,
"learning_rate": 0.0001580146205492327,
"loss": 1.0491,
"step": 321
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.9336906509179427,
"learning_rate": 0.00015798286211589916,
"loss": 1.0796,
"step": 322
},
{
"epoch": 0.8260869565217391,
"grad_norm": 1.243210147279451,
"learning_rate": 0.00015795085493025608,
"loss": 1.0998,
"step": 323
},
{
"epoch": 0.8286445012787724,
"grad_norm": 0.985781736568132,
"learning_rate": 0.00015791859909440107,
"loss": 1.097,
"step": 324
},
{
"epoch": 0.8312020460358056,
"grad_norm": 1.115722030381177,
"learning_rate": 0.00015788609471122485,
"loss": 1.0594,
"step": 325
},
{
"epoch": 0.8337595907928389,
"grad_norm": 0.6317177707367972,
"learning_rate": 0.000157853341884411,
"loss": 1.0672,
"step": 326
},
{
"epoch": 0.8363171355498721,
"grad_norm": 0.7614994384747567,
"learning_rate": 0.00015782034071843557,
"loss": 1.1076,
"step": 327
},
{
"epoch": 0.8388746803069054,
"grad_norm": 0.6788203373242645,
"learning_rate": 0.00015778709131856675,
"loss": 1.0794,
"step": 328
},
{
"epoch": 0.8414322250639387,
"grad_norm": 0.6573621171258895,
"learning_rate": 0.00015775359379086455,
"loss": 1.1175,
"step": 329
},
{
"epoch": 0.8439897698209718,
"grad_norm": 0.865009547315977,
"learning_rate": 0.00015771984824218053,
"loss": 1.0893,
"step": 330
},
{
"epoch": 0.8465473145780051,
"grad_norm": 1.0982989183876286,
"learning_rate": 0.00015768585478015732,
"loss": 1.0628,
"step": 331
},
{
"epoch": 0.8491048593350383,
"grad_norm": 1.5816845014682415,
"learning_rate": 0.00015765161351322845,
"loss": 1.0553,
"step": 332
},
{
"epoch": 0.8516624040920716,
"grad_norm": 0.5583122236625028,
"learning_rate": 0.0001576171245506178,
"loss": 1.1007,
"step": 333
},
{
"epoch": 0.8542199488491049,
"grad_norm": 1.4589646002026686,
"learning_rate": 0.00015758238800233937,
"loss": 1.0354,
"step": 334
},
{
"epoch": 0.8567774936061381,
"grad_norm": 1.1988373358126654,
"learning_rate": 0.00015754740397919703,
"loss": 1.0609,
"step": 335
},
{
"epoch": 0.8593350383631714,
"grad_norm": 0.7798431918437426,
"learning_rate": 0.0001575121725927839,
"loss": 1.0599,
"step": 336
},
{
"epoch": 0.8618925831202046,
"grad_norm": 0.8001399476748517,
"learning_rate": 0.00015747669395548228,
"loss": 1.0825,
"step": 337
},
{
"epoch": 0.8644501278772379,
"grad_norm": 0.9268381518772149,
"learning_rate": 0.00015744096818046306,
"loss": 1.0867,
"step": 338
},
{
"epoch": 0.8670076726342711,
"grad_norm": 0.8482506857320948,
"learning_rate": 0.00015740499538168548,
"loss": 1.0519,
"step": 339
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.1051027320167537,
"learning_rate": 0.00015736877567389682,
"loss": 1.0926,
"step": 340
},
{
"epoch": 0.8721227621483376,
"grad_norm": 1.1295814345497992,
"learning_rate": 0.00015733230917263182,
"loss": 1.0485,
"step": 341
},
{
"epoch": 0.8746803069053708,
"grad_norm": 0.8381578992561258,
"learning_rate": 0.00015729559599421262,
"loss": 1.0742,
"step": 342
},
{
"epoch": 0.8772378516624041,
"grad_norm": 1.1355285501553987,
"learning_rate": 0.00015725863625574808,
"loss": 1.0731,
"step": 343
},
{
"epoch": 0.8797953964194374,
"grad_norm": 1.2716344612482289,
"learning_rate": 0.0001572214300751336,
"loss": 1.0818,
"step": 344
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.7977797928903454,
"learning_rate": 0.00015718397757105072,
"loss": 1.0592,
"step": 345
},
{
"epoch": 0.8849104859335039,
"grad_norm": 0.5888833117266756,
"learning_rate": 0.0001571462788629666,
"loss": 1.124,
"step": 346
},
{
"epoch": 0.887468030690537,
"grad_norm": 0.7277724084604381,
"learning_rate": 0.00015710833407113386,
"loss": 1.0076,
"step": 347
},
{
"epoch": 0.8900255754475703,
"grad_norm": 0.7175876926395411,
"learning_rate": 0.00015707014331659008,
"loss": 1.0735,
"step": 348
},
{
"epoch": 0.8925831202046036,
"grad_norm": 0.8127426786215441,
"learning_rate": 0.00015703170672115737,
"loss": 1.0582,
"step": 349
},
{
"epoch": 0.8951406649616368,
"grad_norm": 1.0648976192629485,
"learning_rate": 0.00015699302440744202,
"loss": 1.0788,
"step": 350
},
{
"epoch": 0.8976982097186701,
"grad_norm": 1.2133128800930093,
"learning_rate": 0.00015695409649883418,
"loss": 1.0986,
"step": 351
},
{
"epoch": 0.9002557544757033,
"grad_norm": 0.946491692276404,
"learning_rate": 0.0001569149231195074,
"loss": 1.0522,
"step": 352
},
{
"epoch": 0.9028132992327366,
"grad_norm": 1.2375939940771874,
"learning_rate": 0.0001568755043944182,
"loss": 1.077,
"step": 353
},
{
"epoch": 0.9053708439897699,
"grad_norm": 0.7734830655451521,
"learning_rate": 0.00015683584044930572,
"loss": 1.0659,
"step": 354
},
{
"epoch": 0.907928388746803,
"grad_norm": 0.6097683019560797,
"learning_rate": 0.00015679593141069132,
"loss": 1.0446,
"step": 355
},
{
"epoch": 0.9104859335038363,
"grad_norm": 0.5759587093662797,
"learning_rate": 0.0001567557774058782,
"loss": 1.0577,
"step": 356
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.5878753626840652,
"learning_rate": 0.0001567153785629509,
"loss": 1.0675,
"step": 357
},
{
"epoch": 0.9156010230179028,
"grad_norm": 0.6653732754348032,
"learning_rate": 0.000156674735010775,
"loss": 1.0891,
"step": 358
},
{
"epoch": 0.9181585677749361,
"grad_norm": 0.768263015413779,
"learning_rate": 0.00015663384687899663,
"loss": 1.0715,
"step": 359
},
{
"epoch": 0.9207161125319693,
"grad_norm": 0.9765055577703315,
"learning_rate": 0.00015659271429804215,
"loss": 1.0396,
"step": 360
},
{
"epoch": 0.9232736572890026,
"grad_norm": 1.4554265699809417,
"learning_rate": 0.00015655133739911757,
"loss": 1.0919,
"step": 361
},
{
"epoch": 0.9258312020460358,
"grad_norm": 0.7208280463855818,
"learning_rate": 0.0001565097163142083,
"loss": 1.0151,
"step": 362
},
{
"epoch": 0.928388746803069,
"grad_norm": 0.8611710190483517,
"learning_rate": 0.00015646785117607865,
"loss": 1.0796,
"step": 363
},
{
"epoch": 0.9309462915601023,
"grad_norm": 1.1291766944081427,
"learning_rate": 0.00015642574211827142,
"loss": 1.0651,
"step": 364
},
{
"epoch": 0.9335038363171355,
"grad_norm": 1.0023408896760695,
"learning_rate": 0.00015638338927510752,
"loss": 1.0785,
"step": 365
},
{
"epoch": 0.9360613810741688,
"grad_norm": 1.2325468393537922,
"learning_rate": 0.00015634079278168542,
"loss": 1.1032,
"step": 366
},
{
"epoch": 0.9386189258312021,
"grad_norm": 0.8116887550297889,
"learning_rate": 0.00015629795277388077,
"loss": 1.0784,
"step": 367
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.8465793191190484,
"learning_rate": 0.00015625486938834613,
"loss": 1.0729,
"step": 368
},
{
"epoch": 0.9437340153452686,
"grad_norm": 0.8630348039771475,
"learning_rate": 0.00015621154276251024,
"loss": 1.0676,
"step": 369
},
{
"epoch": 0.9462915601023018,
"grad_norm": 0.8909789093135501,
"learning_rate": 0.00015616797303457782,
"loss": 1.0626,
"step": 370
},
{
"epoch": 0.948849104859335,
"grad_norm": 1.3639686895279477,
"learning_rate": 0.00015612416034352906,
"loss": 1.0935,
"step": 371
},
{
"epoch": 0.9514066496163683,
"grad_norm": 0.7547937680438821,
"learning_rate": 0.00015608010482911908,
"loss": 1.0714,
"step": 372
},
{
"epoch": 0.9539641943734015,
"grad_norm": 0.6097577881338234,
"learning_rate": 0.00015603580663187765,
"loss": 1.0757,
"step": 373
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.7408592240149442,
"learning_rate": 0.00015599126589310857,
"loss": 1.0762,
"step": 374
},
{
"epoch": 0.959079283887468,
"grad_norm": 0.8123009573402776,
"learning_rate": 0.00015594648275488944,
"loss": 1.0991,
"step": 375
},
{
"epoch": 0.9616368286445013,
"grad_norm": 0.8997010834862542,
"learning_rate": 0.00015590145736007091,
"loss": 1.0493,
"step": 376
},
{
"epoch": 0.9641943734015346,
"grad_norm": 1.211365253216414,
"learning_rate": 0.00015585618985227657,
"loss": 1.0845,
"step": 377
},
{
"epoch": 0.9667519181585678,
"grad_norm": 1.1546641796621098,
"learning_rate": 0.00015581068037590212,
"loss": 1.0851,
"step": 378
},
{
"epoch": 0.969309462915601,
"grad_norm": 1.1673337321688009,
"learning_rate": 0.00015576492907611524,
"loss": 1.054,
"step": 379
},
{
"epoch": 0.9718670076726342,
"grad_norm": 0.6737544031199463,
"learning_rate": 0.00015571893609885493,
"loss": 1.0377,
"step": 380
},
{
"epoch": 0.9744245524296675,
"grad_norm": 0.8151328439701532,
"learning_rate": 0.00015567270159083107,
"loss": 1.0698,
"step": 381
},
{
"epoch": 0.9769820971867008,
"grad_norm": 0.9445758081131683,
"learning_rate": 0.00015562622569952408,
"loss": 1.0723,
"step": 382
},
{
"epoch": 0.979539641943734,
"grad_norm": 1.0143687259241263,
"learning_rate": 0.00015557950857318425,
"loss": 1.0753,
"step": 383
},
{
"epoch": 0.9820971867007673,
"grad_norm": 1.0909144236610384,
"learning_rate": 0.00015553255036083145,
"loss": 1.0301,
"step": 384
},
{
"epoch": 0.9846547314578005,
"grad_norm": 1.2562026829762518,
"learning_rate": 0.0001554853512122545,
"loss": 1.1103,
"step": 385
},
{
"epoch": 0.9872122762148338,
"grad_norm": 0.7752538678352305,
"learning_rate": 0.00015543791127801084,
"loss": 1.0633,
"step": 386
},
{
"epoch": 0.989769820971867,
"grad_norm": 0.6480828071883595,
"learning_rate": 0.0001553902307094259,
"loss": 1.0769,
"step": 387
},
{
"epoch": 0.9923273657289002,
"grad_norm": 0.8764236095011647,
"learning_rate": 0.00015534230965859276,
"loss": 1.0905,
"step": 388
},
{
"epoch": 0.9948849104859335,
"grad_norm": 1.1982183014384076,
"learning_rate": 0.00015529414827837156,
"loss": 1.0737,
"step": 389
},
{
"epoch": 0.9974424552429667,
"grad_norm": 1.0015924584874194,
"learning_rate": 0.00015524574672238906,
"loss": 1.0539,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 1.3714997731388885,
"learning_rate": 0.00015519710514503814,
"loss": 1.0846,
"step": 391
},
{
"epoch": 1.0025575447570332,
"grad_norm": 0.5566435857743947,
"learning_rate": 0.00015514822370147732,
"loss": 1.0432,
"step": 392
},
{
"epoch": 1.0051150895140666,
"grad_norm": 0.7918387632633654,
"learning_rate": 0.00015509910254763023,
"loss": 1.0578,
"step": 393
},
{
"epoch": 1.0076726342710998,
"grad_norm": 1.256938009132569,
"learning_rate": 0.0001550497418401852,
"loss": 1.0306,
"step": 394
},
{
"epoch": 1.010230179028133,
"grad_norm": 1.2314520681198668,
"learning_rate": 0.00015500014173659457,
"loss": 1.0383,
"step": 395
},
{
"epoch": 1.0127877237851663,
"grad_norm": 0.923069995672888,
"learning_rate": 0.00015495030239507442,
"loss": 1.0573,
"step": 396
},
{
"epoch": 1.0153452685421995,
"grad_norm": 0.936236903889318,
"learning_rate": 0.00015490022397460392,
"loss": 1.0573,
"step": 397
},
{
"epoch": 1.0179028132992327,
"grad_norm": 0.6628420746065794,
"learning_rate": 0.0001548499066349249,
"loss": 1.0474,
"step": 398
},
{
"epoch": 1.020460358056266,
"grad_norm": 0.47759016557709666,
"learning_rate": 0.00015479935053654126,
"loss": 1.0175,
"step": 399
},
{
"epoch": 1.0230179028132993,
"grad_norm": 0.61072929455943,
"learning_rate": 0.00015474855584071847,
"loss": 1.0724,
"step": 400
},
{
"epoch": 1.0255754475703325,
"grad_norm": 0.607075351205747,
"learning_rate": 0.0001546975227094832,
"loss": 1.0527,
"step": 401
},
{
"epoch": 1.0281329923273657,
"grad_norm": 0.5993295243529821,
"learning_rate": 0.00015464625130562256,
"loss": 1.0695,
"step": 402
},
{
"epoch": 1.030690537084399,
"grad_norm": 0.9177173231285568,
"learning_rate": 0.0001545947417926838,
"loss": 1.0344,
"step": 403
},
{
"epoch": 1.0332480818414322,
"grad_norm": 1.4911897806007488,
"learning_rate": 0.00015454299433497362,
"loss": 1.0443,
"step": 404
},
{
"epoch": 1.0358056265984654,
"grad_norm": 0.6069008914687445,
"learning_rate": 0.00015449100909755784,
"loss": 1.0393,
"step": 405
},
{
"epoch": 1.0383631713554988,
"grad_norm": 0.9163856494121054,
"learning_rate": 0.00015443878624626066,
"loss": 1.0737,
"step": 406
},
{
"epoch": 1.040920716112532,
"grad_norm": 1.369010227838881,
"learning_rate": 0.0001543863259476642,
"loss": 1.0106,
"step": 407
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.8651156065397383,
"learning_rate": 0.00015433362836910817,
"loss": 1.0399,
"step": 408
},
{
"epoch": 1.0460358056265984,
"grad_norm": 0.8527058058258006,
"learning_rate": 0.00015428069367868892,
"loss": 1.0222,
"step": 409
},
{
"epoch": 1.0485933503836318,
"grad_norm": 0.7680613356197566,
"learning_rate": 0.00015422752204525937,
"loss": 1.0161,
"step": 410
},
{
"epoch": 1.051150895140665,
"grad_norm": 1.0745283772693792,
"learning_rate": 0.0001541741136384281,
"loss": 1.0446,
"step": 411
},
{
"epoch": 1.0537084398976981,
"grad_norm": 1.0936408809378098,
"learning_rate": 0.00015412046862855902,
"loss": 1.0245,
"step": 412
},
{
"epoch": 1.0562659846547315,
"grad_norm": 0.9926125079651018,
"learning_rate": 0.00015406658718677076,
"loss": 1.0308,
"step": 413
},
{
"epoch": 1.0588235294117647,
"grad_norm": 1.1175953083121093,
"learning_rate": 0.00015401246948493612,
"loss": 1.0768,
"step": 414
},
{
"epoch": 1.061381074168798,
"grad_norm": 0.8210085027845057,
"learning_rate": 0.00015395811569568154,
"loss": 1.0473,
"step": 415
},
{
"epoch": 1.0639386189258313,
"grad_norm": 0.9226634652720442,
"learning_rate": 0.00015390352599238655,
"loss": 1.0299,
"step": 416
},
{
"epoch": 1.0664961636828645,
"grad_norm": 1.2471786951586945,
"learning_rate": 0.00015384870054918314,
"loss": 1.0139,
"step": 417
},
{
"epoch": 1.0690537084398977,
"grad_norm": 0.8806851237766041,
"learning_rate": 0.00015379363954095535,
"loss": 1.0237,
"step": 418
},
{
"epoch": 1.0716112531969308,
"grad_norm": 0.727069173053958,
"learning_rate": 0.0001537383431433386,
"loss": 1.0786,
"step": 419
},
{
"epoch": 1.0741687979539642,
"grad_norm": 0.6337579771769642,
"learning_rate": 0.00015368281153271918,
"loss": 1.0264,
"step": 420
},
{
"epoch": 1.0767263427109974,
"grad_norm": 0.8868138217653037,
"learning_rate": 0.0001536270448862336,
"loss": 1.0413,
"step": 421
},
{
"epoch": 1.0792838874680306,
"grad_norm": 0.8013668539540468,
"learning_rate": 0.00015357104338176823,
"loss": 1.0305,
"step": 422
},
{
"epoch": 1.081841432225064,
"grad_norm": 1.0111414586274687,
"learning_rate": 0.00015351480719795845,
"loss": 1.0177,
"step": 423
},
{
"epoch": 1.0843989769820972,
"grad_norm": 1.3128642093201517,
"learning_rate": 0.00015345833651418835,
"loss": 1.0663,
"step": 424
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.7074818377117421,
"learning_rate": 0.00015340163151058997,
"loss": 1.0262,
"step": 425
},
{
"epoch": 1.0895140664961638,
"grad_norm": 0.7476417982075203,
"learning_rate": 0.00015334469236804278,
"loss": 1.0166,
"step": 426
},
{
"epoch": 1.092071611253197,
"grad_norm": 0.7163607115802371,
"learning_rate": 0.00015328751926817314,
"loss": 1.041,
"step": 427
},
{
"epoch": 1.0946291560102301,
"grad_norm": 1.0614664295591614,
"learning_rate": 0.0001532301123933537,
"loss": 1.0236,
"step": 428
},
{
"epoch": 1.0971867007672633,
"grad_norm": 1.265439568931787,
"learning_rate": 0.00015317247192670282,
"loss": 1.0528,
"step": 429
},
{
"epoch": 1.0997442455242967,
"grad_norm": 0.7025263297795912,
"learning_rate": 0.00015311459805208397,
"loss": 1.0277,
"step": 430
},
{
"epoch": 1.10230179028133,
"grad_norm": 0.8167641509021383,
"learning_rate": 0.0001530564909541051,
"loss": 1.0582,
"step": 431
},
{
"epoch": 1.104859335038363,
"grad_norm": 0.8716549745993203,
"learning_rate": 0.0001529981508181182,
"loss": 1.077,
"step": 432
},
{
"epoch": 1.1074168797953965,
"grad_norm": 0.7246028123611893,
"learning_rate": 0.00015293957783021854,
"loss": 1.0542,
"step": 433
},
{
"epoch": 1.1099744245524297,
"grad_norm": 0.6784199036145839,
"learning_rate": 0.0001528807721772442,
"loss": 1.0418,
"step": 434
},
{
"epoch": 1.1125319693094629,
"grad_norm": 0.8506075875171634,
"learning_rate": 0.00015282173404677533,
"loss": 1.0343,
"step": 435
},
{
"epoch": 1.1150895140664963,
"grad_norm": 0.8375757880980345,
"learning_rate": 0.00015276246362713375,
"loss": 1.0341,
"step": 436
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.7540319449850698,
"learning_rate": 0.00015270296110738221,
"loss": 1.014,
"step": 437
},
{
"epoch": 1.1202046035805626,
"grad_norm": 0.9166441931706429,
"learning_rate": 0.0001526432266773238,
"loss": 1.0269,
"step": 438
},
{
"epoch": 1.1227621483375958,
"grad_norm": 1.0822305273066126,
"learning_rate": 0.0001525832605275014,
"loss": 1.0472,
"step": 439
},
{
"epoch": 1.1253196930946292,
"grad_norm": 0.9450917972251209,
"learning_rate": 0.000152523062849197,
"loss": 1.024,
"step": 440
},
{
"epoch": 1.1278772378516624,
"grad_norm": 1.1333566165350994,
"learning_rate": 0.0001524626338344311,
"loss": 1.0448,
"step": 441
},
{
"epoch": 1.1304347826086956,
"grad_norm": 1.177581998734778,
"learning_rate": 0.00015240197367596226,
"loss": 1.0244,
"step": 442
},
{
"epoch": 1.132992327365729,
"grad_norm": 0.8866480092962395,
"learning_rate": 0.00015234108256728616,
"loss": 1.0499,
"step": 443
},
{
"epoch": 1.1355498721227621,
"grad_norm": 0.6882160288370965,
"learning_rate": 0.00015227996070263535,
"loss": 1.0151,
"step": 444
},
{
"epoch": 1.1381074168797953,
"grad_norm": 0.7419397568748587,
"learning_rate": 0.00015221860827697832,
"loss": 1.0345,
"step": 445
},
{
"epoch": 1.1406649616368287,
"grad_norm": 0.854881931061872,
"learning_rate": 0.00015215702548601907,
"loss": 1.008,
"step": 446
},
{
"epoch": 1.143222506393862,
"grad_norm": 0.8138274292487687,
"learning_rate": 0.00015209521252619644,
"loss": 0.9962,
"step": 447
},
{
"epoch": 1.145780051150895,
"grad_norm": 0.7536271031473499,
"learning_rate": 0.00015203316959468344,
"loss": 1.0299,
"step": 448
},
{
"epoch": 1.1483375959079285,
"grad_norm": 0.9110426205382722,
"learning_rate": 0.0001519708968893867,
"loss": 1.019,
"step": 449
},
{
"epoch": 1.1508951406649617,
"grad_norm": 1.2088991550402766,
"learning_rate": 0.00015190839460894567,
"loss": 1.0708,
"step": 450
},
{
"epoch": 1.1534526854219949,
"grad_norm": 0.8573913285400658,
"learning_rate": 0.00015184566295273227,
"loss": 1.0417,
"step": 451
},
{
"epoch": 1.156010230179028,
"grad_norm": 0.6951469442919158,
"learning_rate": 0.00015178270212084995,
"loss": 1.0464,
"step": 452
},
{
"epoch": 1.1585677749360614,
"grad_norm": 0.6419948195410027,
"learning_rate": 0.00015171951231413328,
"loss": 1.0612,
"step": 453
},
{
"epoch": 1.1611253196930946,
"grad_norm": 0.6841619518854335,
"learning_rate": 0.00015165609373414722,
"loss": 1.0325,
"step": 454
},
{
"epoch": 1.1636828644501278,
"grad_norm": 0.8037291566188051,
"learning_rate": 0.0001515924465831864,
"loss": 1.0295,
"step": 455
},
{
"epoch": 1.1662404092071612,
"grad_norm": 1.1795212959071533,
"learning_rate": 0.00015152857106427462,
"loss": 1.0231,
"step": 456
},
{
"epoch": 1.1687979539641944,
"grad_norm": 1.1007425485117117,
"learning_rate": 0.00015146446738116412,
"loss": 1.015,
"step": 457
},
{
"epoch": 1.1713554987212276,
"grad_norm": 1.072656472389329,
"learning_rate": 0.00015140013573833498,
"loss": 1.0195,
"step": 458
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.9339605123999745,
"learning_rate": 0.00015133557634099435,
"loss": 1.026,
"step": 459
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.8580962355846978,
"learning_rate": 0.00015127078939507595,
"loss": 1.055,
"step": 460
},
{
"epoch": 1.1790281329923273,
"grad_norm": 1.028703820245517,
"learning_rate": 0.00015120577510723934,
"loss": 1.0768,
"step": 461
},
{
"epoch": 1.1815856777493605,
"grad_norm": 1.1535909770008528,
"learning_rate": 0.00015114053368486919,
"loss": 1.0227,
"step": 462
},
{
"epoch": 1.184143222506394,
"grad_norm": 0.7549525724152655,
"learning_rate": 0.0001510750653360748,
"loss": 1.0101,
"step": 463
},
{
"epoch": 1.186700767263427,
"grad_norm": 0.6560485854233202,
"learning_rate": 0.00015100937026968922,
"loss": 1.0372,
"step": 464
},
{
"epoch": 1.1892583120204603,
"grad_norm": 0.5946694031246916,
"learning_rate": 0.0001509434486952688,
"loss": 1.0471,
"step": 465
},
{
"epoch": 1.1918158567774937,
"grad_norm": 0.5311919492244818,
"learning_rate": 0.00015087730082309232,
"loss": 1.0431,
"step": 466
},
{
"epoch": 1.1943734015345269,
"grad_norm": 0.5154174371307244,
"learning_rate": 0.00015081092686416043,
"loss": 1.0199,
"step": 467
},
{
"epoch": 1.19693094629156,
"grad_norm": 0.505383670902881,
"learning_rate": 0.00015074432703019504,
"loss": 1.0706,
"step": 468
},
{
"epoch": 1.1994884910485935,
"grad_norm": 0.4907682209551291,
"learning_rate": 0.00015067750153363845,
"loss": 1.0346,
"step": 469
},
{
"epoch": 1.2020460358056266,
"grad_norm": 0.39066205442828883,
"learning_rate": 0.00015061045058765282,
"loss": 1.0554,
"step": 470
},
{
"epoch": 1.2046035805626598,
"grad_norm": 0.34420579713251814,
"learning_rate": 0.0001505431744061195,
"loss": 1.0279,
"step": 471
},
{
"epoch": 1.207161125319693,
"grad_norm": 0.43688810183174753,
"learning_rate": 0.0001504756732036383,
"loss": 0.9885,
"step": 472
},
{
"epoch": 1.2097186700767264,
"grad_norm": 0.4751633909038584,
"learning_rate": 0.00015040794719552676,
"loss": 1.0432,
"step": 473
},
{
"epoch": 1.2122762148337596,
"grad_norm": 0.5269656781598262,
"learning_rate": 0.00015033999659781953,
"loss": 1.027,
"step": 474
},
{
"epoch": 1.2148337595907928,
"grad_norm": 0.5712060191776948,
"learning_rate": 0.00015027182162726769,
"loss": 1.0421,
"step": 475
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.6411090148779058,
"learning_rate": 0.000150203422501338,
"loss": 1.013,
"step": 476
},
{
"epoch": 1.2199488491048593,
"grad_norm": 0.922985318540642,
"learning_rate": 0.00015013479943821225,
"loss": 1.0671,
"step": 477
},
{
"epoch": 1.2225063938618925,
"grad_norm": 1.411342942170953,
"learning_rate": 0.00015006595265678655,
"loss": 1.0506,
"step": 478
},
{
"epoch": 1.2250639386189257,
"grad_norm": 0.7044934707287243,
"learning_rate": 0.00014999688237667065,
"loss": 1.058,
"step": 479
},
{
"epoch": 1.227621483375959,
"grad_norm": 0.844446069080729,
"learning_rate": 0.00014992758881818722,
"loss": 1.0112,
"step": 480
},
{
"epoch": 1.2301790281329923,
"grad_norm": 0.863795773273135,
"learning_rate": 0.00014985807220237112,
"loss": 1.0223,
"step": 481
},
{
"epoch": 1.2327365728900257,
"grad_norm": 1.1955253111068895,
"learning_rate": 0.00014978833275096872,
"loss": 1.0437,
"step": 482
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.9710436321082059,
"learning_rate": 0.00014971837068643732,
"loss": 1.0331,
"step": 483
},
{
"epoch": 1.237851662404092,
"grad_norm": 0.9838152365395039,
"learning_rate": 0.00014964818623194412,
"loss": 1.0503,
"step": 484
},
{
"epoch": 1.2404092071611252,
"grad_norm": 1.3111101164937617,
"learning_rate": 0.00014957777961136588,
"loss": 1.0536,
"step": 485
},
{
"epoch": 1.2429667519181586,
"grad_norm": 0.9426881648292104,
"learning_rate": 0.00014950715104928794,
"loss": 1.0452,
"step": 486
},
{
"epoch": 1.2455242966751918,
"grad_norm": 0.9708865131907598,
"learning_rate": 0.0001494363007710036,
"loss": 1.0205,
"step": 487
},
{
"epoch": 1.248081841432225,
"grad_norm": 0.735118260321914,
"learning_rate": 0.00014936522900251348,
"loss": 1.0355,
"step": 488
},
{
"epoch": 1.2506393861892584,
"grad_norm": 0.8962772386972064,
"learning_rate": 0.00014929393597052458,
"loss": 1.0455,
"step": 489
},
{
"epoch": 1.2531969309462916,
"grad_norm": 0.6546912235303116,
"learning_rate": 0.00014922242190244981,
"loss": 1.0625,
"step": 490
},
{
"epoch": 1.2557544757033248,
"grad_norm": 0.5383201135001036,
"learning_rate": 0.0001491506870264071,
"loss": 1.0346,
"step": 491
},
{
"epoch": 1.258312020460358,
"grad_norm": 0.8097960021561659,
"learning_rate": 0.00014907873157121875,
"loss": 1.0605,
"step": 492
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.670808763781411,
"learning_rate": 0.00014900655576641057,
"loss": 1.0282,
"step": 493
},
{
"epoch": 1.2634271099744245,
"grad_norm": 0.7979394762122887,
"learning_rate": 0.00014893415984221141,
"loss": 1.0264,
"step": 494
},
{
"epoch": 1.265984654731458,
"grad_norm": 1.026770422301297,
"learning_rate": 0.00014886154402955217,
"loss": 1.0514,
"step": 495
},
{
"epoch": 1.2685421994884911,
"grad_norm": 1.032280976957703,
"learning_rate": 0.00014878870856006513,
"loss": 1.0408,
"step": 496
},
{
"epoch": 1.2710997442455243,
"grad_norm": 1.1296018012465836,
"learning_rate": 0.00014871565366608329,
"loss": 1.0338,
"step": 497
},
{
"epoch": 1.2736572890025575,
"grad_norm": 0.9749313409863054,
"learning_rate": 0.0001486423795806396,
"loss": 1.0193,
"step": 498
},
{
"epoch": 1.2762148337595907,
"grad_norm": 0.8177048634676223,
"learning_rate": 0.00014856888653746607,
"loss": 1.0324,
"step": 499
},
{
"epoch": 1.278772378516624,
"grad_norm": 0.7747012524305006,
"learning_rate": 0.00014849517477099334,
"loss": 1.0076,
"step": 500
},
{
"epoch": 1.2813299232736572,
"grad_norm": 0.8429034680075405,
"learning_rate": 0.00014842124451634956,
"loss": 1.0266,
"step": 501
},
{
"epoch": 1.2838874680306906,
"grad_norm": 1.0704964042478793,
"learning_rate": 0.00014834709600935995,
"loss": 1.033,
"step": 502
},
{
"epoch": 1.2864450127877238,
"grad_norm": 1.1030823411998563,
"learning_rate": 0.00014827272948654584,
"loss": 1.0519,
"step": 503
},
{
"epoch": 1.289002557544757,
"grad_norm": 0.7099638951621647,
"learning_rate": 0.00014819814518512403,
"loss": 1.0258,
"step": 504
},
{
"epoch": 1.2915601023017902,
"grad_norm": 0.5286675820388321,
"learning_rate": 0.000148123343343006,
"loss": 1.0398,
"step": 505
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.5306607233732565,
"learning_rate": 0.0001480483241987971,
"loss": 1.0155,
"step": 506
},
{
"epoch": 1.2966751918158568,
"grad_norm": 0.6060078277369222,
"learning_rate": 0.0001479730879917959,
"loss": 1.0486,
"step": 507
},
{
"epoch": 1.29923273657289,
"grad_norm": 0.8537119327365599,
"learning_rate": 0.00014789763496199335,
"loss": 1.0115,
"step": 508
},
{
"epoch": 1.3017902813299234,
"grad_norm": 1.0701098672995177,
"learning_rate": 0.00014782196535007198,
"loss": 1.0449,
"step": 509
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.0452113870678157,
"learning_rate": 0.00014774607939740524,
"loss": 1.0132,
"step": 510
},
{
"epoch": 1.3069053708439897,
"grad_norm": 1.0085703377598065,
"learning_rate": 0.0001476699773460567,
"loss": 1.0229,
"step": 511
},
{
"epoch": 1.309462915601023,
"grad_norm": 0.8918712650363909,
"learning_rate": 0.00014759365943877906,
"loss": 1.0509,
"step": 512
},
{
"epoch": 1.3120204603580563,
"grad_norm": 0.839691736422046,
"learning_rate": 0.00014751712591901385,
"loss": 1.0078,
"step": 513
},
{
"epoch": 1.3145780051150895,
"grad_norm": 0.7023292683764998,
"learning_rate": 0.00014744037703089014,
"loss": 1.0289,
"step": 514
},
{
"epoch": 1.317135549872123,
"grad_norm": 0.686332323144994,
"learning_rate": 0.00014736341301922406,
"loss": 1.0213,
"step": 515
},
{
"epoch": 1.319693094629156,
"grad_norm": 0.5991056794621004,
"learning_rate": 0.00014728623412951802,
"loss": 1.0164,
"step": 516
},
{
"epoch": 1.3222506393861893,
"grad_norm": 0.7507696949786656,
"learning_rate": 0.00014720884060795975,
"loss": 1.0119,
"step": 517
},
{
"epoch": 1.3248081841432224,
"grad_norm": 0.8658712614342154,
"learning_rate": 0.00014713123270142163,
"loss": 1.0295,
"step": 518
},
{
"epoch": 1.3273657289002558,
"grad_norm": 0.6119299788578647,
"learning_rate": 0.00014705341065745999,
"loss": 1.0197,
"step": 519
},
{
"epoch": 1.329923273657289,
"grad_norm": 0.4927851179899278,
"learning_rate": 0.00014697537472431411,
"loss": 1.0624,
"step": 520
},
{
"epoch": 1.3324808184143222,
"grad_norm": 0.4167468121183674,
"learning_rate": 0.0001468971251509056,
"loss": 1.0647,
"step": 521
},
{
"epoch": 1.3350383631713556,
"grad_norm": 0.47586787480372,
"learning_rate": 0.00014681866218683757,
"loss": 1.0402,
"step": 522
},
{
"epoch": 1.3375959079283888,
"grad_norm": 0.5745122439927115,
"learning_rate": 0.0001467399860823937,
"loss": 1.0304,
"step": 523
},
{
"epoch": 1.340153452685422,
"grad_norm": 0.7552655303578069,
"learning_rate": 0.00014666109708853767,
"loss": 1.0548,
"step": 524
},
{
"epoch": 1.3427109974424551,
"grad_norm": 1.06908823148847,
"learning_rate": 0.00014658199545691222,
"loss": 1.0287,
"step": 525
},
{
"epoch": 1.3452685421994885,
"grad_norm": 1.1444185918054413,
"learning_rate": 0.0001465026814398383,
"loss": 1.0539,
"step": 526
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.7989998085879703,
"learning_rate": 0.00014642315529031442,
"loss": 1.0035,
"step": 527
},
{
"epoch": 1.350383631713555,
"grad_norm": 0.6352155319789643,
"learning_rate": 0.00014634341726201572,
"loss": 1.0659,
"step": 528
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.5614215368601074,
"learning_rate": 0.00014626346760929316,
"loss": 1.0282,
"step": 529
},
{
"epoch": 1.3554987212276215,
"grad_norm": 0.5422618777488837,
"learning_rate": 0.00014618330658717278,
"loss": 1.0002,
"step": 530
},
{
"epoch": 1.3580562659846547,
"grad_norm": 0.4783637133302247,
"learning_rate": 0.00014610293445135492,
"loss": 1.0377,
"step": 531
},
{
"epoch": 1.3606138107416879,
"grad_norm": 0.4390483950197236,
"learning_rate": 0.00014602235145821322,
"loss": 1.023,
"step": 532
},
{
"epoch": 1.3631713554987213,
"grad_norm": 0.4768466306371761,
"learning_rate": 0.00014594155786479398,
"loss": 1.0601,
"step": 533
},
{
"epoch": 1.3657289002557544,
"grad_norm": 0.7582418871164014,
"learning_rate": 0.00014586055392881527,
"loss": 1.0292,
"step": 534
},
{
"epoch": 1.3682864450127878,
"grad_norm": 1.0430189228296438,
"learning_rate": 0.00014577933990866617,
"loss": 1.0397,
"step": 535
},
{
"epoch": 1.370843989769821,
"grad_norm": 1.2646327577842662,
"learning_rate": 0.00014569791606340577,
"loss": 1.0749,
"step": 536
},
{
"epoch": 1.3734015345268542,
"grad_norm": 0.6922891659849906,
"learning_rate": 0.00014561628265276257,
"loss": 1.0293,
"step": 537
},
{
"epoch": 1.3759590792838874,
"grad_norm": 0.44386889614919295,
"learning_rate": 0.00014553443993713355,
"loss": 1.0398,
"step": 538
},
{
"epoch": 1.3785166240409208,
"grad_norm": 0.5439717030086442,
"learning_rate": 0.00014545238817758327,
"loss": 1.0268,
"step": 539
},
{
"epoch": 1.381074168797954,
"grad_norm": 0.8373630963710572,
"learning_rate": 0.00014537012763584316,
"loss": 1.0354,
"step": 540
},
{
"epoch": 1.3836317135549872,
"grad_norm": 1.3266757684220118,
"learning_rate": 0.0001452876585743106,
"loss": 1.0642,
"step": 541
},
{
"epoch": 1.3861892583120206,
"grad_norm": 0.7488029622406787,
"learning_rate": 0.00014520498125604814,
"loss": 1.0534,
"step": 542
},
{
"epoch": 1.3887468030690537,
"grad_norm": 0.7282698103684015,
"learning_rate": 0.00014512209594478263,
"loss": 1.01,
"step": 543
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.7969771518742094,
"learning_rate": 0.00014503900290490436,
"loss": 1.0307,
"step": 544
},
{
"epoch": 1.39386189258312,
"grad_norm": 0.9263524028660353,
"learning_rate": 0.00014495570240146625,
"loss": 1.0211,
"step": 545
},
{
"epoch": 1.3964194373401535,
"grad_norm": 1.1608361715103017,
"learning_rate": 0.000144872194700183,
"loss": 1.0005,
"step": 546
},
{
"epoch": 1.3989769820971867,
"grad_norm": 0.836914057851843,
"learning_rate": 0.00014478848006743022,
"loss": 1.0387,
"step": 547
},
{
"epoch": 1.40153452685422,
"grad_norm": 0.6826412525653701,
"learning_rate": 0.00014470455877024365,
"loss": 1.0292,
"step": 548
},
{
"epoch": 1.4040920716112533,
"grad_norm": 0.48703773893723834,
"learning_rate": 0.00014462043107631818,
"loss": 1.0511,
"step": 549
},
{
"epoch": 1.4066496163682864,
"grad_norm": 0.6223475644721191,
"learning_rate": 0.00014453609725400713,
"loss": 0.9925,
"step": 550
},
{
"epoch": 1.4092071611253196,
"grad_norm": 0.8882232962821335,
"learning_rate": 0.0001444515575723213,
"loss": 1.0061,
"step": 551
},
{
"epoch": 1.4117647058823528,
"grad_norm": 1.1304081971561695,
"learning_rate": 0.00014436681230092815,
"loss": 1.0488,
"step": 552
},
{
"epoch": 1.4143222506393862,
"grad_norm": 0.8848381914341709,
"learning_rate": 0.00014428186171015097,
"loss": 1.0324,
"step": 553
},
{
"epoch": 1.4168797953964194,
"grad_norm": 0.7483522323458203,
"learning_rate": 0.00014419670607096791,
"loss": 1.0422,
"step": 554
},
{
"epoch": 1.4194373401534528,
"grad_norm": 0.7721209602826212,
"learning_rate": 0.00014411134565501133,
"loss": 1.056,
"step": 555
},
{
"epoch": 1.421994884910486,
"grad_norm": 0.8535777213626637,
"learning_rate": 0.00014402578073456661,
"loss": 1.0408,
"step": 556
},
{
"epoch": 1.4245524296675192,
"grad_norm": 0.6959036355749549,
"learning_rate": 0.00014394001158257163,
"loss": 1.0271,
"step": 557
},
{
"epoch": 1.4271099744245523,
"grad_norm": 0.6014343484373971,
"learning_rate": 0.00014385403847261562,
"loss": 1.0193,
"step": 558
},
{
"epoch": 1.4296675191815857,
"grad_norm": 0.7106873814775013,
"learning_rate": 0.00014376786167893846,
"loss": 1.0122,
"step": 559
},
{
"epoch": 1.432225063938619,
"grad_norm": 0.8444210941994957,
"learning_rate": 0.00014368148147642974,
"loss": 1.0045,
"step": 560
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.8805969266684864,
"learning_rate": 0.00014359489814062788,
"loss": 1.0144,
"step": 561
},
{
"epoch": 1.4373401534526855,
"grad_norm": 1.009450224204603,
"learning_rate": 0.00014350811194771928,
"loss": 1.0287,
"step": 562
},
{
"epoch": 1.4398976982097187,
"grad_norm": 1.2351992837125931,
"learning_rate": 0.00014342112317453738,
"loss": 1.0566,
"step": 563
},
{
"epoch": 1.4424552429667519,
"grad_norm": 0.6573457770192163,
"learning_rate": 0.00014333393209856182,
"loss": 1.052,
"step": 564
},
{
"epoch": 1.445012787723785,
"grad_norm": 0.5070847718255479,
"learning_rate": 0.00014324653899791765,
"loss": 1.0608,
"step": 565
},
{
"epoch": 1.4475703324808185,
"grad_norm": 0.6935855951791632,
"learning_rate": 0.00014315894415137416,
"loss": 1.0234,
"step": 566
},
{
"epoch": 1.4501278772378516,
"grad_norm": 0.7956146938043426,
"learning_rate": 0.00014307114783834442,
"loss": 1.0048,
"step": 567
},
{
"epoch": 1.452685421994885,
"grad_norm": 0.9003410836319078,
"learning_rate": 0.0001429831503388839,
"loss": 1.0363,
"step": 568
},
{
"epoch": 1.4552429667519182,
"grad_norm": 1.0643618726104027,
"learning_rate": 0.00014289495193368996,
"loss": 1.0269,
"step": 569
},
{
"epoch": 1.4578005115089514,
"grad_norm": 0.9080907950888324,
"learning_rate": 0.0001428065529041008,
"loss": 1.017,
"step": 570
},
{
"epoch": 1.4603580562659846,
"grad_norm": 0.8536436997073572,
"learning_rate": 0.00014271795353209456,
"loss": 1.0375,
"step": 571
},
{
"epoch": 1.4629156010230178,
"grad_norm": 0.9398461282489688,
"learning_rate": 0.00014262915410028848,
"loss": 1.0434,
"step": 572
},
{
"epoch": 1.4654731457800512,
"grad_norm": 0.9631928132083718,
"learning_rate": 0.00014254015489193782,
"loss": 1.0292,
"step": 573
},
{
"epoch": 1.4680306905370843,
"grad_norm": 0.9076791954370104,
"learning_rate": 0.00014245095619093532,
"loss": 1.0159,
"step": 574
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.9587339014454659,
"learning_rate": 0.00014236155828180983,
"loss": 1.0484,
"step": 575
},
{
"epoch": 1.473145780051151,
"grad_norm": 0.8891566782622077,
"learning_rate": 0.00014227196144972582,
"loss": 1.0508,
"step": 576
},
{
"epoch": 1.4757033248081841,
"grad_norm": 0.6581614104684226,
"learning_rate": 0.0001421821659804822,
"loss": 1.0403,
"step": 577
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.5861192400584929,
"learning_rate": 0.00014209217216051156,
"loss": 1.0304,
"step": 578
},
{
"epoch": 1.4808184143222507,
"grad_norm": 0.5774127863656433,
"learning_rate": 0.00014200198027687912,
"loss": 1.0102,
"step": 579
},
{
"epoch": 1.4833759590792839,
"grad_norm": 0.6502157171768282,
"learning_rate": 0.00014191159061728193,
"loss": 1.0253,
"step": 580
},
{
"epoch": 1.485933503836317,
"grad_norm": 0.5386614139768452,
"learning_rate": 0.00014182100347004793,
"loss": 1.044,
"step": 581
},
{
"epoch": 1.4884910485933505,
"grad_norm": 0.4786011997004328,
"learning_rate": 0.000141730219124135,
"loss": 1.0322,
"step": 582
},
{
"epoch": 1.4910485933503836,
"grad_norm": 0.5755235187273994,
"learning_rate": 0.00014163923786913004,
"loss": 1.0572,
"step": 583
},
{
"epoch": 1.4936061381074168,
"grad_norm": 0.641263771557679,
"learning_rate": 0.00014154805999524802,
"loss": 1.0627,
"step": 584
},
{
"epoch": 1.49616368286445,
"grad_norm": 0.798665776000645,
"learning_rate": 0.0001414566857933312,
"loss": 1.0017,
"step": 585
},
{
"epoch": 1.4987212276214834,
"grad_norm": 0.8759678129527348,
"learning_rate": 0.00014136511555484798,
"loss": 1.0168,
"step": 586
},
{
"epoch": 1.5012787723785166,
"grad_norm": 0.7904395533793586,
"learning_rate": 0.00014127334957189219,
"loss": 1.0253,
"step": 587
},
{
"epoch": 1.50383631713555,
"grad_norm": 0.6451046472087583,
"learning_rate": 0.00014118138813718192,
"loss": 1.0523,
"step": 588
},
{
"epoch": 1.5063938618925832,
"grad_norm": 0.5705461372803496,
"learning_rate": 0.0001410892315440588,
"loss": 0.9921,
"step": 589
},
{
"epoch": 1.5089514066496164,
"grad_norm": 0.6000400371240294,
"learning_rate": 0.00014099688008648703,
"loss": 1.0219,
"step": 590
},
{
"epoch": 1.5115089514066495,
"grad_norm": 0.6112952152068515,
"learning_rate": 0.0001409043340590523,
"loss": 0.9963,
"step": 591
},
{
"epoch": 1.5140664961636827,
"grad_norm": 0.5886324573188866,
"learning_rate": 0.00014081159375696102,
"loss": 1.0484,
"step": 592
},
{
"epoch": 1.5166240409207161,
"grad_norm": 0.5048817308801855,
"learning_rate": 0.00014071865947603922,
"loss": 0.978,
"step": 593
},
{
"epoch": 1.5191815856777495,
"grad_norm": 0.5000111304078102,
"learning_rate": 0.00014062553151273177,
"loss": 1.0431,
"step": 594
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.47701322805085783,
"learning_rate": 0.0001405322101641013,
"loss": 1.0157,
"step": 595
},
{
"epoch": 1.5242966751918159,
"grad_norm": 0.45047959305759844,
"learning_rate": 0.00014043869572782737,
"loss": 1.026,
"step": 596
},
{
"epoch": 1.526854219948849,
"grad_norm": 0.37562193605886857,
"learning_rate": 0.00014034498850220537,
"loss": 1.0334,
"step": 597
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.44055163797782626,
"learning_rate": 0.00014025108878614576,
"loss": 1.0353,
"step": 598
},
{
"epoch": 1.5319693094629157,
"grad_norm": 0.39725606847915634,
"learning_rate": 0.0001401569968791729,
"loss": 1.0115,
"step": 599
},
{
"epoch": 1.5345268542199488,
"grad_norm": 0.39650786805208904,
"learning_rate": 0.00014006271308142433,
"loss": 1.0604,
"step": 600
},
{
"epoch": 1.5370843989769822,
"grad_norm": 0.32569926641458746,
"learning_rate": 0.0001399682376936495,
"loss": 1.0096,
"step": 601
},
{
"epoch": 1.5396419437340154,
"grad_norm": 0.43543100187257516,
"learning_rate": 0.00013987357101720929,
"loss": 1.0059,
"step": 602
},
{
"epoch": 1.5421994884910486,
"grad_norm": 0.458695174168892,
"learning_rate": 0.00013977871335407445,
"loss": 1.0197,
"step": 603
},
{
"epoch": 1.5447570332480818,
"grad_norm": 0.43690410697330667,
"learning_rate": 0.00013968366500682514,
"loss": 1.0302,
"step": 604
},
{
"epoch": 1.547314578005115,
"grad_norm": 0.4143725631119223,
"learning_rate": 0.00013958842627864975,
"loss": 1.0167,
"step": 605
},
{
"epoch": 1.5498721227621484,
"grad_norm": 0.36509470245988934,
"learning_rate": 0.00013949299747334387,
"loss": 0.994,
"step": 606
},
{
"epoch": 1.5524296675191815,
"grad_norm": 0.42997115738098735,
"learning_rate": 0.00013939737889530948,
"loss": 1.0182,
"step": 607
},
{
"epoch": 1.554987212276215,
"grad_norm": 0.519737904298238,
"learning_rate": 0.00013930157084955387,
"loss": 1.0432,
"step": 608
},
{
"epoch": 1.5575447570332481,
"grad_norm": 0.5413718715320616,
"learning_rate": 0.00013920557364168872,
"loss": 1.0392,
"step": 609
},
{
"epoch": 1.5601023017902813,
"grad_norm": 0.4622784565390988,
"learning_rate": 0.00013910938757792911,
"loss": 1.0089,
"step": 610
},
{
"epoch": 1.5626598465473145,
"grad_norm": 0.517572135003303,
"learning_rate": 0.00013901301296509247,
"loss": 1.0433,
"step": 611
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.6472771877158792,
"learning_rate": 0.00013891645011059774,
"loss": 1.033,
"step": 612
},
{
"epoch": 1.567774936061381,
"grad_norm": 0.73777975779115,
"learning_rate": 0.00013881969932246434,
"loss": 1.0233,
"step": 613
},
{
"epoch": 1.5703324808184145,
"grad_norm": 0.6556752106938734,
"learning_rate": 0.00013872276090931112,
"loss": 1.0283,
"step": 614
},
{
"epoch": 1.5728900255754477,
"grad_norm": 0.647001672639268,
"learning_rate": 0.0001386256351803554,
"loss": 1.0449,
"step": 615
},
{
"epoch": 1.5754475703324808,
"grad_norm": 0.755466796600313,
"learning_rate": 0.00013852832244541207,
"loss": 1.0005,
"step": 616
},
{
"epoch": 1.578005115089514,
"grad_norm": 0.9067726592525303,
"learning_rate": 0.00013843082301489247,
"loss": 1.034,
"step": 617
},
{
"epoch": 1.5805626598465472,
"grad_norm": 1.205016289595881,
"learning_rate": 0.00013833313719980358,
"loss": 1.0292,
"step": 618
},
{
"epoch": 1.5831202046035806,
"grad_norm": 0.8478168612376876,
"learning_rate": 0.00013823526531174675,
"loss": 1.0142,
"step": 619
},
{
"epoch": 1.5856777493606138,
"grad_norm": 0.7403592560784086,
"learning_rate": 0.000138137207662917,
"loss": 1.0019,
"step": 620
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.6403376151233803,
"learning_rate": 0.00013803896456610187,
"loss": 1.0308,
"step": 621
},
{
"epoch": 1.5907928388746804,
"grad_norm": 0.712308710605845,
"learning_rate": 0.0001379405363346804,
"loss": 1.0455,
"step": 622
},
{
"epoch": 1.5933503836317136,
"grad_norm": 0.6512025986675177,
"learning_rate": 0.00013784192328262227,
"loss": 1.018,
"step": 623
},
{
"epoch": 1.5959079283887467,
"grad_norm": 0.6467882755688008,
"learning_rate": 0.00013774312572448658,
"loss": 1.0566,
"step": 624
},
{
"epoch": 1.59846547314578,
"grad_norm": 0.7409770827879977,
"learning_rate": 0.00013764414397542113,
"loss": 1.0759,
"step": 625
},
{
"epoch": 1.6010230179028133,
"grad_norm": 0.8147656835217053,
"learning_rate": 0.0001375449783511611,
"loss": 1.0041,
"step": 626
},
{
"epoch": 1.6035805626598465,
"grad_norm": 0.9034624506464588,
"learning_rate": 0.0001374456291680283,
"loss": 1.0141,
"step": 627
},
{
"epoch": 1.60613810741688,
"grad_norm": 1.0050570938199166,
"learning_rate": 0.00013734609674293001,
"loss": 1.0532,
"step": 628
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.9807521253903259,
"learning_rate": 0.00013724638139335808,
"loss": 1.0079,
"step": 629
},
{
"epoch": 1.6112531969309463,
"grad_norm": 1.0251289878636651,
"learning_rate": 0.00013714648343738785,
"loss": 1.014,
"step": 630
},
{
"epoch": 1.6138107416879794,
"grad_norm": 1.1145588268761022,
"learning_rate": 0.00013704640319367706,
"loss": 1.0217,
"step": 631
},
{
"epoch": 1.6163682864450126,
"grad_norm": 0.9024588644594059,
"learning_rate": 0.000136946140981465,
"loss": 1.0151,
"step": 632
},
{
"epoch": 1.618925831202046,
"grad_norm": 0.7164435145214515,
"learning_rate": 0.00013684569712057141,
"loss": 0.9972,
"step": 633
},
{
"epoch": 1.6214833759590794,
"grad_norm": 0.40989603024156007,
"learning_rate": 0.0001367450719313954,
"loss": 1.0438,
"step": 634
},
{
"epoch": 1.6240409207161126,
"grad_norm": 0.4621187072292993,
"learning_rate": 0.00013664426573491454,
"loss": 0.9964,
"step": 635
},
{
"epoch": 1.6265984654731458,
"grad_norm": 0.7796243265332405,
"learning_rate": 0.0001365432788526838,
"loss": 1.0428,
"step": 636
},
{
"epoch": 1.629156010230179,
"grad_norm": 0.9807118313427811,
"learning_rate": 0.0001364421116068344,
"loss": 1.0374,
"step": 637
},
{
"epoch": 1.6317135549872122,
"grad_norm": 1.0521751456854462,
"learning_rate": 0.00013634076432007298,
"loss": 1.022,
"step": 638
},
{
"epoch": 1.6342710997442456,
"grad_norm": 1.014819808376515,
"learning_rate": 0.00013623923731568053,
"loss": 1.0555,
"step": 639
},
{
"epoch": 1.6368286445012787,
"grad_norm": 0.8908217824529507,
"learning_rate": 0.00013613753091751117,
"loss": 0.9896,
"step": 640
},
{
"epoch": 1.6393861892583121,
"grad_norm": 0.7338590542416318,
"learning_rate": 0.00013603564544999134,
"loss": 1.0104,
"step": 641
},
{
"epoch": 1.6419437340153453,
"grad_norm": 0.4947515917010355,
"learning_rate": 0.00013593358123811873,
"loss": 1.013,
"step": 642
},
{
"epoch": 1.6445012787723785,
"grad_norm": 0.3613565103885808,
"learning_rate": 0.00013583133860746102,
"loss": 1.0285,
"step": 643
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.44918465574622884,
"learning_rate": 0.00013572891788415526,
"loss": 1.0735,
"step": 644
},
{
"epoch": 1.6496163682864449,
"grad_norm": 0.6919277753013154,
"learning_rate": 0.00013562631939490638,
"loss": 0.9838,
"step": 645
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.998596135317296,
"learning_rate": 0.00013552354346698644,
"loss": 1.0407,
"step": 646
},
{
"epoch": 1.6547314578005117,
"grad_norm": 1.1274200277350097,
"learning_rate": 0.0001354205904282335,
"loss": 0.9994,
"step": 647
},
{
"epoch": 1.6572890025575449,
"grad_norm": 0.7298162047765786,
"learning_rate": 0.0001353174606070505,
"loss": 1.0158,
"step": 648
},
{
"epoch": 1.659846547314578,
"grad_norm": 0.4959923867676345,
"learning_rate": 0.00013521415433240448,
"loss": 1.0223,
"step": 649
},
{
"epoch": 1.6624040920716112,
"grad_norm": 0.4028073795408234,
"learning_rate": 0.0001351106719338251,
"loss": 1.0048,
"step": 650
},
{
"epoch": 1.6649616368286444,
"grad_norm": 0.4151895967851957,
"learning_rate": 0.000135007013741404,
"loss": 1.031,
"step": 651
},
{
"epoch": 1.6675191815856778,
"grad_norm": 0.493296338959119,
"learning_rate": 0.0001349031800857934,
"loss": 1.0551,
"step": 652
},
{
"epoch": 1.670076726342711,
"grad_norm": 0.5474927271625798,
"learning_rate": 0.00013479917129820547,
"loss": 1.0296,
"step": 653
},
{
"epoch": 1.6726342710997444,
"grad_norm": 0.6314250125042725,
"learning_rate": 0.00013469498771041078,
"loss": 1.0355,
"step": 654
},
{
"epoch": 1.6751918158567776,
"grad_norm": 0.7183033795455095,
"learning_rate": 0.0001345906296547376,
"loss": 1.0239,
"step": 655
},
{
"epoch": 1.6777493606138107,
"grad_norm": 0.6627049343116693,
"learning_rate": 0.00013448609746407076,
"loss": 1.0107,
"step": 656
},
{
"epoch": 1.680306905370844,
"grad_norm": 0.8323267890128159,
"learning_rate": 0.0001343813914718504,
"loss": 1.0132,
"step": 657
},
{
"epoch": 1.682864450127877,
"grad_norm": 1.0100396544553614,
"learning_rate": 0.0001342765120120712,
"loss": 1.034,
"step": 658
},
{
"epoch": 1.6854219948849105,
"grad_norm": 0.9397586944756832,
"learning_rate": 0.0001341714594192811,
"loss": 1.0359,
"step": 659
},
{
"epoch": 1.6879795396419437,
"grad_norm": 0.60948367814948,
"learning_rate": 0.00013406623402858038,
"loss": 1.0515,
"step": 660
},
{
"epoch": 1.690537084398977,
"grad_norm": 0.4064851961480879,
"learning_rate": 0.00013396083617562041,
"loss": 1.0295,
"step": 661
},
{
"epoch": 1.6930946291560103,
"grad_norm": 0.4835321670487211,
"learning_rate": 0.0001338552661966028,
"loss": 1.0218,
"step": 662
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.5087590456762057,
"learning_rate": 0.00013374952442827813,
"loss": 1.0438,
"step": 663
},
{
"epoch": 1.6982097186700766,
"grad_norm": 0.487251739240553,
"learning_rate": 0.00013364361120794495,
"loss": 1.0293,
"step": 664
},
{
"epoch": 1.7007672634271098,
"grad_norm": 0.5712982739684782,
"learning_rate": 0.00013353752687344882,
"loss": 1.0332,
"step": 665
},
{
"epoch": 1.7033248081841432,
"grad_norm": 0.7033661782388088,
"learning_rate": 0.000133431271763181,
"loss": 1.0053,
"step": 666
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.6935444307133046,
"learning_rate": 0.00013332484621607758,
"loss": 1.0262,
"step": 667
},
{
"epoch": 1.7084398976982098,
"grad_norm": 0.7341105705188075,
"learning_rate": 0.00013321825057161825,
"loss": 1.0156,
"step": 668
},
{
"epoch": 1.710997442455243,
"grad_norm": 0.7907280681410083,
"learning_rate": 0.00013311148516982534,
"loss": 1.0413,
"step": 669
},
{
"epoch": 1.7135549872122762,
"grad_norm": 0.7112672488330658,
"learning_rate": 0.00013300455035126268,
"loss": 1.0199,
"step": 670
},
{
"epoch": 1.7161125319693094,
"grad_norm": 0.5766576717286938,
"learning_rate": 0.00013289744645703444,
"loss": 1.0361,
"step": 671
},
{
"epoch": 1.7186700767263428,
"grad_norm": 0.5059688666618373,
"learning_rate": 0.0001327901738287842,
"loss": 1.0385,
"step": 672
},
{
"epoch": 1.721227621483376,
"grad_norm": 0.45263501963427877,
"learning_rate": 0.0001326827328086937,
"loss": 1.0163,
"step": 673
},
{
"epoch": 1.7237851662404093,
"grad_norm": 0.5156404930129397,
"learning_rate": 0.00013257512373948186,
"loss": 1.0592,
"step": 674
},
{
"epoch": 1.7263427109974425,
"grad_norm": 0.6373966994332245,
"learning_rate": 0.00013246734696440368,
"loss": 1.0303,
"step": 675
},
{
"epoch": 1.7289002557544757,
"grad_norm": 0.6497706378399105,
"learning_rate": 0.000132359402827249,
"loss": 0.9963,
"step": 676
},
{
"epoch": 1.7314578005115089,
"grad_norm": 0.6649205635237081,
"learning_rate": 0.0001322512916723417,
"loss": 1.0133,
"step": 677
},
{
"epoch": 1.734015345268542,
"grad_norm": 0.7302337459964975,
"learning_rate": 0.00013214301384453824,
"loss": 1.0143,
"step": 678
},
{
"epoch": 1.7365728900255755,
"grad_norm": 0.7742690150052379,
"learning_rate": 0.00013203456968922684,
"loss": 1.0164,
"step": 679
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.6798309822233196,
"learning_rate": 0.0001319259595523262,
"loss": 1.0172,
"step": 680
},
{
"epoch": 1.741687979539642,
"grad_norm": 0.5208733748449712,
"learning_rate": 0.0001318171837802846,
"loss": 1.0048,
"step": 681
},
{
"epoch": 1.7442455242966752,
"grad_norm": 0.41856841228081965,
"learning_rate": 0.00013170824272007854,
"loss": 1.0508,
"step": 682
},
{
"epoch": 1.7468030690537084,
"grad_norm": 0.41744052183195546,
"learning_rate": 0.00013159913671921184,
"loss": 1.0433,
"step": 683
},
{
"epoch": 1.7493606138107416,
"grad_norm": 0.45034351237029546,
"learning_rate": 0.00013148986612571438,
"loss": 1.0281,
"step": 684
},
{
"epoch": 1.7519181585677748,
"grad_norm": 0.5021896906440644,
"learning_rate": 0.00013138043128814114,
"loss": 1.0207,
"step": 685
},
{
"epoch": 1.7544757033248082,
"grad_norm": 0.6367316434278153,
"learning_rate": 0.000131270832555571,
"loss": 1.0509,
"step": 686
},
{
"epoch": 1.7570332480818416,
"grad_norm": 0.9449450079946309,
"learning_rate": 0.00013116107027760557,
"loss": 1.0263,
"step": 687
},
{
"epoch": 1.7595907928388748,
"grad_norm": 1.2671861813793404,
"learning_rate": 0.00013105114480436823,
"loss": 1.015,
"step": 688
},
{
"epoch": 1.762148337595908,
"grad_norm": 0.6133472053088566,
"learning_rate": 0.00013094105648650285,
"loss": 0.9964,
"step": 689
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.5563333895443464,
"learning_rate": 0.00013083080567517284,
"loss": 1.0221,
"step": 690
},
{
"epoch": 1.7672634271099743,
"grad_norm": 0.8984060988722041,
"learning_rate": 0.0001307203927220598,
"loss": 1.0333,
"step": 691
},
{
"epoch": 1.7698209718670077,
"grad_norm": 1.1600459077736829,
"learning_rate": 0.0001306098179793627,
"loss": 1.0281,
"step": 692
},
{
"epoch": 1.772378516624041,
"grad_norm": 0.8749748158295617,
"learning_rate": 0.00013049908179979644,
"loss": 1.0414,
"step": 693
},
{
"epoch": 1.7749360613810743,
"grad_norm": 0.6456013771393564,
"learning_rate": 0.00013038818453659098,
"loss": 0.9934,
"step": 694
},
{
"epoch": 1.7774936061381075,
"grad_norm": 0.4834000513881869,
"learning_rate": 0.00013027712654349003,
"loss": 1.0077,
"step": 695
},
{
"epoch": 1.7800511508951407,
"grad_norm": 0.46969762642929197,
"learning_rate": 0.0001301659081747501,
"loss": 1.0408,
"step": 696
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.5147779689056563,
"learning_rate": 0.0001300545297851392,
"loss": 1.0186,
"step": 697
},
{
"epoch": 1.785166240409207,
"grad_norm": 0.55729153001615,
"learning_rate": 0.0001299429917299358,
"loss": 1.0329,
"step": 698
},
{
"epoch": 1.7877237851662404,
"grad_norm": 0.5260414108398854,
"learning_rate": 0.00012983129436492763,
"loss": 1.0233,
"step": 699
},
{
"epoch": 1.7902813299232738,
"grad_norm": 0.5427361149590243,
"learning_rate": 0.00012971943804641068,
"loss": 1.0409,
"step": 700
},
{
"epoch": 1.792838874680307,
"grad_norm": 0.5405520825559765,
"learning_rate": 0.0001296074231311879,
"loss": 1.0066,
"step": 701
},
{
"epoch": 1.7953964194373402,
"grad_norm": 0.6297890907155308,
"learning_rate": 0.0001294952499765682,
"loss": 1.0254,
"step": 702
},
{
"epoch": 1.7979539641943734,
"grad_norm": 0.6644546067252105,
"learning_rate": 0.00012938291894036522,
"loss": 1.0285,
"step": 703
},
{
"epoch": 1.8005115089514065,
"grad_norm": 0.683427488866508,
"learning_rate": 0.00012927043038089616,
"loss": 1.0091,
"step": 704
},
{
"epoch": 1.80306905370844,
"grad_norm": 0.6319295334248269,
"learning_rate": 0.00012915778465698077,
"loss": 1.0397,
"step": 705
},
{
"epoch": 1.8056265984654731,
"grad_norm": 0.5438735087695892,
"learning_rate": 0.00012904498212794007,
"loss": 0.991,
"step": 706
},
{
"epoch": 1.8081841432225065,
"grad_norm": 0.5047705166677889,
"learning_rate": 0.00012893202315359537,
"loss": 0.9944,
"step": 707
},
{
"epoch": 1.8107416879795397,
"grad_norm": 0.5361496724146492,
"learning_rate": 0.00012881890809426688,
"loss": 1.0212,
"step": 708
},
{
"epoch": 1.813299232736573,
"grad_norm": 0.4758891777297796,
"learning_rate": 0.00012870563731077277,
"loss": 0.9717,
"step": 709
},
{
"epoch": 1.815856777493606,
"grad_norm": 0.41562952895729655,
"learning_rate": 0.0001285922111644279,
"loss": 1.0162,
"step": 710
},
{
"epoch": 1.8184143222506393,
"grad_norm": 0.4923656957788762,
"learning_rate": 0.00012847863001704278,
"loss": 1.0685,
"step": 711
},
{
"epoch": 1.8209718670076727,
"grad_norm": 0.43817036243213936,
"learning_rate": 0.00012836489423092225,
"loss": 1.0166,
"step": 712
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.36194875273904087,
"learning_rate": 0.00012825100416886454,
"loss": 1.0255,
"step": 713
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.5507986270387409,
"learning_rate": 0.0001281369601941599,
"loss": 1.0135,
"step": 714
},
{
"epoch": 1.8286445012787724,
"grad_norm": 0.685338916623197,
"learning_rate": 0.00012802276267058957,
"loss": 0.999,
"step": 715
},
{
"epoch": 1.8312020460358056,
"grad_norm": 0.5568312967518175,
"learning_rate": 0.00012790841196242458,
"loss": 1.0153,
"step": 716
},
{
"epoch": 1.8337595907928388,
"grad_norm": 0.4401729278401454,
"learning_rate": 0.00012779390843442462,
"loss": 0.9855,
"step": 717
},
{
"epoch": 1.836317135549872,
"grad_norm": 0.4249893778808539,
"learning_rate": 0.00012767925245183676,
"loss": 1.0351,
"step": 718
},
{
"epoch": 1.8388746803069054,
"grad_norm": 0.47539299147834413,
"learning_rate": 0.00012756444438039453,
"loss": 1.035,
"step": 719
},
{
"epoch": 1.8414322250639388,
"grad_norm": 0.5475741371560751,
"learning_rate": 0.00012744948458631646,
"loss": 1.0412,
"step": 720
},
{
"epoch": 1.843989769820972,
"grad_norm": 0.5751955332609484,
"learning_rate": 0.0001273343734363051,
"loss": 1.0419,
"step": 721
},
{
"epoch": 1.8465473145780051,
"grad_norm": 0.5673429560849089,
"learning_rate": 0.00012721911129754578,
"loss": 0.9993,
"step": 722
},
{
"epoch": 1.8491048593350383,
"grad_norm": 0.475786389030356,
"learning_rate": 0.0001271036985377055,
"loss": 1.0255,
"step": 723
},
{
"epoch": 1.8516624040920715,
"grad_norm": 0.4435215042959613,
"learning_rate": 0.00012698813552493174,
"loss": 1.0159,
"step": 724
},
{
"epoch": 1.854219948849105,
"grad_norm": 0.6384652673350472,
"learning_rate": 0.00012687242262785116,
"loss": 1.0468,
"step": 725
},
{
"epoch": 1.856777493606138,
"grad_norm": 0.660707948092585,
"learning_rate": 0.00012675656021556855,
"loss": 0.9702,
"step": 726
},
{
"epoch": 1.8593350383631715,
"grad_norm": 0.5190779530078301,
"learning_rate": 0.00012664054865766573,
"loss": 0.9959,
"step": 727
},
{
"epoch": 1.8618925831202047,
"grad_norm": 0.59002541889049,
"learning_rate": 0.00012652438832420017,
"loss": 1.0009,
"step": 728
},
{
"epoch": 1.8644501278772379,
"grad_norm": 0.724406502768554,
"learning_rate": 0.00012640807958570394,
"loss": 1.0572,
"step": 729
},
{
"epoch": 1.867007672634271,
"grad_norm": 0.606082979636232,
"learning_rate": 0.00012629162281318248,
"loss": 1.0123,
"step": 730
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.3890444487309348,
"learning_rate": 0.00012617501837811347,
"loss": 0.9835,
"step": 731
},
{
"epoch": 1.8721227621483376,
"grad_norm": 0.4748189131220067,
"learning_rate": 0.00012605826665244559,
"loss": 1.0206,
"step": 732
},
{
"epoch": 1.8746803069053708,
"grad_norm": 0.5894024279814004,
"learning_rate": 0.00012594136800859733,
"loss": 1.0312,
"step": 733
},
{
"epoch": 1.8772378516624042,
"grad_norm": 0.8812294314944346,
"learning_rate": 0.00012582432281945587,
"loss": 0.9929,
"step": 734
},
{
"epoch": 1.8797953964194374,
"grad_norm": 1.2695722544281176,
"learning_rate": 0.0001257071314583758,
"loss": 1.0232,
"step": 735
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.7877721338048511,
"learning_rate": 0.00012558979429917803,
"loss": 1.0528,
"step": 736
},
{
"epoch": 1.8849104859335037,
"grad_norm": 0.6479567586178989,
"learning_rate": 0.00012547231171614845,
"loss": 1.0262,
"step": 737
},
{
"epoch": 1.887468030690537,
"grad_norm": 0.6844520570754378,
"learning_rate": 0.00012535468408403697,
"loss": 1.0333,
"step": 738
},
{
"epoch": 1.8900255754475703,
"grad_norm": 0.6085957966970293,
"learning_rate": 0.00012523691177805597,
"loss": 1.0168,
"step": 739
},
{
"epoch": 1.8925831202046037,
"grad_norm": 0.5254572324853038,
"learning_rate": 0.00012511899517387955,
"loss": 0.9883,
"step": 740
},
{
"epoch": 1.895140664961637,
"grad_norm": 0.6139364866532532,
"learning_rate": 0.00012500093464764197,
"loss": 0.9977,
"step": 741
},
{
"epoch": 1.89769820971867,
"grad_norm": 0.6998963267481692,
"learning_rate": 0.00012488273057593654,
"loss": 1.0044,
"step": 742
},
{
"epoch": 1.9002557544757033,
"grad_norm": 0.5270554785542413,
"learning_rate": 0.00012476438333581456,
"loss": 1.0412,
"step": 743
},
{
"epoch": 1.9028132992327365,
"grad_norm": 0.5157043265448235,
"learning_rate": 0.00012464589330478398,
"loss": 0.9978,
"step": 744
},
{
"epoch": 1.9053708439897699,
"grad_norm": 0.5631065206891138,
"learning_rate": 0.0001245272608608082,
"loss": 0.9944,
"step": 745
},
{
"epoch": 1.907928388746803,
"grad_norm": 0.4807212257749526,
"learning_rate": 0.00012440848638230485,
"loss": 1.0184,
"step": 746
},
{
"epoch": 1.9104859335038364,
"grad_norm": 0.42670701279562534,
"learning_rate": 0.00012428957024814477,
"loss": 1.0105,
"step": 747
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.41188284810782877,
"learning_rate": 0.00012417051283765055,
"loss": 1.0256,
"step": 748
},
{
"epoch": 1.9156010230179028,
"grad_norm": 0.39912216267661754,
"learning_rate": 0.0001240513145305954,
"loss": 1.0479,
"step": 749
},
{
"epoch": 1.918158567774936,
"grad_norm": 0.40181896505552256,
"learning_rate": 0.00012393197570720208,
"loss": 1.0006,
"step": 750
},
{
"epoch": 1.9207161125319692,
"grad_norm": 0.4686514718132313,
"learning_rate": 0.0001238124967481415,
"loss": 1.0527,
"step": 751
},
{
"epoch": 1.9232736572890026,
"grad_norm": 0.4847458570755899,
"learning_rate": 0.00012369287803453156,
"loss": 1.0039,
"step": 752
},
{
"epoch": 1.9258312020460358,
"grad_norm": 0.5873940841619928,
"learning_rate": 0.00012357311994793603,
"loss": 1.0191,
"step": 753
},
{
"epoch": 1.9283887468030692,
"grad_norm": 0.6710549953392281,
"learning_rate": 0.00012345322287036315,
"loss": 1.014,
"step": 754
},
{
"epoch": 1.9309462915601023,
"grad_norm": 0.7897611598340533,
"learning_rate": 0.0001233331871842646,
"loss": 0.9853,
"step": 755
},
{
"epoch": 1.9335038363171355,
"grad_norm": 0.870069888372245,
"learning_rate": 0.0001232130132725342,
"loss": 1.022,
"step": 756
},
{
"epoch": 1.9360613810741687,
"grad_norm": 1.0698935466826593,
"learning_rate": 0.00012309270151850666,
"loss": 1.0199,
"step": 757
},
{
"epoch": 1.938618925831202,
"grad_norm": 1.0318153691478889,
"learning_rate": 0.00012297225230595637,
"loss": 1.0008,
"step": 758
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.8031059628622865,
"learning_rate": 0.0001228516660190962,
"loss": 1.0464,
"step": 759
},
{
"epoch": 1.9437340153452687,
"grad_norm": 0.4432470641559668,
"learning_rate": 0.00012273094304257633,
"loss": 1.0486,
"step": 760
},
{
"epoch": 1.9462915601023019,
"grad_norm": 0.4413834236432169,
"learning_rate": 0.00012261008376148282,
"loss": 1.0483,
"step": 761
},
{
"epoch": 1.948849104859335,
"grad_norm": 0.5753204802658383,
"learning_rate": 0.0001224890885613366,
"loss": 1.026,
"step": 762
},
{
"epoch": 1.9514066496163682,
"grad_norm": 0.6330964706251369,
"learning_rate": 0.00012236795782809225,
"loss": 1.017,
"step": 763
},
{
"epoch": 1.9539641943734014,
"grad_norm": 0.6869010778127252,
"learning_rate": 0.00012224669194813647,
"loss": 1.031,
"step": 764
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.7455335150670086,
"learning_rate": 0.00012212529130828725,
"loss": 0.9639,
"step": 765
},
{
"epoch": 1.959079283887468,
"grad_norm": 0.6598851148094896,
"learning_rate": 0.00012200375629579234,
"loss": 1.0298,
"step": 766
},
{
"epoch": 1.9616368286445014,
"grad_norm": 0.44847708135640946,
"learning_rate": 0.0001218820872983281,
"loss": 0.9979,
"step": 767
},
{
"epoch": 1.9641943734015346,
"grad_norm": 0.4421542384496395,
"learning_rate": 0.00012176028470399836,
"loss": 1.0219,
"step": 768
},
{
"epoch": 1.9667519181585678,
"grad_norm": 0.5551681283301225,
"learning_rate": 0.00012163834890133303,
"loss": 1.0321,
"step": 769
},
{
"epoch": 1.969309462915601,
"grad_norm": 0.5433680138372817,
"learning_rate": 0.000121516280279287,
"loss": 1.0152,
"step": 770
},
{
"epoch": 1.9718670076726341,
"grad_norm": 0.3927534411279976,
"learning_rate": 0.00012139407922723875,
"loss": 1.0056,
"step": 771
},
{
"epoch": 1.9744245524296675,
"grad_norm": 0.3504638375301521,
"learning_rate": 0.00012127174613498925,
"loss": 1.0211,
"step": 772
},
{
"epoch": 1.976982097186701,
"grad_norm": 0.5235226714465111,
"learning_rate": 0.00012114928139276064,
"loss": 1.0298,
"step": 773
},
{
"epoch": 1.979539641943734,
"grad_norm": 0.47218634270204046,
"learning_rate": 0.00012102668539119501,
"loss": 0.997,
"step": 774
},
{
"epoch": 1.9820971867007673,
"grad_norm": 0.3909468495312419,
"learning_rate": 0.00012090395852135314,
"loss": 1.008,
"step": 775
},
{
"epoch": 1.9846547314578005,
"grad_norm": 0.3354579546285365,
"learning_rate": 0.0001207811011747132,
"loss": 1.0247,
"step": 776
},
{
"epoch": 1.9872122762148337,
"grad_norm": 0.3467079716757078,
"learning_rate": 0.00012065811374316966,
"loss": 1.0049,
"step": 777
},
{
"epoch": 1.989769820971867,
"grad_norm": 0.3407603167118022,
"learning_rate": 0.0001205349966190319,
"loss": 1.0454,
"step": 778
},
{
"epoch": 1.9923273657289002,
"grad_norm": 0.3172074392515775,
"learning_rate": 0.00012041175019502295,
"loss": 1.0269,
"step": 779
},
{
"epoch": 1.9948849104859336,
"grad_norm": 0.38289682905322714,
"learning_rate": 0.00012028837486427837,
"loss": 1.0085,
"step": 780
},
{
"epoch": 1.9974424552429668,
"grad_norm": 0.3409699287203162,
"learning_rate": 0.00012016487102034482,
"loss": 1.0151,
"step": 781
},
{
"epoch": 2.0,
"grad_norm": 0.4841721621140613,
"learning_rate": 0.00012004123905717898,
"loss": 0.9888,
"step": 782
},
{
"epoch": 2.002557544757033,
"grad_norm": 0.5947034995797379,
"learning_rate": 0.00011991747936914614,
"loss": 0.98,
"step": 783
},
{
"epoch": 2.0051150895140664,
"grad_norm": 0.5314717777356649,
"learning_rate": 0.00011979359235101906,
"loss": 0.966,
"step": 784
},
{
"epoch": 2.0076726342710995,
"grad_norm": 0.4148615363763489,
"learning_rate": 0.00011966957839797664,
"loss": 0.9695,
"step": 785
},
{
"epoch": 2.010230179028133,
"grad_norm": 0.4001599305252567,
"learning_rate": 0.00011954543790560267,
"loss": 1.0493,
"step": 786
},
{
"epoch": 2.0127877237851663,
"grad_norm": 0.43752065357850173,
"learning_rate": 0.00011942117126988461,
"loss": 0.9883,
"step": 787
},
{
"epoch": 2.0153452685421995,
"grad_norm": 0.5092717368916159,
"learning_rate": 0.00011929677888721227,
"loss": 0.9984,
"step": 788
},
{
"epoch": 2.0179028132992327,
"grad_norm": 0.5840375290444557,
"learning_rate": 0.00011917226115437656,
"loss": 0.9833,
"step": 789
},
{
"epoch": 2.020460358056266,
"grad_norm": 0.573138093028074,
"learning_rate": 0.00011904761846856831,
"loss": 0.9724,
"step": 790
},
{
"epoch": 2.023017902813299,
"grad_norm": 0.5890770850578259,
"learning_rate": 0.00011892285122737683,
"loss": 0.9699,
"step": 791
},
{
"epoch": 2.0255754475703327,
"grad_norm": 0.5692021165096304,
"learning_rate": 0.00011879795982878883,
"loss": 0.9741,
"step": 792
},
{
"epoch": 2.028132992327366,
"grad_norm": 0.6399550167383995,
"learning_rate": 0.00011867294467118698,
"loss": 0.9682,
"step": 793
},
{
"epoch": 2.030690537084399,
"grad_norm": 0.7338640869363395,
"learning_rate": 0.00011854780615334875,
"loss": 0.9683,
"step": 794
},
{
"epoch": 2.0332480818414322,
"grad_norm": 0.806906500405086,
"learning_rate": 0.00011842254467444517,
"loss": 0.9756,
"step": 795
},
{
"epoch": 2.0358056265984654,
"grad_norm": 0.7925351913713344,
"learning_rate": 0.0001182971606340394,
"loss": 0.9853,
"step": 796
},
{
"epoch": 2.0383631713554986,
"grad_norm": 0.6258347835444797,
"learning_rate": 0.00011817165443208562,
"loss": 1.0054,
"step": 797
},
{
"epoch": 2.040920716112532,
"grad_norm": 0.4512585898690294,
"learning_rate": 0.00011804602646892762,
"loss": 0.9792,
"step": 798
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.3681772077619349,
"learning_rate": 0.00011792027714529767,
"loss": 0.9788,
"step": 799
},
{
"epoch": 2.0460358056265986,
"grad_norm": 0.4769785686846811,
"learning_rate": 0.0001177944068623151,
"loss": 1.023,
"step": 800
},
{
"epoch": 2.0485933503836318,
"grad_norm": 0.5513670753501893,
"learning_rate": 0.00011766841602148507,
"loss": 0.9758,
"step": 801
},
{
"epoch": 2.051150895140665,
"grad_norm": 0.5343242524485008,
"learning_rate": 0.00011754230502469739,
"loss": 0.9828,
"step": 802
},
{
"epoch": 2.053708439897698,
"grad_norm": 0.3790786798266737,
"learning_rate": 0.00011741607427422502,
"loss": 0.9891,
"step": 803
},
{
"epoch": 2.0562659846547313,
"grad_norm": 0.3356594047836669,
"learning_rate": 0.000117289724172723,
"loss": 1.0182,
"step": 804
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.4979916614188739,
"learning_rate": 0.00011716325512322707,
"loss": 0.9653,
"step": 805
},
{
"epoch": 2.061381074168798,
"grad_norm": 0.5917115439040083,
"learning_rate": 0.00011703666752915235,
"loss": 0.9779,
"step": 806
},
{
"epoch": 2.0639386189258313,
"grad_norm": 0.7711282568070231,
"learning_rate": 0.00011690996179429219,
"loss": 1.0192,
"step": 807
},
{
"epoch": 2.0664961636828645,
"grad_norm": 0.9738458712850159,
"learning_rate": 0.00011678313832281664,
"loss": 0.9929,
"step": 808
},
{
"epoch": 2.0690537084398977,
"grad_norm": 1.0543246508556696,
"learning_rate": 0.00011665619751927146,
"loss": 0.9711,
"step": 809
},
{
"epoch": 2.071611253196931,
"grad_norm": 0.7273546848221022,
"learning_rate": 0.00011652913978857664,
"loss": 0.9732,
"step": 810
},
{
"epoch": 2.074168797953964,
"grad_norm": 0.5119256334998138,
"learning_rate": 0.00011640196553602505,
"loss": 0.9955,
"step": 811
},
{
"epoch": 2.0767263427109977,
"grad_norm": 0.36268273560962566,
"learning_rate": 0.00011627467516728138,
"loss": 0.9706,
"step": 812
},
{
"epoch": 2.079283887468031,
"grad_norm": 0.40355937427082544,
"learning_rate": 0.00011614726908838063,
"loss": 0.9712,
"step": 813
},
{
"epoch": 2.081841432225064,
"grad_norm": 0.5018343946579583,
"learning_rate": 0.00011601974770572692,
"loss": 1.0314,
"step": 814
},
{
"epoch": 2.084398976982097,
"grad_norm": 0.49570234160885446,
"learning_rate": 0.0001158921114260922,
"loss": 0.961,
"step": 815
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.5836483164644858,
"learning_rate": 0.00011576436065661484,
"loss": 0.9732,
"step": 816
},
{
"epoch": 2.0895140664961636,
"grad_norm": 0.562651886144191,
"learning_rate": 0.00011563649580479848,
"loss": 0.9827,
"step": 817
},
{
"epoch": 2.0920716112531967,
"grad_norm": 0.3634053027085326,
"learning_rate": 0.00011550851727851067,
"loss": 0.9634,
"step": 818
},
{
"epoch": 2.0946291560102304,
"grad_norm": 0.35421206748470696,
"learning_rate": 0.00011538042548598154,
"loss": 0.9674,
"step": 819
},
{
"epoch": 2.0971867007672635,
"grad_norm": 0.34410099266933664,
"learning_rate": 0.00011525222083580247,
"loss": 0.9682,
"step": 820
},
{
"epoch": 2.0997442455242967,
"grad_norm": 0.36019738429870557,
"learning_rate": 0.00011512390373692495,
"loss": 0.98,
"step": 821
},
{
"epoch": 2.10230179028133,
"grad_norm": 0.4497160405180852,
"learning_rate": 0.00011499547459865908,
"loss": 0.9658,
"step": 822
},
{
"epoch": 2.104859335038363,
"grad_norm": 0.48924052145081715,
"learning_rate": 0.00011486693383067234,
"loss": 0.9961,
"step": 823
},
{
"epoch": 2.1074168797953963,
"grad_norm": 0.51728675513698,
"learning_rate": 0.0001147382818429884,
"loss": 0.9886,
"step": 824
},
{
"epoch": 2.10997442455243,
"grad_norm": 0.48298534091718054,
"learning_rate": 0.0001146095190459855,
"loss": 0.99,
"step": 825
},
{
"epoch": 2.112531969309463,
"grad_norm": 0.3873329201691133,
"learning_rate": 0.00011448064585039555,
"loss": 0.9855,
"step": 826
},
{
"epoch": 2.1150895140664963,
"grad_norm": 0.36617676835976043,
"learning_rate": 0.0001143516626673025,
"loss": 0.9784,
"step": 827
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.39303542839485295,
"learning_rate": 0.00011422256990814115,
"loss": 0.9884,
"step": 828
},
{
"epoch": 2.1202046035805626,
"grad_norm": 0.5159106405133932,
"learning_rate": 0.0001140933679846959,
"loss": 0.9926,
"step": 829
},
{
"epoch": 2.122762148337596,
"grad_norm": 0.7469560811887815,
"learning_rate": 0.00011396405730909925,
"loss": 1.0183,
"step": 830
},
{
"epoch": 2.125319693094629,
"grad_norm": 0.7327464479712988,
"learning_rate": 0.00011383463829383071,
"loss": 1.0098,
"step": 831
},
{
"epoch": 2.1278772378516626,
"grad_norm": 0.5977082749289835,
"learning_rate": 0.00011370511135171532,
"loss": 1.0071,
"step": 832
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.4052295767189102,
"learning_rate": 0.00011357547689592237,
"loss": 1.0049,
"step": 833
},
{
"epoch": 2.132992327365729,
"grad_norm": 0.5292207555015371,
"learning_rate": 0.00011344573533996417,
"loss": 0.9656,
"step": 834
},
{
"epoch": 2.135549872122762,
"grad_norm": 0.4549224765225602,
"learning_rate": 0.0001133158870976946,
"loss": 0.9968,
"step": 835
},
{
"epoch": 2.1381074168797953,
"grad_norm": 0.4460508304219039,
"learning_rate": 0.00011318593258330785,
"loss": 1.0134,
"step": 836
},
{
"epoch": 2.1406649616368285,
"grad_norm": 0.46592246024671363,
"learning_rate": 0.00011305587221133718,
"loss": 0.9522,
"step": 837
},
{
"epoch": 2.1432225063938617,
"grad_norm": 0.4489945484428353,
"learning_rate": 0.00011292570639665342,
"loss": 1.0104,
"step": 838
},
{
"epoch": 2.1457800511508953,
"grad_norm": 0.46784938019320965,
"learning_rate": 0.00011279543555446379,
"loss": 0.988,
"step": 839
},
{
"epoch": 2.1483375959079285,
"grad_norm": 0.4200222134898951,
"learning_rate": 0.00011266506010031052,
"loss": 1.0119,
"step": 840
},
{
"epoch": 2.1508951406649617,
"grad_norm": 0.3655050664603677,
"learning_rate": 0.00011253458045006955,
"loss": 0.9895,
"step": 841
},
{
"epoch": 2.153452685421995,
"grad_norm": 0.3022642865356664,
"learning_rate": 0.00011240399701994919,
"loss": 1.001,
"step": 842
},
{
"epoch": 2.156010230179028,
"grad_norm": 0.3188747440198214,
"learning_rate": 0.00011227331022648877,
"loss": 0.9773,
"step": 843
},
{
"epoch": 2.1585677749360612,
"grad_norm": 0.41190200456297044,
"learning_rate": 0.00011214252048655733,
"loss": 1.024,
"step": 844
},
{
"epoch": 2.1611253196930944,
"grad_norm": 0.33803198230453474,
"learning_rate": 0.00011201162821735228,
"loss": 0.9843,
"step": 845
},
{
"epoch": 2.163682864450128,
"grad_norm": 0.36583158073668925,
"learning_rate": 0.00011188063383639817,
"loss": 0.9809,
"step": 846
},
{
"epoch": 2.166240409207161,
"grad_norm": 0.39675634848639996,
"learning_rate": 0.00011174953776154516,
"loss": 0.942,
"step": 847
},
{
"epoch": 2.1687979539641944,
"grad_norm": 0.4164372273567332,
"learning_rate": 0.00011161834041096782,
"loss": 1.0337,
"step": 848
},
{
"epoch": 2.1713554987212276,
"grad_norm": 0.42306948681428896,
"learning_rate": 0.00011148704220316387,
"loss": 0.9913,
"step": 849
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.374454297267049,
"learning_rate": 0.0001113556435569526,
"loss": 0.9928,
"step": 850
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.31767286286037444,
"learning_rate": 0.00011122414489147376,
"loss": 0.9972,
"step": 851
},
{
"epoch": 2.1790281329923276,
"grad_norm": 0.36673595005863613,
"learning_rate": 0.00011109254662618616,
"loss": 1.0105,
"step": 852
},
{
"epoch": 2.1815856777493607,
"grad_norm": 0.5025085408193712,
"learning_rate": 0.00011096084918086626,
"loss": 0.9508,
"step": 853
},
{
"epoch": 2.184143222506394,
"grad_norm": 0.5453118752197188,
"learning_rate": 0.00011082905297560697,
"loss": 0.9354,
"step": 854
},
{
"epoch": 2.186700767263427,
"grad_norm": 0.535508310533172,
"learning_rate": 0.00011069715843081613,
"loss": 0.986,
"step": 855
},
{
"epoch": 2.1892583120204603,
"grad_norm": 0.5550105153386212,
"learning_rate": 0.00011056516596721534,
"loss": 1.0047,
"step": 856
},
{
"epoch": 2.1918158567774935,
"grad_norm": 0.5522958050937595,
"learning_rate": 0.00011043307600583854,
"loss": 1.0204,
"step": 857
},
{
"epoch": 2.1943734015345266,
"grad_norm": 0.514732209947304,
"learning_rate": 0.0001103008889680306,
"loss": 1.0137,
"step": 858
},
{
"epoch": 2.1969309462915603,
"grad_norm": 0.5281211410564769,
"learning_rate": 0.00011016860527544616,
"loss": 1.0085,
"step": 859
},
{
"epoch": 2.1994884910485935,
"grad_norm": 0.46959816689384604,
"learning_rate": 0.00011003622535004806,
"loss": 1.0058,
"step": 860
},
{
"epoch": 2.2020460358056266,
"grad_norm": 0.3407338275520536,
"learning_rate": 0.0001099037496141062,
"loss": 0.9986,
"step": 861
},
{
"epoch": 2.20460358056266,
"grad_norm": 0.47884582066611536,
"learning_rate": 0.00010977117849019604,
"loss": 0.9707,
"step": 862
},
{
"epoch": 2.207161125319693,
"grad_norm": 0.6169099163617163,
"learning_rate": 0.00010963851240119731,
"loss": 0.9957,
"step": 863
},
{
"epoch": 2.209718670076726,
"grad_norm": 0.5842777084702644,
"learning_rate": 0.00010950575177029271,
"loss": 0.9971,
"step": 864
},
{
"epoch": 2.21227621483376,
"grad_norm": 0.5415512252484223,
"learning_rate": 0.00010937289702096648,
"loss": 0.955,
"step": 865
},
{
"epoch": 2.214833759590793,
"grad_norm": 0.5584987591506012,
"learning_rate": 0.00010923994857700308,
"loss": 0.9858,
"step": 866
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.5438681169787357,
"learning_rate": 0.00010910690686248587,
"loss": 1.0272,
"step": 867
},
{
"epoch": 2.2199488491048593,
"grad_norm": 0.45923876211266634,
"learning_rate": 0.00010897377230179568,
"loss": 0.9689,
"step": 868
},
{
"epoch": 2.2225063938618925,
"grad_norm": 0.344989298275585,
"learning_rate": 0.00010884054531960956,
"loss": 1.005,
"step": 869
},
{
"epoch": 2.2250639386189257,
"grad_norm": 0.3203832886307522,
"learning_rate": 0.00010870722634089927,
"loss": 0.9904,
"step": 870
},
{
"epoch": 2.227621483375959,
"grad_norm": 0.4050058894119621,
"learning_rate": 0.0001085738157909302,
"loss": 0.9716,
"step": 871
},
{
"epoch": 2.2301790281329925,
"grad_norm": 0.5042105083367587,
"learning_rate": 0.00010844031409525962,
"loss": 0.9921,
"step": 872
},
{
"epoch": 2.2327365728900257,
"grad_norm": 0.5771976233792036,
"learning_rate": 0.00010830672167973572,
"loss": 1.0081,
"step": 873
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.6444239077326948,
"learning_rate": 0.00010817303897049597,
"loss": 0.9961,
"step": 874
},
{
"epoch": 2.237851662404092,
"grad_norm": 0.6303091061510789,
"learning_rate": 0.0001080392663939659,
"loss": 0.9648,
"step": 875
},
{
"epoch": 2.2404092071611252,
"grad_norm": 0.5383211537711221,
"learning_rate": 0.00010790540437685771,
"loss": 0.9835,
"step": 876
},
{
"epoch": 2.2429667519181584,
"grad_norm": 0.4021404516007495,
"learning_rate": 0.00010777145334616884,
"loss": 0.9732,
"step": 877
},
{
"epoch": 2.2455242966751916,
"grad_norm": 0.31439318271272565,
"learning_rate": 0.00010763741372918076,
"loss": 0.9799,
"step": 878
},
{
"epoch": 2.2480818414322252,
"grad_norm": 0.4404091457741591,
"learning_rate": 0.00010750328595345744,
"loss": 0.9798,
"step": 879
},
{
"epoch": 2.2506393861892584,
"grad_norm": 0.5676899676174939,
"learning_rate": 0.00010736907044684409,
"loss": 0.956,
"step": 880
},
{
"epoch": 2.2531969309462916,
"grad_norm": 0.6251515987816799,
"learning_rate": 0.00010723476763746578,
"loss": 0.9766,
"step": 881
},
{
"epoch": 2.2557544757033248,
"grad_norm": 0.6188152066667294,
"learning_rate": 0.00010710037795372604,
"loss": 0.9436,
"step": 882
},
{
"epoch": 2.258312020460358,
"grad_norm": 0.561619175816319,
"learning_rate": 0.00010696590182430552,
"loss": 0.9829,
"step": 883
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.42915411587906266,
"learning_rate": 0.00010683133967816062,
"loss": 0.9776,
"step": 884
},
{
"epoch": 2.2634271099744243,
"grad_norm": 0.3524127037006637,
"learning_rate": 0.00010669669194452213,
"loss": 0.9966,
"step": 885
},
{
"epoch": 2.265984654731458,
"grad_norm": 0.3537805903644639,
"learning_rate": 0.00010656195905289382,
"loss": 1.0042,
"step": 886
},
{
"epoch": 2.268542199488491,
"grad_norm": 0.38907067845530163,
"learning_rate": 0.00010642714143305115,
"loss": 0.9591,
"step": 887
},
{
"epoch": 2.2710997442455243,
"grad_norm": 0.4388187336605131,
"learning_rate": 0.00010629223951503975,
"loss": 0.9657,
"step": 888
},
{
"epoch": 2.2736572890025575,
"grad_norm": 0.5259226887120563,
"learning_rate": 0.00010615725372917429,
"loss": 0.9902,
"step": 889
},
{
"epoch": 2.2762148337595907,
"grad_norm": 0.5228861897572435,
"learning_rate": 0.00010602218450603687,
"loss": 1.0222,
"step": 890
},
{
"epoch": 2.2787723785166243,
"grad_norm": 0.5036534202887699,
"learning_rate": 0.00010588703227647573,
"loss": 1.0003,
"step": 891
},
{
"epoch": 2.2813299232736575,
"grad_norm": 0.3581923819862395,
"learning_rate": 0.00010575179747160391,
"loss": 0.9834,
"step": 892
},
{
"epoch": 2.2838874680306906,
"grad_norm": 0.3410033765731837,
"learning_rate": 0.00010561648052279792,
"loss": 0.9893,
"step": 893
},
{
"epoch": 2.286445012787724,
"grad_norm": 0.48497621648344247,
"learning_rate": 0.00010548108186169619,
"loss": 1.0097,
"step": 894
},
{
"epoch": 2.289002557544757,
"grad_norm": 0.4811056602507645,
"learning_rate": 0.00010534560192019784,
"loss": 0.9987,
"step": 895
},
{
"epoch": 2.29156010230179,
"grad_norm": 0.5430558900686754,
"learning_rate": 0.00010521004113046126,
"loss": 0.9863,
"step": 896
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.5520225619306299,
"learning_rate": 0.00010507439992490274,
"loss": 0.9854,
"step": 897
},
{
"epoch": 2.296675191815857,
"grad_norm": 0.5368891057768155,
"learning_rate": 0.00010493867873619509,
"loss": 0.962,
"step": 898
},
{
"epoch": 2.29923273657289,
"grad_norm": 0.45785580350946786,
"learning_rate": 0.00010480287799726624,
"loss": 0.9951,
"step": 899
},
{
"epoch": 2.3017902813299234,
"grad_norm": 0.3134044741551554,
"learning_rate": 0.00010466699814129784,
"loss": 0.9808,
"step": 900
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.3718160522616458,
"learning_rate": 0.00010453103960172399,
"loss": 0.9722,
"step": 901
},
{
"epoch": 2.3069053708439897,
"grad_norm": 0.42777708592376057,
"learning_rate": 0.0001043950028122297,
"loss": 0.9778,
"step": 902
},
{
"epoch": 2.309462915601023,
"grad_norm": 0.5114598924445181,
"learning_rate": 0.00010425888820674964,
"loss": 0.9999,
"step": 903
},
{
"epoch": 2.312020460358056,
"grad_norm": 0.42665599355653705,
"learning_rate": 0.00010412269621946664,
"loss": 0.9277,
"step": 904
},
{
"epoch": 2.3145780051150897,
"grad_norm": 0.32425667546420855,
"learning_rate": 0.0001039864272848104,
"loss": 0.9623,
"step": 905
},
{
"epoch": 2.317135549872123,
"grad_norm": 0.278767997134977,
"learning_rate": 0.00010385008183745614,
"loss": 0.9709,
"step": 906
},
{
"epoch": 2.319693094629156,
"grad_norm": 0.2973268406415685,
"learning_rate": 0.00010371366031232298,
"loss": 0.9752,
"step": 907
},
{
"epoch": 2.3222506393861893,
"grad_norm": 0.32805655210523665,
"learning_rate": 0.00010357716314457286,
"loss": 1.0151,
"step": 908
},
{
"epoch": 2.3248081841432224,
"grad_norm": 0.3136457006720511,
"learning_rate": 0.00010344059076960893,
"loss": 0.9525,
"step": 909
},
{
"epoch": 2.3273657289002556,
"grad_norm": 0.36706796314794027,
"learning_rate": 0.00010330394362307426,
"loss": 1.0263,
"step": 910
},
{
"epoch": 2.329923273657289,
"grad_norm": 0.3628334304816528,
"learning_rate": 0.00010316722214085048,
"loss": 1.0032,
"step": 911
},
{
"epoch": 2.3324808184143224,
"grad_norm": 0.4614008122870428,
"learning_rate": 0.00010303042675905623,
"loss": 0.9655,
"step": 912
},
{
"epoch": 2.3350383631713556,
"grad_norm": 0.5091780040539386,
"learning_rate": 0.00010289355791404597,
"loss": 0.9963,
"step": 913
},
{
"epoch": 2.337595907928389,
"grad_norm": 0.4886959522852251,
"learning_rate": 0.00010275661604240844,
"loss": 0.9959,
"step": 914
},
{
"epoch": 2.340153452685422,
"grad_norm": 0.3477812096500851,
"learning_rate": 0.00010261960158096538,
"loss": 0.9923,
"step": 915
},
{
"epoch": 2.342710997442455,
"grad_norm": 0.3003617995320152,
"learning_rate": 0.00010248251496677002,
"loss": 1.0133,
"step": 916
},
{
"epoch": 2.3452685421994883,
"grad_norm": 0.3907656568645366,
"learning_rate": 0.00010234535663710578,
"loss": 0.9559,
"step": 917
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.44450800877616453,
"learning_rate": 0.00010220812702948483,
"loss": 0.9839,
"step": 918
},
{
"epoch": 2.350383631713555,
"grad_norm": 0.41444476133681435,
"learning_rate": 0.00010207082658164668,
"loss": 0.9695,
"step": 919
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.3486015741078046,
"learning_rate": 0.00010193345573155686,
"loss": 0.9699,
"step": 920
},
{
"epoch": 2.3554987212276215,
"grad_norm": 0.305313779906682,
"learning_rate": 0.00010179601491740546,
"loss": 0.9737,
"step": 921
},
{
"epoch": 2.3580562659846547,
"grad_norm": 0.3210944860271877,
"learning_rate": 0.00010165850457760569,
"loss": 0.9734,
"step": 922
},
{
"epoch": 2.360613810741688,
"grad_norm": 0.33354001864174027,
"learning_rate": 0.00010152092515079263,
"loss": 0.9758,
"step": 923
},
{
"epoch": 2.363171355498721,
"grad_norm": 0.3630435985390137,
"learning_rate": 0.00010138327707582161,
"loss": 0.9843,
"step": 924
},
{
"epoch": 2.3657289002557547,
"grad_norm": 0.3068154551503405,
"learning_rate": 0.00010124556079176705,
"loss": 0.9718,
"step": 925
},
{
"epoch": 2.368286445012788,
"grad_norm": 0.3145375023118287,
"learning_rate": 0.0001011077767379209,
"loss": 0.9485,
"step": 926
},
{
"epoch": 2.370843989769821,
"grad_norm": 0.4562062846091247,
"learning_rate": 0.00010096992535379125,
"loss": 1.0041,
"step": 927
},
{
"epoch": 2.373401534526854,
"grad_norm": 0.4613854636034836,
"learning_rate": 0.00010083200707910109,
"loss": 1.0095,
"step": 928
},
{
"epoch": 2.3759590792838874,
"grad_norm": 0.5020460478647006,
"learning_rate": 0.00010069402235378657,
"loss": 0.9793,
"step": 929
},
{
"epoch": 2.3785166240409206,
"grad_norm": 0.47032502181209285,
"learning_rate": 0.000100555971617996,
"loss": 1.003,
"step": 930
},
{
"epoch": 2.381074168797954,
"grad_norm": 0.37153265133623853,
"learning_rate": 0.00010041785531208813,
"loss": 0.9707,
"step": 931
},
{
"epoch": 2.3836317135549874,
"grad_norm": 0.2954908430723523,
"learning_rate": 0.00010027967387663098,
"loss": 0.9943,
"step": 932
},
{
"epoch": 2.3861892583120206,
"grad_norm": 0.2860326087524264,
"learning_rate": 0.00010014142775240018,
"loss": 0.978,
"step": 933
},
{
"epoch": 2.3887468030690537,
"grad_norm": 0.36670864980970264,
"learning_rate": 0.00010000311738037786,
"loss": 0.9654,
"step": 934
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.39639852002586273,
"learning_rate": 9.986474320175097e-05,
"loss": 0.964,
"step": 935
},
{
"epoch": 2.39386189258312,
"grad_norm": 0.3585981520256939,
"learning_rate": 9.972630565791003e-05,
"loss": 0.9825,
"step": 936
},
{
"epoch": 2.3964194373401533,
"grad_norm": 0.3189834091257556,
"learning_rate": 9.958780519044772e-05,
"loss": 0.9851,
"step": 937
},
{
"epoch": 2.398976982097187,
"grad_norm": 0.3049358905004256,
"learning_rate": 9.944924224115737e-05,
"loss": 0.9939,
"step": 938
},
{
"epoch": 2.40153452685422,
"grad_norm": 0.2622458924767327,
"learning_rate": 9.931061725203167e-05,
"loss": 0.9781,
"step": 939
},
{
"epoch": 2.4040920716112533,
"grad_norm": 0.2924257759631161,
"learning_rate": 9.917193066526122e-05,
"loss": 0.9868,
"step": 940
},
{
"epoch": 2.4066496163682864,
"grad_norm": 0.3604978006726876,
"learning_rate": 9.903318292323301e-05,
"loss": 0.9754,
"step": 941
},
{
"epoch": 2.4092071611253196,
"grad_norm": 0.29745498369836404,
"learning_rate": 9.889437446852923e-05,
"loss": 0.9859,
"step": 942
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.37371862497237623,
"learning_rate": 9.875550574392565e-05,
"loss": 0.9896,
"step": 943
},
{
"epoch": 2.414322250639386,
"grad_norm": 0.38638295584959187,
"learning_rate": 9.86165771923903e-05,
"loss": 0.9881,
"step": 944
},
{
"epoch": 2.4168797953964196,
"grad_norm": 0.4041126989806797,
"learning_rate": 9.84775892570821e-05,
"loss": 0.9428,
"step": 945
},
{
"epoch": 2.419437340153453,
"grad_norm": 0.395096912214402,
"learning_rate": 9.833854238134931e-05,
"loss": 0.9622,
"step": 946
},
{
"epoch": 2.421994884910486,
"grad_norm": 0.3464290247147215,
"learning_rate": 9.819943700872828e-05,
"loss": 1.0125,
"step": 947
},
{
"epoch": 2.424552429667519,
"grad_norm": 0.28843985739584715,
"learning_rate": 9.806027358294195e-05,
"loss": 0.9712,
"step": 948
},
{
"epoch": 2.4271099744245523,
"grad_norm": 0.38051542261971155,
"learning_rate": 9.792105254789834e-05,
"loss": 0.9851,
"step": 949
},
{
"epoch": 2.4296675191815855,
"grad_norm": 0.4466310758086544,
"learning_rate": 9.778177434768935e-05,
"loss": 0.9683,
"step": 950
},
{
"epoch": 2.4322250639386187,
"grad_norm": 0.4692147641165216,
"learning_rate": 9.764243942658919e-05,
"loss": 0.9841,
"step": 951
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.35373867138680226,
"learning_rate": 9.750304822905297e-05,
"loss": 0.9492,
"step": 952
},
{
"epoch": 2.4373401534526855,
"grad_norm": 0.28385300113252654,
"learning_rate": 9.736360119971537e-05,
"loss": 0.9996,
"step": 953
},
{
"epoch": 2.4398976982097187,
"grad_norm": 0.2937003946020655,
"learning_rate": 9.722409878338908e-05,
"loss": 1.0015,
"step": 954
},
{
"epoch": 2.442455242966752,
"grad_norm": 0.3969860787197417,
"learning_rate": 9.708454142506354e-05,
"loss": 0.9774,
"step": 955
},
{
"epoch": 2.445012787723785,
"grad_norm": 0.5498839614052679,
"learning_rate": 9.694492956990345e-05,
"loss": 0.9847,
"step": 956
},
{
"epoch": 2.4475703324808182,
"grad_norm": 0.5513989094448135,
"learning_rate": 9.680526366324726e-05,
"loss": 0.9565,
"step": 957
},
{
"epoch": 2.4501278772378514,
"grad_norm": 0.506905247181652,
"learning_rate": 9.666554415060596e-05,
"loss": 0.9517,
"step": 958
},
{
"epoch": 2.452685421994885,
"grad_norm": 0.44474310752723095,
"learning_rate": 9.652577147766142e-05,
"loss": 0.9743,
"step": 959
},
{
"epoch": 2.455242966751918,
"grad_norm": 0.37097475676427244,
"learning_rate": 9.638594609026515e-05,
"loss": 0.9506,
"step": 960
},
{
"epoch": 2.4578005115089514,
"grad_norm": 0.2734924283931777,
"learning_rate": 9.624606843443675e-05,
"loss": 1.0158,
"step": 961
},
{
"epoch": 2.4603580562659846,
"grad_norm": 0.31804819233085263,
"learning_rate": 9.610613895636263e-05,
"loss": 0.992,
"step": 962
},
{
"epoch": 2.4629156010230178,
"grad_norm": 0.41664714320663915,
"learning_rate": 9.596615810239445e-05,
"loss": 0.999,
"step": 963
},
{
"epoch": 2.4654731457800514,
"grad_norm": 0.5523065515247985,
"learning_rate": 9.582612631904779e-05,
"loss": 1.0055,
"step": 964
},
{
"epoch": 2.4680306905370846,
"grad_norm": 0.4671305490762141,
"learning_rate": 9.568604405300062e-05,
"loss": 0.9579,
"step": 965
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.3279722497396409,
"learning_rate": 9.554591175109194e-05,
"loss": 0.9731,
"step": 966
},
{
"epoch": 2.473145780051151,
"grad_norm": 0.25846610901040445,
"learning_rate": 9.54057298603205e-05,
"loss": 0.9817,
"step": 967
},
{
"epoch": 2.475703324808184,
"grad_norm": 0.3730225408971352,
"learning_rate": 9.526549882784305e-05,
"loss": 0.9874,
"step": 968
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.7271461728885226,
"learning_rate": 9.512521910097316e-05,
"loss": 1.0348,
"step": 969
},
{
"epoch": 2.4808184143222505,
"grad_norm": 0.32875046425746846,
"learning_rate": 9.49848911271798e-05,
"loss": 0.9565,
"step": 970
},
{
"epoch": 2.483375959079284,
"grad_norm": 0.3205410594330121,
"learning_rate": 9.484451535408572e-05,
"loss": 0.9784,
"step": 971
},
{
"epoch": 2.4859335038363173,
"grad_norm": 0.26205949445440796,
"learning_rate": 9.470409222946623e-05,
"loss": 0.9983,
"step": 972
},
{
"epoch": 2.4884910485933505,
"grad_norm": 0.3237027571460551,
"learning_rate": 9.456362220124766e-05,
"loss": 0.98,
"step": 973
},
{
"epoch": 2.4910485933503836,
"grad_norm": 0.35272232039199597,
"learning_rate": 9.442310571750588e-05,
"loss": 0.9779,
"step": 974
},
{
"epoch": 2.493606138107417,
"grad_norm": 0.305939353717968,
"learning_rate": 9.42825432264651e-05,
"loss": 0.9581,
"step": 975
},
{
"epoch": 2.49616368286445,
"grad_norm": 0.2932577303248136,
"learning_rate": 9.414193517649614e-05,
"loss": 0.9855,
"step": 976
},
{
"epoch": 2.498721227621483,
"grad_norm": 0.30059710492898495,
"learning_rate": 9.400128201611521e-05,
"loss": 0.9754,
"step": 977
},
{
"epoch": 2.501278772378517,
"grad_norm": 0.2973031341519278,
"learning_rate": 9.386058419398243e-05,
"loss": 0.9909,
"step": 978
},
{
"epoch": 2.50383631713555,
"grad_norm": 0.3722883437832787,
"learning_rate": 9.371984215890032e-05,
"loss": 0.9946,
"step": 979
},
{
"epoch": 2.506393861892583,
"grad_norm": 0.3473263838445932,
"learning_rate": 9.357905635981251e-05,
"loss": 0.9543,
"step": 980
},
{
"epoch": 2.5089514066496164,
"grad_norm": 0.2867570028047222,
"learning_rate": 9.34382272458022e-05,
"loss": 0.9638,
"step": 981
},
{
"epoch": 2.5115089514066495,
"grad_norm": 0.30564756429493334,
"learning_rate": 9.329735526609071e-05,
"loss": 0.9464,
"step": 982
},
{
"epoch": 2.5140664961636827,
"grad_norm": 0.277493802953859,
"learning_rate": 9.315644087003614e-05,
"loss": 0.9565,
"step": 983
},
{
"epoch": 2.516624040920716,
"grad_norm": 0.32107200459340096,
"learning_rate": 9.301548450713193e-05,
"loss": 0.987,
"step": 984
},
{
"epoch": 2.5191815856777495,
"grad_norm": 0.34282165398687586,
"learning_rate": 9.28744866270053e-05,
"loss": 0.985,
"step": 985
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.32220988156237623,
"learning_rate": 9.273344767941595e-05,
"loss": 0.958,
"step": 986
},
{
"epoch": 2.524296675191816,
"grad_norm": 0.2659763342921004,
"learning_rate": 9.259236811425458e-05,
"loss": 0.9693,
"step": 987
},
{
"epoch": 2.526854219948849,
"grad_norm": 0.31738841820079255,
"learning_rate": 9.245124838154145e-05,
"loss": 0.9938,
"step": 988
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.32830918791297703,
"learning_rate": 9.231008893142496e-05,
"loss": 0.9934,
"step": 989
},
{
"epoch": 2.531969309462916,
"grad_norm": 0.3402708856013208,
"learning_rate": 9.216889021418015e-05,
"loss": 1.0013,
"step": 990
},
{
"epoch": 2.5345268542199486,
"grad_norm": 0.4044102426145664,
"learning_rate": 9.202765268020734e-05,
"loss": 0.9831,
"step": 991
},
{
"epoch": 2.5370843989769822,
"grad_norm": 0.42862262278596586,
"learning_rate": 9.188637678003078e-05,
"loss": 0.9997,
"step": 992
},
{
"epoch": 2.5396419437340154,
"grad_norm": 0.4484266743548927,
"learning_rate": 9.17450629642969e-05,
"loss": 0.9828,
"step": 993
},
{
"epoch": 2.5421994884910486,
"grad_norm": 0.3265912580211292,
"learning_rate": 9.160371168377322e-05,
"loss": 0.9643,
"step": 994
},
{
"epoch": 2.544757033248082,
"grad_norm": 0.32534751123207517,
"learning_rate": 9.146232338934671e-05,
"loss": 0.9582,
"step": 995
},
{
"epoch": 2.547314578005115,
"grad_norm": 0.38239024918470127,
"learning_rate": 9.132089853202243e-05,
"loss": 0.9744,
"step": 996
},
{
"epoch": 2.5498721227621486,
"grad_norm": 0.46563347602108834,
"learning_rate": 9.117943756292208e-05,
"loss": 0.9792,
"step": 997
},
{
"epoch": 2.5524296675191813,
"grad_norm": 0.39461054417861174,
"learning_rate": 9.103794093328248e-05,
"loss": 0.9755,
"step": 998
},
{
"epoch": 2.554987212276215,
"grad_norm": 0.3125908044097884,
"learning_rate": 9.089640909445431e-05,
"loss": 0.9716,
"step": 999
},
{
"epoch": 2.557544757033248,
"grad_norm": 0.2684368877044592,
"learning_rate": 9.075484249790048e-05,
"loss": 0.9747,
"step": 1000
},
{
"epoch": 2.5601023017902813,
"grad_norm": 0.28891578856074146,
"learning_rate": 9.061324159519476e-05,
"loss": 0.9762,
"step": 1001
},
{
"epoch": 2.5626598465473145,
"grad_norm": 0.3034677475712927,
"learning_rate": 9.047160683802046e-05,
"loss": 0.9674,
"step": 1002
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.31908253316340884,
"learning_rate": 9.032993867816876e-05,
"loss": 0.9942,
"step": 1003
},
{
"epoch": 2.5677749360613813,
"grad_norm": 0.2544491678916064,
"learning_rate": 9.018823756753746e-05,
"loss": 1.0001,
"step": 1004
},
{
"epoch": 2.5703324808184145,
"grad_norm": 0.2995352776229395,
"learning_rate": 9.00465039581294e-05,
"loss": 0.9929,
"step": 1005
},
{
"epoch": 2.5728900255754477,
"grad_norm": 0.35913882534331126,
"learning_rate": 8.990473830205118e-05,
"loss": 0.9318,
"step": 1006
},
{
"epoch": 2.575447570332481,
"grad_norm": 0.37010668314829087,
"learning_rate": 8.976294105151154e-05,
"loss": 1.0079,
"step": 1007
},
{
"epoch": 2.578005115089514,
"grad_norm": 0.2570784147501355,
"learning_rate": 8.962111265882006e-05,
"loss": 0.9952,
"step": 1008
},
{
"epoch": 2.580562659846547,
"grad_norm": 0.3149539278736431,
"learning_rate": 8.947925357638561e-05,
"loss": 0.9941,
"step": 1009
},
{
"epoch": 2.5831202046035804,
"grad_norm": 0.2855340149405739,
"learning_rate": 8.933736425671495e-05,
"loss": 0.9816,
"step": 1010
},
{
"epoch": 2.585677749360614,
"grad_norm": 0.25345884892793763,
"learning_rate": 8.91954451524114e-05,
"loss": 0.9818,
"step": 1011
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.29694516426804485,
"learning_rate": 8.905349671617313e-05,
"loss": 0.9876,
"step": 1012
},
{
"epoch": 2.5907928388746804,
"grad_norm": 0.3052840810260173,
"learning_rate": 8.891151940079198e-05,
"loss": 0.9702,
"step": 1013
},
{
"epoch": 2.5933503836317136,
"grad_norm": 0.2661838830871243,
"learning_rate": 8.87695136591519e-05,
"loss": 0.9877,
"step": 1014
},
{
"epoch": 2.5959079283887467,
"grad_norm": 0.2986390559549456,
"learning_rate": 8.862747994422744e-05,
"loss": 0.9707,
"step": 1015
},
{
"epoch": 2.59846547314578,
"grad_norm": 0.3613476612681819,
"learning_rate": 8.848541870908248e-05,
"loss": 0.9703,
"step": 1016
},
{
"epoch": 2.601023017902813,
"grad_norm": 0.33024018130732985,
"learning_rate": 8.834333040686867e-05,
"loss": 0.979,
"step": 1017
},
{
"epoch": 2.6035805626598467,
"grad_norm": 0.31187166502347763,
"learning_rate": 8.820121549082389e-05,
"loss": 0.9829,
"step": 1018
},
{
"epoch": 2.60613810741688,
"grad_norm": 0.3469288630004611,
"learning_rate": 8.805907441427107e-05,
"loss": 0.9558,
"step": 1019
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.3134454892157028,
"learning_rate": 8.791690763061646e-05,
"loss": 0.9644,
"step": 1020
},
{
"epoch": 2.6112531969309463,
"grad_norm": 0.30922058220600745,
"learning_rate": 8.777471559334835e-05,
"loss": 0.9769,
"step": 1021
},
{
"epoch": 2.6138107416879794,
"grad_norm": 0.3164613704707754,
"learning_rate": 8.763249875603568e-05,
"loss": 0.9699,
"step": 1022
},
{
"epoch": 2.6163682864450126,
"grad_norm": 0.3937696035168064,
"learning_rate": 8.74902575723263e-05,
"loss": 0.9913,
"step": 1023
},
{
"epoch": 2.618925831202046,
"grad_norm": 0.3269757525342128,
"learning_rate": 8.734799249594593e-05,
"loss": 0.9714,
"step": 1024
},
{
"epoch": 2.6214833759590794,
"grad_norm": 0.3137372841061025,
"learning_rate": 8.720570398069639e-05,
"loss": 0.9667,
"step": 1025
},
{
"epoch": 2.6240409207161126,
"grad_norm": 0.296905098424126,
"learning_rate": 8.706339248045425e-05,
"loss": 0.9748,
"step": 1026
},
{
"epoch": 2.626598465473146,
"grad_norm": 0.3341447796223413,
"learning_rate": 8.692105844916946e-05,
"loss": 0.9813,
"step": 1027
},
{
"epoch": 2.629156010230179,
"grad_norm": 0.3756191138022281,
"learning_rate": 8.677870234086383e-05,
"loss": 0.9908,
"step": 1028
},
{
"epoch": 2.631713554987212,
"grad_norm": 0.3559465468948902,
"learning_rate": 8.663632460962956e-05,
"loss": 0.9936,
"step": 1029
},
{
"epoch": 2.634271099744246,
"grad_norm": 0.300711572823478,
"learning_rate": 8.649392570962781e-05,
"loss": 0.9795,
"step": 1030
},
{
"epoch": 2.6368286445012785,
"grad_norm": 0.3320572865051935,
"learning_rate": 8.635150609508733e-05,
"loss": 0.984,
"step": 1031
},
{
"epoch": 2.639386189258312,
"grad_norm": 0.3635828441982571,
"learning_rate": 8.620906622030292e-05,
"loss": 0.9536,
"step": 1032
},
{
"epoch": 2.6419437340153453,
"grad_norm": 0.3278411915419061,
"learning_rate": 8.6066606539634e-05,
"loss": 1.0088,
"step": 1033
},
{
"epoch": 2.6445012787723785,
"grad_norm": 0.32767767702958833,
"learning_rate": 8.592412750750312e-05,
"loss": 0.9876,
"step": 1034
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.35097964529502185,
"learning_rate": 8.578162957839462e-05,
"loss": 0.9915,
"step": 1035
},
{
"epoch": 2.649616368286445,
"grad_norm": 0.31991735732581283,
"learning_rate": 8.563911320685312e-05,
"loss": 0.9638,
"step": 1036
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.23787926653601094,
"learning_rate": 8.549657884748205e-05,
"loss": 0.9713,
"step": 1037
},
{
"epoch": 2.6547314578005117,
"grad_norm": 0.32244485030641373,
"learning_rate": 8.535402695494221e-05,
"loss": 0.9772,
"step": 1038
},
{
"epoch": 2.657289002557545,
"grad_norm": 0.312950136510117,
"learning_rate": 8.521145798395035e-05,
"loss": 0.9841,
"step": 1039
},
{
"epoch": 2.659846547314578,
"grad_norm": 0.26212781885375047,
"learning_rate": 8.506887238927764e-05,
"loss": 0.9955,
"step": 1040
},
{
"epoch": 2.662404092071611,
"grad_norm": 0.34105099182259796,
"learning_rate": 8.492627062574837e-05,
"loss": 0.9729,
"step": 1041
},
{
"epoch": 2.6649616368286444,
"grad_norm": 0.297943326170416,
"learning_rate": 8.478365314823831e-05,
"loss": 1.0041,
"step": 1042
},
{
"epoch": 2.6675191815856776,
"grad_norm": 0.23653735859455993,
"learning_rate": 8.464102041167343e-05,
"loss": 0.9385,
"step": 1043
},
{
"epoch": 2.670076726342711,
"grad_norm": 0.24103662980964566,
"learning_rate": 8.449837287102837e-05,
"loss": 0.9798,
"step": 1044
},
{
"epoch": 2.6726342710997444,
"grad_norm": 0.3266522540557997,
"learning_rate": 8.43557109813249e-05,
"loss": 0.9664,
"step": 1045
},
{
"epoch": 2.6751918158567776,
"grad_norm": 0.34157505937073707,
"learning_rate": 8.421303519763067e-05,
"loss": 0.9512,
"step": 1046
},
{
"epoch": 2.6777493606138107,
"grad_norm": 0.32745487240393034,
"learning_rate": 8.407034597505762e-05,
"loss": 0.9847,
"step": 1047
},
{
"epoch": 2.680306905370844,
"grad_norm": 0.30390244215100753,
"learning_rate": 8.392764376876049e-05,
"loss": 0.9847,
"step": 1048
},
{
"epoch": 2.682864450127877,
"grad_norm": 0.28021611753279574,
"learning_rate": 8.378492903393555e-05,
"loss": 0.9592,
"step": 1049
},
{
"epoch": 2.6854219948849103,
"grad_norm": 0.3320556275827844,
"learning_rate": 8.364220222581896e-05,
"loss": 0.9846,
"step": 1050
},
{
"epoch": 2.687979539641944,
"grad_norm": 0.3136101711766941,
"learning_rate": 8.34994637996854e-05,
"loss": 0.9811,
"step": 1051
},
{
"epoch": 2.690537084398977,
"grad_norm": 0.2618192450012102,
"learning_rate": 8.335671421084661e-05,
"loss": 0.9744,
"step": 1052
},
{
"epoch": 2.6930946291560103,
"grad_norm": 0.3220025314640929,
"learning_rate": 8.321395391464995e-05,
"loss": 0.9868,
"step": 1053
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.3598315892247714,
"learning_rate": 8.307118336647694e-05,
"loss": 0.951,
"step": 1054
},
{
"epoch": 2.6982097186700766,
"grad_norm": 0.4106007096012368,
"learning_rate": 8.292840302174178e-05,
"loss": 0.9643,
"step": 1055
},
{
"epoch": 2.70076726342711,
"grad_norm": 0.2548097195613678,
"learning_rate": 8.278561333588993e-05,
"loss": 0.9841,
"step": 1056
},
{
"epoch": 2.703324808184143,
"grad_norm": 0.3371557483370203,
"learning_rate": 8.264281476439662e-05,
"loss": 0.984,
"step": 1057
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.38976688577634183,
"learning_rate": 8.250000776276551e-05,
"loss": 0.9731,
"step": 1058
},
{
"epoch": 2.70843989769821,
"grad_norm": 0.2695308176694805,
"learning_rate": 8.235719278652704e-05,
"loss": 1.0008,
"step": 1059
},
{
"epoch": 2.710997442455243,
"grad_norm": 0.2799834287903197,
"learning_rate": 8.221437029123715e-05,
"loss": 0.96,
"step": 1060
},
{
"epoch": 2.713554987212276,
"grad_norm": 0.3887662531222578,
"learning_rate": 8.20715407324758e-05,
"loss": 1.0134,
"step": 1061
},
{
"epoch": 2.7161125319693094,
"grad_norm": 0.36475843384332224,
"learning_rate": 8.192870456584536e-05,
"loss": 0.9869,
"step": 1062
},
{
"epoch": 2.718670076726343,
"grad_norm": 0.3842950619442295,
"learning_rate": 8.178586224696938e-05,
"loss": 1.0191,
"step": 1063
},
{
"epoch": 2.7212276214833757,
"grad_norm": 0.29521526511075435,
"learning_rate": 8.164301423149104e-05,
"loss": 0.9847,
"step": 1064
},
{
"epoch": 2.7237851662404093,
"grad_norm": 0.2510688717518455,
"learning_rate": 8.150016097507161e-05,
"loss": 0.9537,
"step": 1065
},
{
"epoch": 2.7263427109974425,
"grad_norm": 0.31175386208986516,
"learning_rate": 8.135730293338918e-05,
"loss": 0.9715,
"step": 1066
},
{
"epoch": 2.7289002557544757,
"grad_norm": 0.2969969026627777,
"learning_rate": 8.121444056213698e-05,
"loss": 0.9778,
"step": 1067
},
{
"epoch": 2.731457800511509,
"grad_norm": 0.316196872282454,
"learning_rate": 8.107157431702219e-05,
"loss": 0.9979,
"step": 1068
},
{
"epoch": 2.734015345268542,
"grad_norm": 0.2677096371345643,
"learning_rate": 8.092870465376422e-05,
"loss": 0.972,
"step": 1069
},
{
"epoch": 2.7365728900255757,
"grad_norm": 0.25111395109245066,
"learning_rate": 8.078583202809347e-05,
"loss": 1.0173,
"step": 1070
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.23618007037740435,
"learning_rate": 8.064295689574979e-05,
"loss": 0.9681,
"step": 1071
},
{
"epoch": 2.741687979539642,
"grad_norm": 0.2462154966468633,
"learning_rate": 8.050007971248095e-05,
"loss": 0.9977,
"step": 1072
},
{
"epoch": 2.7442455242966752,
"grad_norm": 0.2396576027964869,
"learning_rate": 8.035720093404133e-05,
"loss": 0.9817,
"step": 1073
},
{
"epoch": 2.7468030690537084,
"grad_norm": 0.23288900252567163,
"learning_rate": 8.021432101619034e-05,
"loss": 0.9677,
"step": 1074
},
{
"epoch": 2.7493606138107416,
"grad_norm": 0.309943456329605,
"learning_rate": 8.007144041469111e-05,
"loss": 1.0198,
"step": 1075
},
{
"epoch": 2.7519181585677748,
"grad_norm": 0.2438257902275988,
"learning_rate": 7.992855958530893e-05,
"loss": 0.9774,
"step": 1076
},
{
"epoch": 2.7544757033248084,
"grad_norm": 0.24225939294568138,
"learning_rate": 7.978567898380968e-05,
"loss": 0.9975,
"step": 1077
},
{
"epoch": 2.7570332480818416,
"grad_norm": 0.2557453042666024,
"learning_rate": 7.96427990659587e-05,
"loss": 0.9601,
"step": 1078
},
{
"epoch": 2.7595907928388748,
"grad_norm": 0.25399744095479343,
"learning_rate": 7.949992028751908e-05,
"loss": 0.94,
"step": 1079
},
{
"epoch": 2.762148337595908,
"grad_norm": 0.25806395609838956,
"learning_rate": 7.935704310425022e-05,
"loss": 0.9856,
"step": 1080
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.2778516319437345,
"learning_rate": 7.921416797190653e-05,
"loss": 0.9485,
"step": 1081
},
{
"epoch": 2.7672634271099743,
"grad_norm": 0.2652382709743763,
"learning_rate": 7.90712953462358e-05,
"loss": 0.9852,
"step": 1082
},
{
"epoch": 2.7698209718670075,
"grad_norm": 0.3078124836381294,
"learning_rate": 7.892842568297784e-05,
"loss": 0.9843,
"step": 1083
},
{
"epoch": 2.772378516624041,
"grad_norm": 0.2630029283693419,
"learning_rate": 7.878555943786304e-05,
"loss": 0.9866,
"step": 1084
},
{
"epoch": 2.7749360613810743,
"grad_norm": 0.3230772942242779,
"learning_rate": 7.864269706661084e-05,
"loss": 0.9617,
"step": 1085
},
{
"epoch": 2.7774936061381075,
"grad_norm": 0.33688102829350425,
"learning_rate": 7.84998390249284e-05,
"loss": 1.0151,
"step": 1086
},
{
"epoch": 2.7800511508951407,
"grad_norm": 0.27010473360932136,
"learning_rate": 7.8356985768509e-05,
"loss": 0.9416,
"step": 1087
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.3216032949279463,
"learning_rate": 7.821413775303063e-05,
"loss": 0.9677,
"step": 1088
},
{
"epoch": 2.785166240409207,
"grad_norm": 0.3184797598775921,
"learning_rate": 7.807129543415467e-05,
"loss": 0.9878,
"step": 1089
},
{
"epoch": 2.78772378516624,
"grad_norm": 0.26980179286312655,
"learning_rate": 7.792845926752422e-05,
"loss": 0.9559,
"step": 1090
},
{
"epoch": 2.790281329923274,
"grad_norm": 0.2788560924053536,
"learning_rate": 7.778562970876285e-05,
"loss": 0.9315,
"step": 1091
},
{
"epoch": 2.792838874680307,
"grad_norm": 0.34225351537345716,
"learning_rate": 7.764280721347296e-05,
"loss": 0.9905,
"step": 1092
},
{
"epoch": 2.79539641943734,
"grad_norm": 0.3181751957801659,
"learning_rate": 7.749999223723451e-05,
"loss": 0.992,
"step": 1093
},
{
"epoch": 2.7979539641943734,
"grad_norm": 0.2617895154207013,
"learning_rate": 7.73571852356034e-05,
"loss": 0.976,
"step": 1094
},
{
"epoch": 2.8005115089514065,
"grad_norm": 0.26160435542511723,
"learning_rate": 7.72143866641101e-05,
"loss": 0.9717,
"step": 1095
},
{
"epoch": 2.80306905370844,
"grad_norm": 0.3005466825228635,
"learning_rate": 7.707159697825824e-05,
"loss": 1.019,
"step": 1096
},
{
"epoch": 2.805626598465473,
"grad_norm": 0.2737567544420114,
"learning_rate": 7.692881663352306e-05,
"loss": 0.9877,
"step": 1097
},
{
"epoch": 2.8081841432225065,
"grad_norm": 0.25383083364525466,
"learning_rate": 7.678604608535007e-05,
"loss": 1.0,
"step": 1098
},
{
"epoch": 2.8107416879795397,
"grad_norm": 0.24966621455789795,
"learning_rate": 7.664328578915341e-05,
"loss": 0.9913,
"step": 1099
},
{
"epoch": 2.813299232736573,
"grad_norm": 0.26731325577468995,
"learning_rate": 7.650053620031461e-05,
"loss": 0.9667,
"step": 1100
},
{
"epoch": 2.815856777493606,
"grad_norm": 0.24369512341274932,
"learning_rate": 7.635779777418105e-05,
"loss": 0.9941,
"step": 1101
},
{
"epoch": 2.8184143222506393,
"grad_norm": 0.22967457166848224,
"learning_rate": 7.621507096606445e-05,
"loss": 0.9755,
"step": 1102
},
{
"epoch": 2.820971867007673,
"grad_norm": 0.2571549233122558,
"learning_rate": 7.607235623123952e-05,
"loss": 0.9896,
"step": 1103
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.21308122874558627,
"learning_rate": 7.592965402494242e-05,
"loss": 0.9671,
"step": 1104
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.23965692093466115,
"learning_rate": 7.578696480236935e-05,
"loss": 0.9572,
"step": 1105
},
{
"epoch": 2.8286445012787724,
"grad_norm": 0.20206088609556147,
"learning_rate": 7.564428901867512e-05,
"loss": 0.9874,
"step": 1106
},
{
"epoch": 2.8312020460358056,
"grad_norm": 0.24456595967971878,
"learning_rate": 7.550162712897166e-05,
"loss": 0.9834,
"step": 1107
},
{
"epoch": 2.833759590792839,
"grad_norm": 0.2395628798306672,
"learning_rate": 7.535897958832657e-05,
"loss": 0.9932,
"step": 1108
},
{
"epoch": 2.836317135549872,
"grad_norm": 0.24488788117262922,
"learning_rate": 7.521634685176171e-05,
"loss": 0.9976,
"step": 1109
},
{
"epoch": 2.8388746803069056,
"grad_norm": 0.2475079536458042,
"learning_rate": 7.507372937425166e-05,
"loss": 0.979,
"step": 1110
},
{
"epoch": 2.8414322250639388,
"grad_norm": 0.25103418982918085,
"learning_rate": 7.493112761072238e-05,
"loss": 0.9784,
"step": 1111
},
{
"epoch": 2.843989769820972,
"grad_norm": 0.21080156526173952,
"learning_rate": 7.478854201604967e-05,
"loss": 0.9861,
"step": 1112
},
{
"epoch": 2.846547314578005,
"grad_norm": 0.2636072879534979,
"learning_rate": 7.464597304505779e-05,
"loss": 0.9767,
"step": 1113
},
{
"epoch": 2.8491048593350383,
"grad_norm": 0.3447559742850428,
"learning_rate": 7.450342115251793e-05,
"loss": 0.9763,
"step": 1114
},
{
"epoch": 2.8516624040920715,
"grad_norm": 0.3554201272513753,
"learning_rate": 7.436088679314689e-05,
"loss": 0.9814,
"step": 1115
},
{
"epoch": 2.8542199488491047,
"grad_norm": 0.2338897866384284,
"learning_rate": 7.42183704216054e-05,
"loss": 0.9737,
"step": 1116
},
{
"epoch": 2.8567774936061383,
"grad_norm": 0.3005337593534035,
"learning_rate": 7.407587249249691e-05,
"loss": 0.9593,
"step": 1117
},
{
"epoch": 2.8593350383631715,
"grad_norm": 0.28306065139483866,
"learning_rate": 7.393339346036604e-05,
"loss": 0.9912,
"step": 1118
},
{
"epoch": 2.8618925831202047,
"grad_norm": 0.32462258403513267,
"learning_rate": 7.379093377969708e-05,
"loss": 0.9636,
"step": 1119
},
{
"epoch": 2.864450127877238,
"grad_norm": 0.23458466619854929,
"learning_rate": 7.364849390491269e-05,
"loss": 1.0179,
"step": 1120
},
{
"epoch": 2.867007672634271,
"grad_norm": 0.26599173050846503,
"learning_rate": 7.350607429037222e-05,
"loss": 0.9865,
"step": 1121
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.28672176422376533,
"learning_rate": 7.336367539037047e-05,
"loss": 0.9697,
"step": 1122
},
{
"epoch": 2.8721227621483374,
"grad_norm": 0.38174167324236646,
"learning_rate": 7.32212976591362e-05,
"loss": 0.9394,
"step": 1123
},
{
"epoch": 2.874680306905371,
"grad_norm": 0.3008937451500426,
"learning_rate": 7.307894155083054e-05,
"loss": 1.0193,
"step": 1124
},
{
"epoch": 2.877237851662404,
"grad_norm": 0.2647744376072329,
"learning_rate": 7.293660751954576e-05,
"loss": 0.9959,
"step": 1125
},
{
"epoch": 2.8797953964194374,
"grad_norm": 0.3361184185105208,
"learning_rate": 7.279429601930365e-05,
"loss": 0.9886,
"step": 1126
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.28703805124273124,
"learning_rate": 7.265200750405408e-05,
"loss": 0.9552,
"step": 1127
},
{
"epoch": 2.8849104859335037,
"grad_norm": 0.2282314607084684,
"learning_rate": 7.250974242767372e-05,
"loss": 0.9613,
"step": 1128
},
{
"epoch": 2.887468030690537,
"grad_norm": 0.2492748754541012,
"learning_rate": 7.236750124396435e-05,
"loss": 0.9668,
"step": 1129
},
{
"epoch": 2.89002557544757,
"grad_norm": 0.25888788395575085,
"learning_rate": 7.222528440665167e-05,
"loss": 0.9925,
"step": 1130
},
{
"epoch": 2.8925831202046037,
"grad_norm": 0.24496080625420605,
"learning_rate": 7.20830923693836e-05,
"loss": 1.0041,
"step": 1131
},
{
"epoch": 2.895140664961637,
"grad_norm": 0.23733176427430222,
"learning_rate": 7.194092558572897e-05,
"loss": 0.9425,
"step": 1132
},
{
"epoch": 2.89769820971867,
"grad_norm": 0.27037826071655174,
"learning_rate": 7.179878450917613e-05,
"loss": 0.9618,
"step": 1133
},
{
"epoch": 2.9002557544757033,
"grad_norm": 0.2110486047552461,
"learning_rate": 7.165666959313135e-05,
"loss": 0.9625,
"step": 1134
},
{
"epoch": 2.9028132992327365,
"grad_norm": 0.2356138250996952,
"learning_rate": 7.151458129091752e-05,
"loss": 0.9868,
"step": 1135
},
{
"epoch": 2.90537084398977,
"grad_norm": 0.2507648626394698,
"learning_rate": 7.137252005577256e-05,
"loss": 0.9579,
"step": 1136
},
{
"epoch": 2.907928388746803,
"grad_norm": 0.21729817798268314,
"learning_rate": 7.123048634084815e-05,
"loss": 1.0193,
"step": 1137
},
{
"epoch": 2.9104859335038364,
"grad_norm": 0.25511738825377567,
"learning_rate": 7.108848059920805e-05,
"loss": 0.9594,
"step": 1138
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.25447395942517514,
"learning_rate": 7.09465032838269e-05,
"loss": 0.9746,
"step": 1139
},
{
"epoch": 2.915601023017903,
"grad_norm": 0.24784365067022293,
"learning_rate": 7.080455484758863e-05,
"loss": 0.9659,
"step": 1140
},
{
"epoch": 2.918158567774936,
"grad_norm": 0.2730224277035152,
"learning_rate": 7.066263574328505e-05,
"loss": 0.9818,
"step": 1141
},
{
"epoch": 2.920716112531969,
"grad_norm": 0.30594100479026,
"learning_rate": 7.052074642361444e-05,
"loss": 0.9915,
"step": 1142
},
{
"epoch": 2.923273657289003,
"grad_norm": 0.32054932862442914,
"learning_rate": 7.037888734117998e-05,
"loss": 0.9882,
"step": 1143
},
{
"epoch": 2.9258312020460355,
"grad_norm": 0.23958919561701653,
"learning_rate": 7.023705894848848e-05,
"loss": 0.9666,
"step": 1144
},
{
"epoch": 2.928388746803069,
"grad_norm": 0.27076318118261017,
"learning_rate": 7.009526169794885e-05,
"loss": 0.9746,
"step": 1145
},
{
"epoch": 2.9309462915601023,
"grad_norm": 0.2729574133461879,
"learning_rate": 6.995349604187061e-05,
"loss": 0.9624,
"step": 1146
},
{
"epoch": 2.9335038363171355,
"grad_norm": 0.3259725455577868,
"learning_rate": 6.981176243246257e-05,
"loss": 0.9795,
"step": 1147
},
{
"epoch": 2.9360613810741687,
"grad_norm": 0.34256481150449963,
"learning_rate": 6.967006132183127e-05,
"loss": 0.977,
"step": 1148
},
{
"epoch": 2.938618925831202,
"grad_norm": 0.2828018012599345,
"learning_rate": 6.952839316197956e-05,
"loss": 0.9928,
"step": 1149
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.2397889702793678,
"learning_rate": 6.938675840480525e-05,
"loss": 0.9822,
"step": 1150
},
{
"epoch": 2.9437340153452687,
"grad_norm": 0.331164422112377,
"learning_rate": 6.924515750209954e-05,
"loss": 0.9973,
"step": 1151
},
{
"epoch": 2.946291560102302,
"grad_norm": 0.2704740780802998,
"learning_rate": 6.910359090554572e-05,
"loss": 0.9685,
"step": 1152
},
{
"epoch": 2.948849104859335,
"grad_norm": 0.2437699512495755,
"learning_rate": 6.896205906671755e-05,
"loss": 0.9896,
"step": 1153
},
{
"epoch": 2.9514066496163682,
"grad_norm": 0.24008371878492457,
"learning_rate": 6.882056243707796e-05,
"loss": 0.9948,
"step": 1154
},
{
"epoch": 2.9539641943734014,
"grad_norm": 0.2714718735118312,
"learning_rate": 6.86791014679776e-05,
"loss": 1.0107,
"step": 1155
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.2689100345729253,
"learning_rate": 6.85376766106533e-05,
"loss": 0.9844,
"step": 1156
},
{
"epoch": 2.959079283887468,
"grad_norm": 0.217002318039709,
"learning_rate": 6.839628831622681e-05,
"loss": 0.9748,
"step": 1157
},
{
"epoch": 2.9616368286445014,
"grad_norm": 0.2919920400101465,
"learning_rate": 6.825493703570311e-05,
"loss": 0.9699,
"step": 1158
},
{
"epoch": 2.9641943734015346,
"grad_norm": 0.3490734108048557,
"learning_rate": 6.811362321996926e-05,
"loss": 0.9694,
"step": 1159
},
{
"epoch": 2.9667519181585678,
"grad_norm": 0.3103643754348234,
"learning_rate": 6.797234731979267e-05,
"loss": 0.991,
"step": 1160
},
{
"epoch": 2.969309462915601,
"grad_norm": 0.1939069857875497,
"learning_rate": 6.783110978581989e-05,
"loss": 0.9614,
"step": 1161
},
{
"epoch": 2.971867007672634,
"grad_norm": 0.2495187824732926,
"learning_rate": 6.768991106857508e-05,
"loss": 0.9656,
"step": 1162
},
{
"epoch": 2.9744245524296673,
"grad_norm": 0.3034345894428266,
"learning_rate": 6.754875161845855e-05,
"loss": 1.0069,
"step": 1163
},
{
"epoch": 2.976982097186701,
"grad_norm": 0.3567922857742952,
"learning_rate": 6.740763188574546e-05,
"loss": 0.9612,
"step": 1164
},
{
"epoch": 2.979539641943734,
"grad_norm": 0.25891106467169334,
"learning_rate": 6.726655232058409e-05,
"loss": 0.9696,
"step": 1165
},
{
"epoch": 2.9820971867007673,
"grad_norm": 0.25153156564503487,
"learning_rate": 6.712551337299473e-05,
"loss": 1.0014,
"step": 1166
},
{
"epoch": 2.9846547314578005,
"grad_norm": 0.32964252932862226,
"learning_rate": 6.69845154928681e-05,
"loss": 0.9773,
"step": 1167
},
{
"epoch": 2.9872122762148337,
"grad_norm": 0.2917177962042733,
"learning_rate": 6.684355912996386e-05,
"loss": 0.9911,
"step": 1168
},
{
"epoch": 2.9897698209718673,
"grad_norm": 0.2002913243087303,
"learning_rate": 6.670264473390931e-05,
"loss": 0.9683,
"step": 1169
},
{
"epoch": 2.9923273657289,
"grad_norm": 0.26813771266232983,
"learning_rate": 6.656177275419785e-05,
"loss": 0.967,
"step": 1170
},
{
"epoch": 2.9948849104859336,
"grad_norm": 0.2590485360645914,
"learning_rate": 6.64209436401875e-05,
"loss": 0.9638,
"step": 1171
},
{
"epoch": 2.997442455242967,
"grad_norm": 0.26357426110685056,
"learning_rate": 6.62801578410997e-05,
"loss": 1.0056,
"step": 1172
},
{
"epoch": 3.0,
"grad_norm": 0.22456837673610008,
"learning_rate": 6.61394158060176e-05,
"loss": 0.9933,
"step": 1173
},
{
"epoch": 3.002557544757033,
"grad_norm": 0.22123515970304183,
"learning_rate": 6.59987179838848e-05,
"loss": 0.9712,
"step": 1174
},
{
"epoch": 3.0051150895140664,
"grad_norm": 0.2497098271402969,
"learning_rate": 6.58580648235039e-05,
"loss": 0.9701,
"step": 1175
},
{
"epoch": 3.0076726342710995,
"grad_norm": 0.2264514281442564,
"learning_rate": 6.571745677353492e-05,
"loss": 0.9498,
"step": 1176
},
{
"epoch": 3.010230179028133,
"grad_norm": 0.24110920081950274,
"learning_rate": 6.557689428249414e-05,
"loss": 0.9841,
"step": 1177
},
{
"epoch": 3.0127877237851663,
"grad_norm": 0.28882150068726187,
"learning_rate": 6.543637779875237e-05,
"loss": 0.9728,
"step": 1178
},
{
"epoch": 3.0153452685421995,
"grad_norm": 0.22165888817736834,
"learning_rate": 6.529590777053378e-05,
"loss": 0.9263,
"step": 1179
},
{
"epoch": 3.0179028132992327,
"grad_norm": 0.2715939791147568,
"learning_rate": 6.515548464591428e-05,
"loss": 0.9353,
"step": 1180
},
{
"epoch": 3.020460358056266,
"grad_norm": 0.3321798212445876,
"learning_rate": 6.501510887282024e-05,
"loss": 0.948,
"step": 1181
},
{
"epoch": 3.023017902813299,
"grad_norm": 0.2852631687681614,
"learning_rate": 6.487478089902685e-05,
"loss": 0.9406,
"step": 1182
},
{
"epoch": 3.0255754475703327,
"grad_norm": 0.23938138232215803,
"learning_rate": 6.473450117215699e-05,
"loss": 0.9612,
"step": 1183
},
{
"epoch": 3.028132992327366,
"grad_norm": 0.2897634546793638,
"learning_rate": 6.459427013967953e-05,
"loss": 0.93,
"step": 1184
},
{
"epoch": 3.030690537084399,
"grad_norm": 0.28668995967161215,
"learning_rate": 6.445408824890805e-05,
"loss": 0.943,
"step": 1185
},
{
"epoch": 3.0332480818414322,
"grad_norm": 0.23250708905243717,
"learning_rate": 6.431395594699943e-05,
"loss": 0.9264,
"step": 1186
},
{
"epoch": 3.0358056265984654,
"grad_norm": 0.3127461016723165,
"learning_rate": 6.417387368095225e-05,
"loss": 0.9492,
"step": 1187
},
{
"epoch": 3.0383631713554986,
"grad_norm": 0.26702473205124055,
"learning_rate": 6.403384189760556e-05,
"loss": 0.9173,
"step": 1188
},
{
"epoch": 3.040920716112532,
"grad_norm": 0.2692197582092417,
"learning_rate": 6.389386104363738e-05,
"loss": 0.9483,
"step": 1189
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.29389458281034464,
"learning_rate": 6.375393156556325e-05,
"loss": 0.938,
"step": 1190
},
{
"epoch": 3.0460358056265986,
"grad_norm": 0.24003231343808254,
"learning_rate": 6.361405390973489e-05,
"loss": 0.9174,
"step": 1191
},
{
"epoch": 3.0485933503836318,
"grad_norm": 0.25208756985944336,
"learning_rate": 6.347422852233862e-05,
"loss": 0.9542,
"step": 1192
},
{
"epoch": 3.051150895140665,
"grad_norm": 0.24466794377181064,
"learning_rate": 6.333445584939407e-05,
"loss": 0.9617,
"step": 1193
},
{
"epoch": 3.053708439897698,
"grad_norm": 0.23317237737554486,
"learning_rate": 6.319473633675275e-05,
"loss": 0.9349,
"step": 1194
},
{
"epoch": 3.0562659846547313,
"grad_norm": 0.24590715837760968,
"learning_rate": 6.305507043009657e-05,
"loss": 0.9414,
"step": 1195
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.21035477411097228,
"learning_rate": 6.291545857493645e-05,
"loss": 0.9512,
"step": 1196
},
{
"epoch": 3.061381074168798,
"grad_norm": 0.2248505455887991,
"learning_rate": 6.277590121661098e-05,
"loss": 0.9522,
"step": 1197
},
{
"epoch": 3.0639386189258313,
"grad_norm": 0.2471462687532793,
"learning_rate": 6.263639880028468e-05,
"loss": 0.9493,
"step": 1198
},
{
"epoch": 3.0664961636828645,
"grad_norm": 0.22868376945738234,
"learning_rate": 6.249695177094707e-05,
"loss": 0.9668,
"step": 1199
},
{
"epoch": 3.0690537084398977,
"grad_norm": 0.23527194146680278,
"learning_rate": 6.235756057341084e-05,
"loss": 0.9279,
"step": 1200
},
{
"epoch": 3.071611253196931,
"grad_norm": 0.2513612868250463,
"learning_rate": 6.221822565231066e-05,
"loss": 0.9403,
"step": 1201
},
{
"epoch": 3.074168797953964,
"grad_norm": 0.22860913544864897,
"learning_rate": 6.207894745210168e-05,
"loss": 0.9616,
"step": 1202
},
{
"epoch": 3.0767263427109977,
"grad_norm": 0.24014291985565175,
"learning_rate": 6.193972641705809e-05,
"loss": 0.9664,
"step": 1203
},
{
"epoch": 3.079283887468031,
"grad_norm": 0.22572397342217615,
"learning_rate": 6.180056299127174e-05,
"loss": 0.9663,
"step": 1204
},
{
"epoch": 3.081841432225064,
"grad_norm": 0.25121933762619786,
"learning_rate": 6.16614576186507e-05,
"loss": 0.9676,
"step": 1205
},
{
"epoch": 3.084398976982097,
"grad_norm": 0.21264743561877053,
"learning_rate": 6.152241074291791e-05,
"loss": 0.9385,
"step": 1206
},
{
"epoch": 3.0869565217391304,
"grad_norm": 0.2110657205113156,
"learning_rate": 6.13834228076097e-05,
"loss": 0.9593,
"step": 1207
},
{
"epoch": 3.0895140664961636,
"grad_norm": 0.23064076505093895,
"learning_rate": 6.12444942560744e-05,
"loss": 0.9859,
"step": 1208
},
{
"epoch": 3.0920716112531967,
"grad_norm": 0.2327889001545048,
"learning_rate": 6.110562553147078e-05,
"loss": 0.9343,
"step": 1209
},
{
"epoch": 3.0946291560102304,
"grad_norm": 0.22081121627352496,
"learning_rate": 6.0966817076767e-05,
"loss": 0.9572,
"step": 1210
},
{
"epoch": 3.0971867007672635,
"grad_norm": 0.21410596357542921,
"learning_rate": 6.08280693347388e-05,
"loss": 0.9577,
"step": 1211
},
{
"epoch": 3.0997442455242967,
"grad_norm": 0.22670771449737367,
"learning_rate": 6.068938274796834e-05,
"loss": 0.9253,
"step": 1212
},
{
"epoch": 3.10230179028133,
"grad_norm": 0.205343189542066,
"learning_rate": 6.055075775884263e-05,
"loss": 0.9896,
"step": 1213
},
{
"epoch": 3.104859335038363,
"grad_norm": 0.22769741326879356,
"learning_rate": 6.0412194809552316e-05,
"loss": 0.9387,
"step": 1214
},
{
"epoch": 3.1074168797953963,
"grad_norm": 0.19822402152888394,
"learning_rate": 6.027369434208999e-05,
"loss": 0.9808,
"step": 1215
},
{
"epoch": 3.10997442455243,
"grad_norm": 0.23051970557462004,
"learning_rate": 6.0135256798249047e-05,
"loss": 0.933,
"step": 1216
},
{
"epoch": 3.112531969309463,
"grad_norm": 0.20329115598362008,
"learning_rate": 5.999688261962216e-05,
"loss": 0.9684,
"step": 1217
},
{
"epoch": 3.1150895140664963,
"grad_norm": 0.21036340816499827,
"learning_rate": 5.985857224759981e-05,
"loss": 0.944,
"step": 1218
},
{
"epoch": 3.1176470588235294,
"grad_norm": 0.20307590074585102,
"learning_rate": 5.972032612336906e-05,
"loss": 0.9598,
"step": 1219
},
{
"epoch": 3.1202046035805626,
"grad_norm": 0.2259792004822342,
"learning_rate": 5.958214468791189e-05,
"loss": 0.9483,
"step": 1220
},
{
"epoch": 3.122762148337596,
"grad_norm": 0.21243681629633632,
"learning_rate": 5.944402838200404e-05,
"loss": 0.9455,
"step": 1221
},
{
"epoch": 3.125319693094629,
"grad_norm": 0.21205256563770825,
"learning_rate": 5.930597764621347e-05,
"loss": 0.8963,
"step": 1222
},
{
"epoch": 3.1278772378516626,
"grad_norm": 0.19717448713959743,
"learning_rate": 5.916799292089895e-05,
"loss": 0.9564,
"step": 1223
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.2244196417767959,
"learning_rate": 5.9030074646208745e-05,
"loss": 0.9272,
"step": 1224
},
{
"epoch": 3.132992327365729,
"grad_norm": 0.21563385011040548,
"learning_rate": 5.8892223262079144e-05,
"loss": 0.9316,
"step": 1225
},
{
"epoch": 3.135549872122762,
"grad_norm": 0.2350946628160643,
"learning_rate": 5.875443920823297e-05,
"loss": 0.9487,
"step": 1226
},
{
"epoch": 3.1381074168797953,
"grad_norm": 0.2865769039296874,
"learning_rate": 5.861672292417842e-05,
"loss": 0.9492,
"step": 1227
},
{
"epoch": 3.1406649616368285,
"grad_norm": 0.23430970345425967,
"learning_rate": 5.84790748492074e-05,
"loss": 0.966,
"step": 1228
},
{
"epoch": 3.1432225063938617,
"grad_norm": 0.2467472265535791,
"learning_rate": 5.834149542239431e-05,
"loss": 0.9708,
"step": 1229
},
{
"epoch": 3.1457800511508953,
"grad_norm": 0.26772393728125105,
"learning_rate": 5.8203985082594575e-05,
"loss": 0.9557,
"step": 1230
},
{
"epoch": 3.1483375959079285,
"grad_norm": 0.2338023529317996,
"learning_rate": 5.806654426844315e-05,
"loss": 0.9638,
"step": 1231
},
{
"epoch": 3.1508951406649617,
"grad_norm": 0.2523069016121197,
"learning_rate": 5.792917341835335e-05,
"loss": 0.9434,
"step": 1232
},
{
"epoch": 3.153452685421995,
"grad_norm": 0.2766552697496739,
"learning_rate": 5.77918729705152e-05,
"loss": 0.9809,
"step": 1233
},
{
"epoch": 3.156010230179028,
"grad_norm": 0.22646812781120942,
"learning_rate": 5.765464336289424e-05,
"loss": 0.9639,
"step": 1234
},
{
"epoch": 3.1585677749360612,
"grad_norm": 0.2205961359884855,
"learning_rate": 5.751748503322999e-05,
"loss": 0.954,
"step": 1235
},
{
"epoch": 3.1611253196930944,
"grad_norm": 0.2701811323136191,
"learning_rate": 5.7380398419034644e-05,
"loss": 0.9589,
"step": 1236
},
{
"epoch": 3.163682864450128,
"grad_norm": 0.2081039558632908,
"learning_rate": 5.7243383957591586e-05,
"loss": 0.9471,
"step": 1237
},
{
"epoch": 3.166240409207161,
"grad_norm": 0.19643865068397245,
"learning_rate": 5.7106442085954045e-05,
"loss": 0.9518,
"step": 1238
},
{
"epoch": 3.1687979539641944,
"grad_norm": 0.30921257471256036,
"learning_rate": 5.69695732409438e-05,
"loss": 0.9242,
"step": 1239
},
{
"epoch": 3.1713554987212276,
"grad_norm": 0.24583021366711547,
"learning_rate": 5.6832777859149536e-05,
"loss": 0.9423,
"step": 1240
},
{
"epoch": 3.1739130434782608,
"grad_norm": 0.18950822302407402,
"learning_rate": 5.669605637692575e-05,
"loss": 0.932,
"step": 1241
},
{
"epoch": 3.176470588235294,
"grad_norm": 0.25157456578331905,
"learning_rate": 5.655940923039111e-05,
"loss": 0.9379,
"step": 1242
},
{
"epoch": 3.1790281329923276,
"grad_norm": 0.18343916898513093,
"learning_rate": 5.642283685542717e-05,
"loss": 0.9456,
"step": 1243
},
{
"epoch": 3.1815856777493607,
"grad_norm": 0.19560349844702873,
"learning_rate": 5.6286339687677044e-05,
"loss": 0.9328,
"step": 1244
},
{
"epoch": 3.184143222506394,
"grad_norm": 0.189610936953741,
"learning_rate": 5.614991816254388e-05,
"loss": 0.9109,
"step": 1245
},
{
"epoch": 3.186700767263427,
"grad_norm": 0.18320058939508785,
"learning_rate": 5.601357271518959e-05,
"loss": 0.9584,
"step": 1246
},
{
"epoch": 3.1892583120204603,
"grad_norm": 0.17494234166851327,
"learning_rate": 5.587730378053339e-05,
"loss": 0.9656,
"step": 1247
},
{
"epoch": 3.1918158567774935,
"grad_norm": 0.19092078945148688,
"learning_rate": 5.574111179325039e-05,
"loss": 0.9487,
"step": 1248
},
{
"epoch": 3.1943734015345266,
"grad_norm": 0.1860857981568226,
"learning_rate": 5.560499718777031e-05,
"loss": 0.9372,
"step": 1249
},
{
"epoch": 3.1969309462915603,
"grad_norm": 0.18572653447801232,
"learning_rate": 5.5468960398276014e-05,
"loss": 0.9459,
"step": 1250
},
{
"epoch": 3.1994884910485935,
"grad_norm": 0.19107345846336404,
"learning_rate": 5.5333001858702164e-05,
"loss": 0.9255,
"step": 1251
},
{
"epoch": 3.2020460358056266,
"grad_norm": 0.20057541760798753,
"learning_rate": 5.519712200273381e-05,
"loss": 0.9615,
"step": 1252
},
{
"epoch": 3.20460358056266,
"grad_norm": 0.20198119736904155,
"learning_rate": 5.5061321263804933e-05,
"loss": 0.9204,
"step": 1253
},
{
"epoch": 3.207161125319693,
"grad_norm": 0.21942879387381486,
"learning_rate": 5.4925600075097285e-05,
"loss": 0.945,
"step": 1254
},
{
"epoch": 3.209718670076726,
"grad_norm": 0.19469068958831684,
"learning_rate": 5.4789958869538756e-05,
"loss": 0.9435,
"step": 1255
},
{
"epoch": 3.21227621483376,
"grad_norm": 0.20250937006123632,
"learning_rate": 5.4654398079802183e-05,
"loss": 0.9364,
"step": 1256
},
{
"epoch": 3.214833759590793,
"grad_norm": 0.19846072138477766,
"learning_rate": 5.451891813830382e-05,
"loss": 0.94,
"step": 1257
},
{
"epoch": 3.217391304347826,
"grad_norm": 0.20425114535656635,
"learning_rate": 5.4383519477202103e-05,
"loss": 0.9363,
"step": 1258
},
{
"epoch": 3.2199488491048593,
"grad_norm": 0.185008322081447,
"learning_rate": 5.42482025283961e-05,
"loss": 0.9815,
"step": 1259
},
{
"epoch": 3.2225063938618925,
"grad_norm": 0.2151529732841821,
"learning_rate": 5.41129677235243e-05,
"loss": 0.9498,
"step": 1260
},
{
"epoch": 3.2250639386189257,
"grad_norm": 0.1885448397273564,
"learning_rate": 5.397781549396316e-05,
"loss": 0.9337,
"step": 1261
},
{
"epoch": 3.227621483375959,
"grad_norm": 0.21418784649002942,
"learning_rate": 5.3842746270825705e-05,
"loss": 0.9171,
"step": 1262
},
{
"epoch": 3.2301790281329925,
"grad_norm": 0.20068889946827412,
"learning_rate": 5.370776048496026e-05,
"loss": 0.9376,
"step": 1263
},
{
"epoch": 3.2327365728900257,
"grad_norm": 0.24899426008654885,
"learning_rate": 5.357285856694891e-05,
"loss": 0.9429,
"step": 1264
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.19686757692012147,
"learning_rate": 5.34380409471062e-05,
"loss": 0.9377,
"step": 1265
},
{
"epoch": 3.237851662404092,
"grad_norm": 0.24870949090788627,
"learning_rate": 5.33033080554779e-05,
"loss": 0.945,
"step": 1266
},
{
"epoch": 3.2404092071611252,
"grad_norm": 0.20621519140618658,
"learning_rate": 5.3168660321839386e-05,
"loss": 0.9379,
"step": 1267
},
{
"epoch": 3.2429667519181584,
"grad_norm": 0.21652792479122668,
"learning_rate": 5.303409817569449e-05,
"loss": 0.9021,
"step": 1268
},
{
"epoch": 3.2455242966751916,
"grad_norm": 0.19103019263904417,
"learning_rate": 5.2899622046274e-05,
"loss": 0.9613,
"step": 1269
},
{
"epoch": 3.2480818414322252,
"grad_norm": 0.21245341007957305,
"learning_rate": 5.276523236253425e-05,
"loss": 0.9387,
"step": 1270
},
{
"epoch": 3.2506393861892584,
"grad_norm": 0.2106216561170891,
"learning_rate": 5.263092955315595e-05,
"loss": 0.9546,
"step": 1271
},
{
"epoch": 3.2531969309462916,
"grad_norm": 0.197972453520414,
"learning_rate": 5.2496714046542583e-05,
"loss": 0.9391,
"step": 1272
},
{
"epoch": 3.2557544757033248,
"grad_norm": 0.199650022114146,
"learning_rate": 5.2362586270819256e-05,
"loss": 0.9386,
"step": 1273
},
{
"epoch": 3.258312020460358,
"grad_norm": 0.18979777369555925,
"learning_rate": 5.222854665383116e-05,
"loss": 0.9495,
"step": 1274
},
{
"epoch": 3.260869565217391,
"grad_norm": 0.2173804109344821,
"learning_rate": 5.2094595623142326e-05,
"loss": 0.9588,
"step": 1275
},
{
"epoch": 3.2634271099744243,
"grad_norm": 0.2016383197459456,
"learning_rate": 5.1960733606034126e-05,
"loss": 0.9151,
"step": 1276
},
{
"epoch": 3.265984654731458,
"grad_norm": 0.2047292724222713,
"learning_rate": 5.182696102950404e-05,
"loss": 0.9686,
"step": 1277
},
{
"epoch": 3.268542199488491,
"grad_norm": 0.2065833579125683,
"learning_rate": 5.1693278320264304e-05,
"loss": 0.9384,
"step": 1278
},
{
"epoch": 3.2710997442455243,
"grad_norm": 0.20569255957459082,
"learning_rate": 5.1559685904740386e-05,
"loss": 0.9869,
"step": 1279
},
{
"epoch": 3.2736572890025575,
"grad_norm": 0.19840584494069785,
"learning_rate": 5.142618420906985e-05,
"loss": 0.9557,
"step": 1280
},
{
"epoch": 3.2762148337595907,
"grad_norm": 0.20387885459079644,
"learning_rate": 5.1292773659100755e-05,
"loss": 0.9642,
"step": 1281
},
{
"epoch": 3.2787723785166243,
"grad_norm": 0.2101778694530114,
"learning_rate": 5.115945468039048e-05,
"loss": 0.9509,
"step": 1282
},
{
"epoch": 3.2813299232736575,
"grad_norm": 0.2155780933816927,
"learning_rate": 5.1026227698204335e-05,
"loss": 0.9499,
"step": 1283
},
{
"epoch": 3.2838874680306906,
"grad_norm": 0.24104255752130535,
"learning_rate": 5.089309313751415e-05,
"loss": 0.9458,
"step": 1284
},
{
"epoch": 3.286445012787724,
"grad_norm": 0.2121724580915078,
"learning_rate": 5.0760051422996925e-05,
"loss": 0.9499,
"step": 1285
},
{
"epoch": 3.289002557544757,
"grad_norm": 0.20440164305922942,
"learning_rate": 5.0627102979033546e-05,
"loss": 0.9458,
"step": 1286
},
{
"epoch": 3.29156010230179,
"grad_norm": 0.21910653895674295,
"learning_rate": 5.049424822970731e-05,
"loss": 0.9379,
"step": 1287
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.17657372919405595,
"learning_rate": 5.036148759880272e-05,
"loss": 0.9249,
"step": 1288
},
{
"epoch": 3.296675191815857,
"grad_norm": 0.22994935624931387,
"learning_rate": 5.0228821509803984e-05,
"loss": 0.9247,
"step": 1289
},
{
"epoch": 3.29923273657289,
"grad_norm": 0.18809716520389427,
"learning_rate": 5.0096250385893825e-05,
"loss": 0.9236,
"step": 1290
},
{
"epoch": 3.3017902813299234,
"grad_norm": 0.20395108123985592,
"learning_rate": 4.9963774649951975e-05,
"loss": 0.9351,
"step": 1291
},
{
"epoch": 3.3043478260869565,
"grad_norm": 0.21017478598124728,
"learning_rate": 4.983139472455387e-05,
"loss": 0.9603,
"step": 1292
},
{
"epoch": 3.3069053708439897,
"grad_norm": 0.21877137266724161,
"learning_rate": 4.969911103196942e-05,
"loss": 0.9067,
"step": 1293
},
{
"epoch": 3.309462915601023,
"grad_norm": 0.18726348177523444,
"learning_rate": 4.956692399416149e-05,
"loss": 0.9368,
"step": 1294
},
{
"epoch": 3.312020460358056,
"grad_norm": 0.2241750270363803,
"learning_rate": 4.943483403278468e-05,
"loss": 0.947,
"step": 1295
},
{
"epoch": 3.3145780051150897,
"grad_norm": 0.20581443285806397,
"learning_rate": 4.9302841569183884e-05,
"loss": 0.9575,
"step": 1296
},
{
"epoch": 3.317135549872123,
"grad_norm": 0.17452182993008977,
"learning_rate": 4.9170947024393074e-05,
"loss": 0.9156,
"step": 1297
},
{
"epoch": 3.319693094629156,
"grad_norm": 0.198949333785195,
"learning_rate": 4.9039150819133775e-05,
"loss": 0.9348,
"step": 1298
},
{
"epoch": 3.3222506393861893,
"grad_norm": 0.16601657169918604,
"learning_rate": 4.890745337381388e-05,
"loss": 0.9587,
"step": 1299
},
{
"epoch": 3.3248081841432224,
"grad_norm": 0.23036877304791145,
"learning_rate": 4.877585510852627e-05,
"loss": 0.9792,
"step": 1300
},
{
"epoch": 3.3273657289002556,
"grad_norm": 0.18765197640496664,
"learning_rate": 4.864435644304742e-05,
"loss": 0.9253,
"step": 1301
},
{
"epoch": 3.329923273657289,
"grad_norm": 0.19041731553942576,
"learning_rate": 4.851295779683616e-05,
"loss": 0.9535,
"step": 1302
},
{
"epoch": 3.3324808184143224,
"grad_norm": 0.2087435808060436,
"learning_rate": 4.8381659589032186e-05,
"loss": 0.9338,
"step": 1303
},
{
"epoch": 3.3350383631713556,
"grad_norm": 0.1903448069067344,
"learning_rate": 4.825046223845486e-05,
"loss": 0.9499,
"step": 1304
},
{
"epoch": 3.337595907928389,
"grad_norm": 0.21308090181205586,
"learning_rate": 4.811936616360186e-05,
"loss": 0.9256,
"step": 1305
},
{
"epoch": 3.340153452685422,
"grad_norm": 0.2023342708755437,
"learning_rate": 4.798837178264772e-05,
"loss": 0.9582,
"step": 1306
},
{
"epoch": 3.342710997442455,
"grad_norm": 0.21619791962247753,
"learning_rate": 4.78574795134427e-05,
"loss": 0.9125,
"step": 1307
},
{
"epoch": 3.3452685421994883,
"grad_norm": 0.2487539660815107,
"learning_rate": 4.772668977351128e-05,
"loss": 0.9537,
"step": 1308
},
{
"epoch": 3.3478260869565215,
"grad_norm": 0.2240156883350933,
"learning_rate": 4.7596002980050834e-05,
"loss": 0.9401,
"step": 1309
},
{
"epoch": 3.350383631713555,
"grad_norm": 0.2251746608186689,
"learning_rate": 4.7465419549930476e-05,
"loss": 0.9782,
"step": 1310
},
{
"epoch": 3.3529411764705883,
"grad_norm": 0.22881310384597994,
"learning_rate": 4.733493989968949e-05,
"loss": 0.9458,
"step": 1311
},
{
"epoch": 3.3554987212276215,
"grad_norm": 0.2141099007638843,
"learning_rate": 4.7204564445536234e-05,
"loss": 0.9396,
"step": 1312
},
{
"epoch": 3.3580562659846547,
"grad_norm": 0.1882802550926345,
"learning_rate": 4.707429360334662e-05,
"loss": 0.942,
"step": 1313
},
{
"epoch": 3.360613810741688,
"grad_norm": 0.2179119833942681,
"learning_rate": 4.694412778866285e-05,
"loss": 0.9504,
"step": 1314
},
{
"epoch": 3.363171355498721,
"grad_norm": 0.16843886415285414,
"learning_rate": 4.681406741669216e-05,
"loss": 0.9221,
"step": 1315
},
{
"epoch": 3.3657289002557547,
"grad_norm": 0.21980007814521796,
"learning_rate": 4.668411290230543e-05,
"loss": 0.944,
"step": 1316
},
{
"epoch": 3.368286445012788,
"grad_norm": 0.1510130725197139,
"learning_rate": 4.655426466003586e-05,
"loss": 0.9563,
"step": 1317
},
{
"epoch": 3.370843989769821,
"grad_norm": 0.19586517189701522,
"learning_rate": 4.6424523104077654e-05,
"loss": 0.9508,
"step": 1318
},
{
"epoch": 3.373401534526854,
"grad_norm": 0.1995467600478656,
"learning_rate": 4.629488864828472e-05,
"loss": 0.9502,
"step": 1319
},
{
"epoch": 3.3759590792838874,
"grad_norm": 0.1742993616386661,
"learning_rate": 4.6165361706169325e-05,
"loss": 0.9268,
"step": 1320
},
{
"epoch": 3.3785166240409206,
"grad_norm": 0.2067544794585532,
"learning_rate": 4.603594269090078e-05,
"loss": 0.9268,
"step": 1321
},
{
"epoch": 3.381074168797954,
"grad_norm": 0.2227068577818483,
"learning_rate": 4.5906632015304116e-05,
"loss": 0.9358,
"step": 1322
},
{
"epoch": 3.3836317135549874,
"grad_norm": 0.2034466989052333,
"learning_rate": 4.5777430091858855e-05,
"loss": 0.9302,
"step": 1323
},
{
"epoch": 3.3861892583120206,
"grad_norm": 0.20709571806774676,
"learning_rate": 4.564833733269755e-05,
"loss": 0.9427,
"step": 1324
},
{
"epoch": 3.3887468030690537,
"grad_norm": 0.22013092566675613,
"learning_rate": 4.5519354149604474e-05,
"loss": 0.9437,
"step": 1325
},
{
"epoch": 3.391304347826087,
"grad_norm": 0.18450541197105383,
"learning_rate": 4.539048095401452e-05,
"loss": 0.9466,
"step": 1326
},
{
"epoch": 3.39386189258312,
"grad_norm": 0.22548387813850762,
"learning_rate": 4.526171815701165e-05,
"loss": 0.9336,
"step": 1327
},
{
"epoch": 3.3964194373401533,
"grad_norm": 0.1820733823905873,
"learning_rate": 4.513306616932764e-05,
"loss": 0.9215,
"step": 1328
},
{
"epoch": 3.398976982097187,
"grad_norm": 0.21404349632115405,
"learning_rate": 4.5004525401340915e-05,
"loss": 0.9801,
"step": 1329
},
{
"epoch": 3.40153452685422,
"grad_norm": 0.18377817821243256,
"learning_rate": 4.487609626307508e-05,
"loss": 0.9655,
"step": 1330
},
{
"epoch": 3.4040920716112533,
"grad_norm": 0.1923893878636668,
"learning_rate": 4.4747779164197535e-05,
"loss": 0.9382,
"step": 1331
},
{
"epoch": 3.4066496163682864,
"grad_norm": 0.19516009680845245,
"learning_rate": 4.4619574514018486e-05,
"loss": 0.9557,
"step": 1332
},
{
"epoch": 3.4092071611253196,
"grad_norm": 0.19144644869283248,
"learning_rate": 4.449148272148934e-05,
"loss": 0.9345,
"step": 1333
},
{
"epoch": 3.411764705882353,
"grad_norm": 0.1817955488888704,
"learning_rate": 4.436350419520154e-05,
"loss": 0.9608,
"step": 1334
},
{
"epoch": 3.414322250639386,
"grad_norm": 0.2056911128568184,
"learning_rate": 4.423563934338519e-05,
"loss": 0.9458,
"step": 1335
},
{
"epoch": 3.4168797953964196,
"grad_norm": 0.1693771378014072,
"learning_rate": 4.410788857390785e-05,
"loss": 0.9466,
"step": 1336
},
{
"epoch": 3.419437340153453,
"grad_norm": 0.20830311663566495,
"learning_rate": 4.39802522942731e-05,
"loss": 0.9408,
"step": 1337
},
{
"epoch": 3.421994884910486,
"grad_norm": 0.1698790309922409,
"learning_rate": 4.385273091161937e-05,
"loss": 0.9305,
"step": 1338
},
{
"epoch": 3.424552429667519,
"grad_norm": 0.19474240897387077,
"learning_rate": 4.372532483271863e-05,
"loss": 0.9375,
"step": 1339
},
{
"epoch": 3.4271099744245523,
"grad_norm": 0.2059429092680418,
"learning_rate": 4.3598034463974966e-05,
"loss": 0.9869,
"step": 1340
},
{
"epoch": 3.4296675191815855,
"grad_norm": 0.19031026060303782,
"learning_rate": 4.347086021142339e-05,
"loss": 0.9765,
"step": 1341
},
{
"epoch": 3.4322250639386187,
"grad_norm": 0.19960933133782244,
"learning_rate": 4.3343802480728544e-05,
"loss": 0.9431,
"step": 1342
},
{
"epoch": 3.4347826086956523,
"grad_norm": 0.1924073308227482,
"learning_rate": 4.321686167718337e-05,
"loss": 0.9545,
"step": 1343
},
{
"epoch": 3.4373401534526855,
"grad_norm": 0.2028658725938022,
"learning_rate": 4.309003820570785e-05,
"loss": 0.9377,
"step": 1344
},
{
"epoch": 3.4398976982097187,
"grad_norm": 0.2106823975486889,
"learning_rate": 4.296333247084764e-05,
"loss": 0.9283,
"step": 1345
},
{
"epoch": 3.442455242966752,
"grad_norm": 0.21370019365379003,
"learning_rate": 4.283674487677297e-05,
"loss": 0.9663,
"step": 1346
},
{
"epoch": 3.445012787723785,
"grad_norm": 0.20381679039668288,
"learning_rate": 4.271027582727703e-05,
"loss": 0.9425,
"step": 1347
},
{
"epoch": 3.4475703324808182,
"grad_norm": 0.2465303759456818,
"learning_rate": 4.2583925725774996e-05,
"loss": 0.963,
"step": 1348
},
{
"epoch": 3.4501278772378514,
"grad_norm": 0.2017710128697274,
"learning_rate": 4.2457694975302625e-05,
"loss": 0.969,
"step": 1349
},
{
"epoch": 3.452685421994885,
"grad_norm": 0.2599485575517086,
"learning_rate": 4.233158397851494e-05,
"loss": 0.9578,
"step": 1350
},
{
"epoch": 3.455242966751918,
"grad_norm": 0.20994916380961168,
"learning_rate": 4.220559313768492e-05,
"loss": 0.9517,
"step": 1351
},
{
"epoch": 3.4578005115089514,
"grad_norm": 0.25562334357376887,
"learning_rate": 4.207972285470236e-05,
"loss": 0.9593,
"step": 1352
},
{
"epoch": 3.4603580562659846,
"grad_norm": 0.2018942765243476,
"learning_rate": 4.1953973531072403e-05,
"loss": 0.9238,
"step": 1353
},
{
"epoch": 3.4629156010230178,
"grad_norm": 0.23893893502461097,
"learning_rate": 4.1828345567914426e-05,
"loss": 0.9463,
"step": 1354
},
{
"epoch": 3.4654731457800514,
"grad_norm": 0.2377570507765394,
"learning_rate": 4.17028393659606e-05,
"loss": 0.9379,
"step": 1355
},
{
"epoch": 3.4680306905370846,
"grad_norm": 0.21617110584103066,
"learning_rate": 4.157745532555484e-05,
"loss": 0.9445,
"step": 1356
},
{
"epoch": 3.4705882352941178,
"grad_norm": 0.20973373939841763,
"learning_rate": 4.145219384665128e-05,
"loss": 0.9471,
"step": 1357
},
{
"epoch": 3.473145780051151,
"grad_norm": 0.19248666440528944,
"learning_rate": 4.1327055328813036e-05,
"loss": 0.9492,
"step": 1358
},
{
"epoch": 3.475703324808184,
"grad_norm": 0.19782620860430303,
"learning_rate": 4.1202040171211195e-05,
"loss": 0.9677,
"step": 1359
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.18288110899297144,
"learning_rate": 4.107714877262318e-05,
"loss": 0.9574,
"step": 1360
},
{
"epoch": 3.4808184143222505,
"grad_norm": 0.18982354052970898,
"learning_rate": 4.0952381531431716e-05,
"loss": 0.9411,
"step": 1361
},
{
"epoch": 3.483375959079284,
"grad_norm": 0.19047078322563796,
"learning_rate": 4.082773884562342e-05,
"loss": 0.9465,
"step": 1362
},
{
"epoch": 3.4859335038363173,
"grad_norm": 0.20024490556690386,
"learning_rate": 4.0703221112787774e-05,
"loss": 0.9631,
"step": 1363
},
{
"epoch": 3.4884910485933505,
"grad_norm": 0.18855297057246742,
"learning_rate": 4.057882873011543e-05,
"loss": 0.9333,
"step": 1364
},
{
"epoch": 3.4910485933503836,
"grad_norm": 0.18121257314529818,
"learning_rate": 4.045456209439734e-05,
"loss": 0.9683,
"step": 1365
},
{
"epoch": 3.493606138107417,
"grad_norm": 0.19866185503250056,
"learning_rate": 4.033042160202337e-05,
"loss": 0.9872,
"step": 1366
},
{
"epoch": 3.49616368286445,
"grad_norm": 0.17010036933663283,
"learning_rate": 4.020640764898096e-05,
"loss": 0.9685,
"step": 1367
},
{
"epoch": 3.498721227621483,
"grad_norm": 0.18176622769606524,
"learning_rate": 4.0082520630853865e-05,
"loss": 0.9112,
"step": 1368
},
{
"epoch": 3.501278772378517,
"grad_norm": 0.1861883153790341,
"learning_rate": 3.995876094282104e-05,
"loss": 0.9585,
"step": 1369
},
{
"epoch": 3.50383631713555,
"grad_norm": 0.19579755858911602,
"learning_rate": 3.983512897965519e-05,
"loss": 0.959,
"step": 1370
},
{
"epoch": 3.506393861892583,
"grad_norm": 0.18488711544490097,
"learning_rate": 3.9711625135721664e-05,
"loss": 0.9555,
"step": 1371
},
{
"epoch": 3.5089514066496164,
"grad_norm": 0.2073614939639127,
"learning_rate": 3.958824980497704e-05,
"loss": 0.9744,
"step": 1372
},
{
"epoch": 3.5115089514066495,
"grad_norm": 0.17154095562950622,
"learning_rate": 3.946500338096811e-05,
"loss": 0.9353,
"step": 1373
},
{
"epoch": 3.5140664961636827,
"grad_norm": 0.20478213377969626,
"learning_rate": 3.934188625683037e-05,
"loss": 0.9568,
"step": 1374
},
{
"epoch": 3.516624040920716,
"grad_norm": 0.18373687324276738,
"learning_rate": 3.9218898825286806e-05,
"loss": 0.9279,
"step": 1375
},
{
"epoch": 3.5191815856777495,
"grad_norm": 0.1716453870437831,
"learning_rate": 3.9096041478646885e-05,
"loss": 0.9342,
"step": 1376
},
{
"epoch": 3.5217391304347827,
"grad_norm": 0.18268819201544698,
"learning_rate": 3.8973314608805e-05,
"loss": 0.962,
"step": 1377
},
{
"epoch": 3.524296675191816,
"grad_norm": 0.16258821810908097,
"learning_rate": 3.885071860723937e-05,
"loss": 0.9293,
"step": 1378
},
{
"epoch": 3.526854219948849,
"grad_norm": 0.165376063640211,
"learning_rate": 3.8728253865010765e-05,
"loss": 0.9895,
"step": 1379
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.16721193942916188,
"learning_rate": 3.8605920772761274e-05,
"loss": 0.9328,
"step": 1380
},
{
"epoch": 3.531969309462916,
"grad_norm": 0.16130857457103082,
"learning_rate": 3.848371972071304e-05,
"loss": 0.9859,
"step": 1381
},
{
"epoch": 3.5345268542199486,
"grad_norm": 0.16278759213568428,
"learning_rate": 3.8361651098666967e-05,
"loss": 0.9569,
"step": 1382
},
{
"epoch": 3.5370843989769822,
"grad_norm": 0.17183294163130294,
"learning_rate": 3.8239715296001654e-05,
"loss": 0.9418,
"step": 1383
},
{
"epoch": 3.5396419437340154,
"grad_norm": 0.155240959003008,
"learning_rate": 3.8117912701671905e-05,
"loss": 0.9696,
"step": 1384
},
{
"epoch": 3.5421994884910486,
"grad_norm": 0.17273359598041008,
"learning_rate": 3.7996243704207686e-05,
"loss": 0.9502,
"step": 1385
},
{
"epoch": 3.544757033248082,
"grad_norm": 0.1703572907276737,
"learning_rate": 3.787470869171277e-05,
"loss": 0.9673,
"step": 1386
},
{
"epoch": 3.547314578005115,
"grad_norm": 0.163047329660931,
"learning_rate": 3.7753308051863534e-05,
"loss": 0.9244,
"step": 1387
},
{
"epoch": 3.5498721227621486,
"grad_norm": 0.16125670043718637,
"learning_rate": 3.763204217190778e-05,
"loss": 0.9414,
"step": 1388
},
{
"epoch": 3.5524296675191813,
"grad_norm": 0.17450887360011574,
"learning_rate": 3.751091143866338e-05,
"loss": 0.9677,
"step": 1389
},
{
"epoch": 3.554987212276215,
"grad_norm": 0.15580595508138104,
"learning_rate": 3.7389916238517224e-05,
"loss": 0.9758,
"step": 1390
},
{
"epoch": 3.557544757033248,
"grad_norm": 0.17069367779408143,
"learning_rate": 3.726905695742372e-05,
"loss": 0.9142,
"step": 1391
},
{
"epoch": 3.5601023017902813,
"grad_norm": 0.16910211167776398,
"learning_rate": 3.7148333980903796e-05,
"loss": 0.9389,
"step": 1392
},
{
"epoch": 3.5626598465473145,
"grad_norm": 0.1663225487056752,
"learning_rate": 3.7027747694043645e-05,
"loss": 0.9557,
"step": 1393
},
{
"epoch": 3.5652173913043477,
"grad_norm": 0.16804185773204355,
"learning_rate": 3.690729848149335e-05,
"loss": 0.9588,
"step": 1394
},
{
"epoch": 3.5677749360613813,
"grad_norm": 0.16402784688128466,
"learning_rate": 3.678698672746581e-05,
"loss": 0.964,
"step": 1395
},
{
"epoch": 3.5703324808184145,
"grad_norm": 0.18174268933477528,
"learning_rate": 3.6666812815735424e-05,
"loss": 0.9433,
"step": 1396
},
{
"epoch": 3.5728900255754477,
"grad_norm": 0.15614453400715234,
"learning_rate": 3.6546777129636886e-05,
"loss": 0.9252,
"step": 1397
},
{
"epoch": 3.575447570332481,
"grad_norm": 0.16700607138470522,
"learning_rate": 3.6426880052064026e-05,
"loss": 0.9636,
"step": 1398
},
{
"epoch": 3.578005115089514,
"grad_norm": 0.20568461367374485,
"learning_rate": 3.630712196546844e-05,
"loss": 0.9649,
"step": 1399
},
{
"epoch": 3.580562659846547,
"grad_norm": 0.14660657078481024,
"learning_rate": 3.6187503251858505e-05,
"loss": 0.9267,
"step": 1400
},
{
"epoch": 3.5831202046035804,
"grad_norm": 0.16935747703951526,
"learning_rate": 3.6068024292797945e-05,
"loss": 0.9356,
"step": 1401
},
{
"epoch": 3.585677749360614,
"grad_norm": 0.15782075450424704,
"learning_rate": 3.59486854694046e-05,
"loss": 0.9548,
"step": 1402
},
{
"epoch": 3.588235294117647,
"grad_norm": 0.17132410907270623,
"learning_rate": 3.582948716234948e-05,
"loss": 0.9493,
"step": 1403
},
{
"epoch": 3.5907928388746804,
"grad_norm": 0.16858095077712948,
"learning_rate": 3.571042975185524e-05,
"loss": 0.9552,
"step": 1404
},
{
"epoch": 3.5933503836317136,
"grad_norm": 0.1634251285228488,
"learning_rate": 3.559151361769517e-05,
"loss": 0.9466,
"step": 1405
},
{
"epoch": 3.5959079283887467,
"grad_norm": 0.1729430282795056,
"learning_rate": 3.547273913919182e-05,
"loss": 0.95,
"step": 1406
},
{
"epoch": 3.59846547314578,
"grad_norm": 0.1821907434145911,
"learning_rate": 3.535410669521605e-05,
"loss": 0.9588,
"step": 1407
},
{
"epoch": 3.601023017902813,
"grad_norm": 0.15781654283531932,
"learning_rate": 3.5235616664185465e-05,
"loss": 0.9591,
"step": 1408
},
{
"epoch": 3.6035805626598467,
"grad_norm": 0.1677674098580371,
"learning_rate": 3.5117269424063466e-05,
"loss": 0.9372,
"step": 1409
},
{
"epoch": 3.60613810741688,
"grad_norm": 0.1668467714604029,
"learning_rate": 3.4999065352358055e-05,
"loss": 0.9128,
"step": 1410
},
{
"epoch": 3.608695652173913,
"grad_norm": 0.16023804099695482,
"learning_rate": 3.488100482612046e-05,
"loss": 0.9533,
"step": 1411
},
{
"epoch": 3.6112531969309463,
"grad_norm": 0.17448057130149636,
"learning_rate": 3.476308822194404e-05,
"loss": 0.9696,
"step": 1412
},
{
"epoch": 3.6138107416879794,
"grad_norm": 0.17176757036978785,
"learning_rate": 3.4645315915963085e-05,
"loss": 0.9295,
"step": 1413
},
{
"epoch": 3.6163682864450126,
"grad_norm": 0.16582442582314796,
"learning_rate": 3.452768828385156e-05,
"loss": 0.9478,
"step": 1414
},
{
"epoch": 3.618925831202046,
"grad_norm": 0.16508960150611576,
"learning_rate": 3.4410205700822e-05,
"loss": 0.9267,
"step": 1415
},
{
"epoch": 3.6214833759590794,
"grad_norm": 0.15842544276922507,
"learning_rate": 3.42928685416242e-05,
"loss": 0.9487,
"step": 1416
},
{
"epoch": 3.6240409207161126,
"grad_norm": 0.16737847990453103,
"learning_rate": 3.417567718054413e-05,
"loss": 0.9257,
"step": 1417
},
{
"epoch": 3.626598465473146,
"grad_norm": 0.16179442819088455,
"learning_rate": 3.405863199140271e-05,
"loss": 0.9594,
"step": 1418
},
{
"epoch": 3.629156010230179,
"grad_norm": 0.17740705653386357,
"learning_rate": 3.3941733347554434e-05,
"loss": 0.954,
"step": 1419
},
{
"epoch": 3.631713554987212,
"grad_norm": 0.1745105989485467,
"learning_rate": 3.3824981621886545e-05,
"loss": 0.9536,
"step": 1420
},
{
"epoch": 3.634271099744246,
"grad_norm": 0.1927262004385616,
"learning_rate": 3.370837718681754e-05,
"loss": 0.9685,
"step": 1421
},
{
"epoch": 3.6368286445012785,
"grad_norm": 0.15752590578867717,
"learning_rate": 3.3591920414296094e-05,
"loss": 0.9248,
"step": 1422
},
{
"epoch": 3.639386189258312,
"grad_norm": 0.21240595387549532,
"learning_rate": 3.347561167579986e-05,
"loss": 0.9521,
"step": 1423
},
{
"epoch": 3.6419437340153453,
"grad_norm": 0.17508530317965004,
"learning_rate": 3.3359451342334306e-05,
"loss": 0.9431,
"step": 1424
},
{
"epoch": 3.6445012787723785,
"grad_norm": 0.21738581132916354,
"learning_rate": 3.324343978443148e-05,
"loss": 0.9716,
"step": 1425
},
{
"epoch": 3.6470588235294117,
"grad_norm": 0.16746773638107448,
"learning_rate": 3.3127577372148874e-05,
"loss": 0.9322,
"step": 1426
},
{
"epoch": 3.649616368286445,
"grad_norm": 0.2122059201301744,
"learning_rate": 3.301186447506827e-05,
"loss": 0.9422,
"step": 1427
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.15741451467355758,
"learning_rate": 3.289630146229449e-05,
"loss": 0.9366,
"step": 1428
},
{
"epoch": 3.6547314578005117,
"grad_norm": 0.19813994445803942,
"learning_rate": 3.278088870245423e-05,
"loss": 0.9286,
"step": 1429
},
{
"epoch": 3.657289002557545,
"grad_norm": 0.16851843081939155,
"learning_rate": 3.2665626563694937e-05,
"loss": 0.9572,
"step": 1430
},
{
"epoch": 3.659846547314578,
"grad_norm": 0.20717471275600138,
"learning_rate": 3.2550515413683574e-05,
"loss": 0.9512,
"step": 1431
},
{
"epoch": 3.662404092071611,
"grad_norm": 0.16245953402744545,
"learning_rate": 3.2435555619605504e-05,
"loss": 0.9542,
"step": 1432
},
{
"epoch": 3.6649616368286444,
"grad_norm": 0.19641538640030912,
"learning_rate": 3.232074754816323e-05,
"loss": 0.9306,
"step": 1433
},
{
"epoch": 3.6675191815856776,
"grad_norm": 0.1594631052144963,
"learning_rate": 3.220609156557544e-05,
"loss": 0.9363,
"step": 1434
},
{
"epoch": 3.670076726342711,
"grad_norm": 0.18455147659478868,
"learning_rate": 3.209158803757546e-05,
"loss": 0.9321,
"step": 1435
},
{
"epoch": 3.6726342710997444,
"grad_norm": 0.1790498881096886,
"learning_rate": 3.1977237329410446e-05,
"loss": 0.9608,
"step": 1436
},
{
"epoch": 3.6751918158567776,
"grad_norm": 0.1870454897435218,
"learning_rate": 3.186303980584012e-05,
"loss": 0.9389,
"step": 1437
},
{
"epoch": 3.6777493606138107,
"grad_norm": 0.20530561810770268,
"learning_rate": 3.174899583113548e-05,
"loss": 0.9945,
"step": 1438
},
{
"epoch": 3.680306905370844,
"grad_norm": 0.18019213638281067,
"learning_rate": 3.1635105769077766e-05,
"loss": 0.9307,
"step": 1439
},
{
"epoch": 3.682864450127877,
"grad_norm": 0.20610761052130405,
"learning_rate": 3.152136998295727e-05,
"loss": 0.9321,
"step": 1440
},
{
"epoch": 3.6854219948849103,
"grad_norm": 0.17985929842660886,
"learning_rate": 3.140778883557213e-05,
"loss": 0.932,
"step": 1441
},
{
"epoch": 3.687979539641944,
"grad_norm": 0.20013068677532989,
"learning_rate": 3.129436268922728e-05,
"loss": 0.9324,
"step": 1442
},
{
"epoch": 3.690537084398977,
"grad_norm": 0.17562501633026537,
"learning_rate": 3.118109190573313e-05,
"loss": 0.9145,
"step": 1443
},
{
"epoch": 3.6930946291560103,
"grad_norm": 0.18827294282018908,
"learning_rate": 3.106797684640464e-05,
"loss": 0.9402,
"step": 1444
},
{
"epoch": 3.6956521739130435,
"grad_norm": 0.20170283801470837,
"learning_rate": 3.0955017872059956e-05,
"loss": 0.9591,
"step": 1445
},
{
"epoch": 3.6982097186700766,
"grad_norm": 0.15387225427234089,
"learning_rate": 3.084221534301926e-05,
"loss": 0.9253,
"step": 1446
},
{
"epoch": 3.70076726342711,
"grad_norm": 0.24032338349831264,
"learning_rate": 3.0729569619103876e-05,
"loss": 0.9501,
"step": 1447
},
{
"epoch": 3.703324808184143,
"grad_norm": 0.1613801252077293,
"learning_rate": 3.061708105963481e-05,
"loss": 0.9706,
"step": 1448
},
{
"epoch": 3.7058823529411766,
"grad_norm": 0.18342909310635377,
"learning_rate": 3.0504750023431787e-05,
"loss": 0.9268,
"step": 1449
},
{
"epoch": 3.70843989769821,
"grad_norm": 0.1656531219879725,
"learning_rate": 3.039257686881209e-05,
"loss": 0.9385,
"step": 1450
},
{
"epoch": 3.710997442455243,
"grad_norm": 0.1781080191407481,
"learning_rate": 3.028056195358936e-05,
"loss": 0.9201,
"step": 1451
},
{
"epoch": 3.713554987212276,
"grad_norm": 0.1682926250161123,
"learning_rate": 3.016870563507241e-05,
"loss": 0.9486,
"step": 1452
},
{
"epoch": 3.7161125319693094,
"grad_norm": 0.17403568022524737,
"learning_rate": 3.0057008270064226e-05,
"loss": 0.9326,
"step": 1453
},
{
"epoch": 3.718670076726343,
"grad_norm": 0.17412534323602966,
"learning_rate": 2.9945470214860815e-05,
"loss": 0.9737,
"step": 1454
},
{
"epoch": 3.7212276214833757,
"grad_norm": 0.2012938530305388,
"learning_rate": 2.9834091825249908e-05,
"loss": 0.9319,
"step": 1455
},
{
"epoch": 3.7237851662404093,
"grad_norm": 0.15521247782508635,
"learning_rate": 2.9722873456509985e-05,
"loss": 0.9289,
"step": 1456
},
{
"epoch": 3.7263427109974425,
"grad_norm": 0.15552821509875525,
"learning_rate": 2.961181546340906e-05,
"loss": 0.9707,
"step": 1457
},
{
"epoch": 3.7289002557544757,
"grad_norm": 0.19037886779641314,
"learning_rate": 2.95009182002036e-05,
"loss": 0.9313,
"step": 1458
},
{
"epoch": 3.731457800511509,
"grad_norm": 0.16615970202045902,
"learning_rate": 2.939018202063732e-05,
"loss": 0.9647,
"step": 1459
},
{
"epoch": 3.734015345268542,
"grad_norm": 0.17646317393385902,
"learning_rate": 2.9279607277940196e-05,
"loss": 0.9474,
"step": 1460
},
{
"epoch": 3.7365728900255757,
"grad_norm": 0.16080135640987508,
"learning_rate": 2.9169194324827183e-05,
"loss": 0.926,
"step": 1461
},
{
"epoch": 3.7391304347826084,
"grad_norm": 0.17325852442311754,
"learning_rate": 2.9058943513497158e-05,
"loss": 0.9312,
"step": 1462
},
{
"epoch": 3.741687979539642,
"grad_norm": 0.2657172615999172,
"learning_rate": 2.8948855195631797e-05,
"loss": 0.9417,
"step": 1463
},
{
"epoch": 3.7442455242966752,
"grad_norm": 0.18232454995244132,
"learning_rate": 2.883892972239445e-05,
"loss": 0.9596,
"step": 1464
},
{
"epoch": 3.7468030690537084,
"grad_norm": 0.15153887237658853,
"learning_rate": 2.8729167444429042e-05,
"loss": 0.9476,
"step": 1465
},
{
"epoch": 3.7493606138107416,
"grad_norm": 0.17675913819692224,
"learning_rate": 2.8619568711858858e-05,
"loss": 0.945,
"step": 1466
},
{
"epoch": 3.7519181585677748,
"grad_norm": 0.16206615280321732,
"learning_rate": 2.8510133874285633e-05,
"loss": 0.9462,
"step": 1467
},
{
"epoch": 3.7544757033248084,
"grad_norm": 0.1553778010776279,
"learning_rate": 2.8400863280788207e-05,
"loss": 0.9407,
"step": 1468
},
{
"epoch": 3.7570332480818416,
"grad_norm": 0.16829547679009138,
"learning_rate": 2.829175727992147e-05,
"loss": 0.963,
"step": 1469
},
{
"epoch": 3.7595907928388748,
"grad_norm": 0.13746655170307476,
"learning_rate": 2.818281621971541e-05,
"loss": 0.9221,
"step": 1470
},
{
"epoch": 3.762148337595908,
"grad_norm": 0.16271667131621254,
"learning_rate": 2.8074040447673794e-05,
"loss": 0.9535,
"step": 1471
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.16318435465235073,
"learning_rate": 2.7965430310773184e-05,
"loss": 0.9475,
"step": 1472
},
{
"epoch": 3.7672634271099743,
"grad_norm": 0.16520541373584413,
"learning_rate": 2.7856986155461777e-05,
"loss": 0.9315,
"step": 1473
},
{
"epoch": 3.7698209718670075,
"grad_norm": 0.32117889861607873,
"learning_rate": 2.7748708327658317e-05,
"loss": 0.9455,
"step": 1474
},
{
"epoch": 3.772378516624041,
"grad_norm": 0.17314463246020131,
"learning_rate": 2.7640597172751004e-05,
"loss": 0.9525,
"step": 1475
},
{
"epoch": 3.7749360613810743,
"grad_norm": 0.15225032038812816,
"learning_rate": 2.7532653035596336e-05,
"loss": 0.9453,
"step": 1476
},
{
"epoch": 3.7774936061381075,
"grad_norm": 0.17247417052786013,
"learning_rate": 2.7424876260518146e-05,
"loss": 0.9152,
"step": 1477
},
{
"epoch": 3.7800511508951407,
"grad_norm": 0.15503112719134568,
"learning_rate": 2.7317267191306318e-05,
"loss": 0.9398,
"step": 1478
},
{
"epoch": 3.782608695652174,
"grad_norm": 0.1631084235061464,
"learning_rate": 2.7209826171215827e-05,
"loss": 0.9246,
"step": 1479
},
{
"epoch": 3.785166240409207,
"grad_norm": 0.15506280568530903,
"learning_rate": 2.7102553542965577e-05,
"loss": 0.936,
"step": 1480
},
{
"epoch": 3.78772378516624,
"grad_norm": 0.1404687271754989,
"learning_rate": 2.6995449648737343e-05,
"loss": 0.9359,
"step": 1481
},
{
"epoch": 3.790281329923274,
"grad_norm": 0.1557007128341937,
"learning_rate": 2.6888514830174678e-05,
"loss": 0.954,
"step": 1482
},
{
"epoch": 3.792838874680307,
"grad_norm": 0.16612555940333462,
"learning_rate": 2.6781749428381752e-05,
"loss": 1.0034,
"step": 1483
},
{
"epoch": 3.79539641943734,
"grad_norm": 0.1733496961568388,
"learning_rate": 2.6675153783922457e-05,
"loss": 0.9518,
"step": 1484
},
{
"epoch": 3.7979539641943734,
"grad_norm": 0.15940418283478483,
"learning_rate": 2.6568728236819023e-05,
"loss": 0.9817,
"step": 1485
},
{
"epoch": 3.8005115089514065,
"grad_norm": 0.19079011728203774,
"learning_rate": 2.6462473126551187e-05,
"loss": 0.9735,
"step": 1486
},
{
"epoch": 3.80306905370844,
"grad_norm": 0.16130729906636684,
"learning_rate": 2.635638879205504e-05,
"loss": 0.9579,
"step": 1487
},
{
"epoch": 3.805626598465473,
"grad_norm": 0.1745866503183891,
"learning_rate": 2.625047557172189e-05,
"loss": 0.9402,
"step": 1488
},
{
"epoch": 3.8081841432225065,
"grad_norm": 0.18057372768582713,
"learning_rate": 2.6144733803397212e-05,
"loss": 0.9474,
"step": 1489
},
{
"epoch": 3.8107416879795397,
"grad_norm": 0.1560777993171654,
"learning_rate": 2.6039163824379588e-05,
"loss": 0.9506,
"step": 1490
},
{
"epoch": 3.813299232736573,
"grad_norm": 0.1674616567029557,
"learning_rate": 2.5933765971419647e-05,
"loss": 0.9488,
"step": 1491
},
{
"epoch": 3.815856777493606,
"grad_norm": 0.15672982172497663,
"learning_rate": 2.582854058071892e-05,
"loss": 0.9458,
"step": 1492
},
{
"epoch": 3.8184143222506393,
"grad_norm": 0.1558200464104945,
"learning_rate": 2.5723487987928817e-05,
"loss": 0.9518,
"step": 1493
},
{
"epoch": 3.820971867007673,
"grad_norm": 0.14208299213871128,
"learning_rate": 2.5618608528149614e-05,
"loss": 0.93,
"step": 1494
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.16087610572734629,
"learning_rate": 2.5513902535929288e-05,
"loss": 0.9763,
"step": 1495
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.1493299114392072,
"learning_rate": 2.5409370345262385e-05,
"loss": 0.9471,
"step": 1496
},
{
"epoch": 3.8286445012787724,
"grad_norm": 0.15214002644065255,
"learning_rate": 2.5305012289589223e-05,
"loss": 0.9588,
"step": 1497
},
{
"epoch": 3.8312020460358056,
"grad_norm": 0.15727057443971326,
"learning_rate": 2.5200828701794543e-05,
"loss": 0.9294,
"step": 1498
},
{
"epoch": 3.833759590792839,
"grad_norm": 0.14966978310373255,
"learning_rate": 2.5096819914206592e-05,
"loss": 0.9372,
"step": 1499
},
{
"epoch": 3.836317135549872,
"grad_norm": 0.160200304381001,
"learning_rate": 2.4992986258596023e-05,
"loss": 0.9648,
"step": 1500
},
{
"epoch": 3.8388746803069056,
"grad_norm": 0.1364407301299318,
"learning_rate": 2.4889328066174932e-05,
"loss": 0.9458,
"step": 1501
},
{
"epoch": 3.8414322250639388,
"grad_norm": 0.15554384512550426,
"learning_rate": 2.4785845667595565e-05,
"loss": 0.9532,
"step": 1502
},
{
"epoch": 3.843989769820972,
"grad_norm": 0.14270917443883158,
"learning_rate": 2.4682539392949494e-05,
"loss": 0.9194,
"step": 1503
},
{
"epoch": 3.846547314578005,
"grad_norm": 0.15315949958673647,
"learning_rate": 2.4579409571766543e-05,
"loss": 0.9619,
"step": 1504
},
{
"epoch": 3.8491048593350383,
"grad_norm": 0.14236120859618645,
"learning_rate": 2.4476456533013597e-05,
"loss": 0.9637,
"step": 1505
},
{
"epoch": 3.8516624040920715,
"grad_norm": 0.14065482492078218,
"learning_rate": 2.437368060509365e-05,
"loss": 0.9406,
"step": 1506
},
{
"epoch": 3.8542199488491047,
"grad_norm": 0.13361767868605823,
"learning_rate": 2.427108211584476e-05,
"loss": 0.9595,
"step": 1507
},
{
"epoch": 3.8567774936061383,
"grad_norm": 0.13594955260031957,
"learning_rate": 2.4168661392538982e-05,
"loss": 0.9421,
"step": 1508
},
{
"epoch": 3.8593350383631715,
"grad_norm": 0.13851801316117543,
"learning_rate": 2.4066418761881308e-05,
"loss": 0.9687,
"step": 1509
},
{
"epoch": 3.8618925831202047,
"grad_norm": 0.13380711931983305,
"learning_rate": 2.396435455000864e-05,
"loss": 0.9468,
"step": 1510
},
{
"epoch": 3.864450127877238,
"grad_norm": 0.13649849585417867,
"learning_rate": 2.386246908248883e-05,
"loss": 0.9228,
"step": 1511
},
{
"epoch": 3.867007672634271,
"grad_norm": 0.13210578639270845,
"learning_rate": 2.3760762684319508e-05,
"loss": 0.9094,
"step": 1512
},
{
"epoch": 3.869565217391304,
"grad_norm": 0.14259288669579517,
"learning_rate": 2.3659235679927016e-05,
"loss": 0.9351,
"step": 1513
},
{
"epoch": 3.8721227621483374,
"grad_norm": 0.1388101682540646,
"learning_rate": 2.3557888393165627e-05,
"loss": 0.9454,
"step": 1514
},
{
"epoch": 3.874680306905371,
"grad_norm": 0.12901592134412895,
"learning_rate": 2.345672114731624e-05,
"loss": 0.9481,
"step": 1515
},
{
"epoch": 3.877237851662404,
"grad_norm": 0.13894304934030247,
"learning_rate": 2.335573426508547e-05,
"loss": 0.9583,
"step": 1516
},
{
"epoch": 3.8797953964194374,
"grad_norm": 0.1370325882290817,
"learning_rate": 2.325492806860462e-05,
"loss": 0.9799,
"step": 1517
},
{
"epoch": 3.8823529411764706,
"grad_norm": 0.13421409804749201,
"learning_rate": 2.315430287942862e-05,
"loss": 0.9533,
"step": 1518
},
{
"epoch": 3.8849104859335037,
"grad_norm": 0.13298313283238028,
"learning_rate": 2.3053859018535026e-05,
"loss": 0.9709,
"step": 1519
},
{
"epoch": 3.887468030690537,
"grad_norm": 0.1361450777437208,
"learning_rate": 2.295359680632295e-05,
"loss": 0.9615,
"step": 1520
},
{
"epoch": 3.89002557544757,
"grad_norm": 0.1486100399377403,
"learning_rate": 2.2853516562612173e-05,
"loss": 0.9376,
"step": 1521
},
{
"epoch": 3.8925831202046037,
"grad_norm": 0.13690524401965368,
"learning_rate": 2.2753618606641928e-05,
"loss": 0.9092,
"step": 1522
},
{
"epoch": 3.895140664961637,
"grad_norm": 0.15669583951357616,
"learning_rate": 2.2653903257070012e-05,
"loss": 0.9443,
"step": 1523
},
{
"epoch": 3.89769820971867,
"grad_norm": 0.12931778250099024,
"learning_rate": 2.2554370831971743e-05,
"loss": 0.9406,
"step": 1524
},
{
"epoch": 3.9002557544757033,
"grad_norm": 0.17258200785982056,
"learning_rate": 2.2455021648838935e-05,
"loss": 0.9614,
"step": 1525
},
{
"epoch": 3.9028132992327365,
"grad_norm": 0.1521157336174598,
"learning_rate": 2.235585602457891e-05,
"loss": 0.9487,
"step": 1526
},
{
"epoch": 3.90537084398977,
"grad_norm": 0.14390268768179504,
"learning_rate": 2.225687427551341e-05,
"loss": 0.9401,
"step": 1527
},
{
"epoch": 3.907928388746803,
"grad_norm": 0.16337966447000044,
"learning_rate": 2.2158076717377765e-05,
"loss": 0.9536,
"step": 1528
},
{
"epoch": 3.9104859335038364,
"grad_norm": 0.15324748802477992,
"learning_rate": 2.2059463665319623e-05,
"loss": 0.9198,
"step": 1529
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.14907378875032545,
"learning_rate": 2.196103543389815e-05,
"loss": 0.9481,
"step": 1530
},
{
"epoch": 3.915601023017903,
"grad_norm": 0.14207939797213323,
"learning_rate": 2.1862792337083017e-05,
"loss": 0.9387,
"step": 1531
},
{
"epoch": 3.918158567774936,
"grad_norm": 0.13959510597089575,
"learning_rate": 2.176473468825328e-05,
"loss": 0.9536,
"step": 1532
},
{
"epoch": 3.920716112531969,
"grad_norm": 0.14016454333503284,
"learning_rate": 2.1666862800196454e-05,
"loss": 0.9491,
"step": 1533
},
{
"epoch": 3.923273657289003,
"grad_norm": 0.14885818803453518,
"learning_rate": 2.1569176985107535e-05,
"loss": 0.9612,
"step": 1534
},
{
"epoch": 3.9258312020460355,
"grad_norm": 0.14403866973582788,
"learning_rate": 2.1471677554587958e-05,
"loss": 0.9511,
"step": 1535
},
{
"epoch": 3.928388746803069,
"grad_norm": 0.13223516573639468,
"learning_rate": 2.1374364819644623e-05,
"loss": 0.9373,
"step": 1536
},
{
"epoch": 3.9309462915601023,
"grad_norm": 0.14036184466315108,
"learning_rate": 2.1277239090688894e-05,
"loss": 0.9353,
"step": 1537
},
{
"epoch": 3.9335038363171355,
"grad_norm": 0.1396968491520172,
"learning_rate": 2.1180300677535655e-05,
"loss": 0.9531,
"step": 1538
},
{
"epoch": 3.9360613810741687,
"grad_norm": 0.13659743962984422,
"learning_rate": 2.108354988940228e-05,
"loss": 0.936,
"step": 1539
},
{
"epoch": 3.938618925831202,
"grad_norm": 0.1508626854215839,
"learning_rate": 2.0986987034907554e-05,
"loss": 0.9452,
"step": 1540
},
{
"epoch": 3.9411764705882355,
"grad_norm": 0.14129695624224084,
"learning_rate": 2.089061242207092e-05,
"loss": 0.9369,
"step": 1541
},
{
"epoch": 3.9437340153452687,
"grad_norm": 0.1428765331179949,
"learning_rate": 2.0794426358311294e-05,
"loss": 0.9142,
"step": 1542
},
{
"epoch": 3.946291560102302,
"grad_norm": 0.1330347524331098,
"learning_rate": 2.069842915044614e-05,
"loss": 0.9381,
"step": 1543
},
{
"epoch": 3.948849104859335,
"grad_norm": 0.14069953111767788,
"learning_rate": 2.0602621104690517e-05,
"loss": 0.921,
"step": 1544
},
{
"epoch": 3.9514066496163682,
"grad_norm": 0.1456949051715094,
"learning_rate": 2.050700252665615e-05,
"loss": 0.9549,
"step": 1545
},
{
"epoch": 3.9539641943734014,
"grad_norm": 0.13746866783044756,
"learning_rate": 2.041157372135028e-05,
"loss": 0.9287,
"step": 1546
},
{
"epoch": 3.9565217391304346,
"grad_norm": 0.15606889468360874,
"learning_rate": 2.0316334993174856e-05,
"loss": 0.9555,
"step": 1547
},
{
"epoch": 3.959079283887468,
"grad_norm": 0.14118323164397703,
"learning_rate": 2.0221286645925558e-05,
"loss": 0.9343,
"step": 1548
},
{
"epoch": 3.9616368286445014,
"grad_norm": 0.1363380304979579,
"learning_rate": 2.012642898279074e-05,
"loss": 0.9961,
"step": 1549
},
{
"epoch": 3.9641943734015346,
"grad_norm": 0.14317404024733354,
"learning_rate": 2.003176230635049e-05,
"loss": 0.9647,
"step": 1550
},
{
"epoch": 3.9667519181585678,
"grad_norm": 0.14674699824614082,
"learning_rate": 1.9937286918575713e-05,
"loss": 0.9541,
"step": 1551
},
{
"epoch": 3.969309462915601,
"grad_norm": 0.1392728526341487,
"learning_rate": 1.984300312082711e-05,
"loss": 0.9549,
"step": 1552
},
{
"epoch": 3.971867007672634,
"grad_norm": 0.1388687318173855,
"learning_rate": 1.9748911213854267e-05,
"loss": 0.9538,
"step": 1553
},
{
"epoch": 3.9744245524296673,
"grad_norm": 0.13901730161036177,
"learning_rate": 1.9655011497794616e-05,
"loss": 0.9426,
"step": 1554
},
{
"epoch": 3.976982097186701,
"grad_norm": 0.13747089636524243,
"learning_rate": 1.9561304272172644e-05,
"loss": 0.9639,
"step": 1555
},
{
"epoch": 3.979539641943734,
"grad_norm": 0.1395863657318075,
"learning_rate": 1.946778983589873e-05,
"loss": 0.9733,
"step": 1556
},
{
"epoch": 3.9820971867007673,
"grad_norm": 0.1388892460599247,
"learning_rate": 1.9374468487268254e-05,
"loss": 0.944,
"step": 1557
},
{
"epoch": 3.9846547314578005,
"grad_norm": 0.1542426182338673,
"learning_rate": 1.9281340523960806e-05,
"loss": 0.9575,
"step": 1558
},
{
"epoch": 3.9872122762148337,
"grad_norm": 0.14702194394411322,
"learning_rate": 1.9188406243039015e-05,
"loss": 0.939,
"step": 1559
},
{
"epoch": 3.9897698209718673,
"grad_norm": 0.15088719580788107,
"learning_rate": 1.9095665940947717e-05,
"loss": 0.9523,
"step": 1560
},
{
"epoch": 3.9923273657289,
"grad_norm": 0.13979637370531914,
"learning_rate": 1.9003119913512992e-05,
"loss": 0.9518,
"step": 1561
},
{
"epoch": 3.9948849104859336,
"grad_norm": 0.13293457854923818,
"learning_rate": 1.891076845594122e-05,
"loss": 0.966,
"step": 1562
},
{
"epoch": 3.997442455242967,
"grad_norm": 0.1330659091048459,
"learning_rate": 1.881861186281813e-05,
"loss": 0.9425,
"step": 1563
},
{
"epoch": 4.0,
"grad_norm": 0.15532958865697588,
"learning_rate": 1.872665042810784e-05,
"loss": 0.9491,
"step": 1564
},
{
"epoch": 4.002557544757034,
"grad_norm": 0.172134213325208,
"learning_rate": 1.863488444515203e-05,
"loss": 0.9131,
"step": 1565
},
{
"epoch": 4.005115089514066,
"grad_norm": 0.15705142364202992,
"learning_rate": 1.854331420666882e-05,
"loss": 0.9254,
"step": 1566
},
{
"epoch": 4.0076726342711,
"grad_norm": 0.16319791463669756,
"learning_rate": 1.845194000475199e-05,
"loss": 0.9005,
"step": 1567
},
{
"epoch": 4.010230179028133,
"grad_norm": 0.16550445546270565,
"learning_rate": 1.836076213087e-05,
"loss": 0.9177,
"step": 1568
},
{
"epoch": 4.012787723785166,
"grad_norm": 0.17000604940332,
"learning_rate": 1.826978087586502e-05,
"loss": 0.9288,
"step": 1569
},
{
"epoch": 4.015345268542199,
"grad_norm": 0.17439370178321326,
"learning_rate": 1.8178996529952088e-05,
"loss": 0.9302,
"step": 1570
},
{
"epoch": 4.017902813299233,
"grad_norm": 0.16621808084873166,
"learning_rate": 1.808840938271807e-05,
"loss": 0.9277,
"step": 1571
},
{
"epoch": 4.020460358056266,
"grad_norm": 0.1502855048809297,
"learning_rate": 1.799801972312092e-05,
"loss": 0.9146,
"step": 1572
},
{
"epoch": 4.023017902813299,
"grad_norm": 0.15792591947199125,
"learning_rate": 1.7907827839488474e-05,
"loss": 0.9175,
"step": 1573
},
{
"epoch": 4.025575447570333,
"grad_norm": 0.1563775392864349,
"learning_rate": 1.7817834019517805e-05,
"loss": 0.9128,
"step": 1574
},
{
"epoch": 4.028132992327365,
"grad_norm": 0.14597718440990778,
"learning_rate": 1.7728038550274193e-05,
"loss": 0.9185,
"step": 1575
},
{
"epoch": 4.030690537084399,
"grad_norm": 0.1569564550463153,
"learning_rate": 1.7638441718190192e-05,
"loss": 0.9296,
"step": 1576
},
{
"epoch": 4.033248081841432,
"grad_norm": 0.15089755959303894,
"learning_rate": 1.7549043809064697e-05,
"loss": 0.9011,
"step": 1577
},
{
"epoch": 4.035805626598465,
"grad_norm": 0.14320940233490406,
"learning_rate": 1.74598451080622e-05,
"loss": 0.9301,
"step": 1578
},
{
"epoch": 4.038363171355499,
"grad_norm": 0.1640364740345872,
"learning_rate": 1.737084589971157e-05,
"loss": 0.9294,
"step": 1579
},
{
"epoch": 4.040920716112532,
"grad_norm": 0.15372462860199906,
"learning_rate": 1.728204646790544e-05,
"loss": 0.9464,
"step": 1580
},
{
"epoch": 4.043478260869565,
"grad_norm": 0.14792763942080298,
"learning_rate": 1.7193447095899206e-05,
"loss": 0.9224,
"step": 1581
},
{
"epoch": 4.046035805626598,
"grad_norm": 0.13951058738523123,
"learning_rate": 1.710504806631005e-05,
"loss": 0.9087,
"step": 1582
},
{
"epoch": 4.048593350383632,
"grad_norm": 0.13260882878617228,
"learning_rate": 1.701684966111615e-05,
"loss": 0.9036,
"step": 1583
},
{
"epoch": 4.051150895140665,
"grad_norm": 0.14125256658288957,
"learning_rate": 1.6928852161655616e-05,
"loss": 0.92,
"step": 1584
},
{
"epoch": 4.053708439897698,
"grad_norm": 0.13237438231494236,
"learning_rate": 1.684105584862584e-05,
"loss": 0.9156,
"step": 1585
},
{
"epoch": 4.056265984654732,
"grad_norm": 0.1359119819403516,
"learning_rate": 1.6753461002082395e-05,
"loss": 0.9554,
"step": 1586
},
{
"epoch": 4.0588235294117645,
"grad_norm": 0.136943228077222,
"learning_rate": 1.6666067901438178e-05,
"loss": 0.8844,
"step": 1587
},
{
"epoch": 4.061381074168798,
"grad_norm": 0.14746043096646916,
"learning_rate": 1.657887682546264e-05,
"loss": 0.9091,
"step": 1588
},
{
"epoch": 4.063938618925831,
"grad_norm": 0.13289891251117492,
"learning_rate": 1.649188805228076e-05,
"loss": 0.9462,
"step": 1589
},
{
"epoch": 4.0664961636828645,
"grad_norm": 0.14117852752538673,
"learning_rate": 1.6405101859372123e-05,
"loss": 0.9153,
"step": 1590
},
{
"epoch": 4.069053708439898,
"grad_norm": 0.12613455462183037,
"learning_rate": 1.631851852357026e-05,
"loss": 0.9519,
"step": 1591
},
{
"epoch": 4.071611253196931,
"grad_norm": 0.1396860703236042,
"learning_rate": 1.6232138321061544e-05,
"loss": 0.9412,
"step": 1592
},
{
"epoch": 4.0741687979539645,
"grad_norm": 0.1360638603818121,
"learning_rate": 1.6145961527384395e-05,
"loss": 0.9517,
"step": 1593
},
{
"epoch": 4.076726342710997,
"grad_norm": 0.1324923155606263,
"learning_rate": 1.6059988417428396e-05,
"loss": 0.9513,
"step": 1594
},
{
"epoch": 4.079283887468031,
"grad_norm": 0.14265745538296148,
"learning_rate": 1.5974219265433406e-05,
"loss": 0.9154,
"step": 1595
},
{
"epoch": 4.081841432225064,
"grad_norm": 0.14492559140570338,
"learning_rate": 1.58886543449887e-05,
"loss": 0.9394,
"step": 1596
},
{
"epoch": 4.084398976982097,
"grad_norm": 0.12579546842676975,
"learning_rate": 1.5803293929032078e-05,
"loss": 0.9281,
"step": 1597
},
{
"epoch": 4.086956521739131,
"grad_norm": 0.14549537683931857,
"learning_rate": 1.5718138289849055e-05,
"loss": 0.8957,
"step": 1598
},
{
"epoch": 4.089514066496164,
"grad_norm": 0.14813650458162753,
"learning_rate": 1.563318769907187e-05,
"loss": 0.9004,
"step": 1599
},
{
"epoch": 4.092071611253197,
"grad_norm": 0.12523568970989923,
"learning_rate": 1.554844242767872e-05,
"loss": 0.9311,
"step": 1600
},
{
"epoch": 4.09462915601023,
"grad_norm": 0.13296174952051867,
"learning_rate": 1.546390274599289e-05,
"loss": 0.9256,
"step": 1601
},
{
"epoch": 4.0971867007672635,
"grad_norm": 0.12809367590620266,
"learning_rate": 1.5379568923681833e-05,
"loss": 0.9136,
"step": 1602
},
{
"epoch": 4.099744245524296,
"grad_norm": 0.13109260024902633,
"learning_rate": 1.5295441229756364e-05,
"loss": 0.9007,
"step": 1603
},
{
"epoch": 4.10230179028133,
"grad_norm": 0.12407094954940708,
"learning_rate": 1.521151993256977e-05,
"loss": 0.9406,
"step": 1604
},
{
"epoch": 4.1048593350383635,
"grad_norm": 0.1298161922376652,
"learning_rate": 1.5127805299817025e-05,
"loss": 0.9264,
"step": 1605
},
{
"epoch": 4.107416879795396,
"grad_norm": 0.1481163518427539,
"learning_rate": 1.5044297598533777e-05,
"loss": 0.9285,
"step": 1606
},
{
"epoch": 4.10997442455243,
"grad_norm": 0.12078740228639545,
"learning_rate": 1.496099709509565e-05,
"loss": 0.9078,
"step": 1607
},
{
"epoch": 4.112531969309463,
"grad_norm": 0.13027908099413282,
"learning_rate": 1.4877904055217376e-05,
"loss": 0.9149,
"step": 1608
},
{
"epoch": 4.115089514066496,
"grad_norm": 0.1468019204651356,
"learning_rate": 1.4795018743951857e-05,
"loss": 0.9304,
"step": 1609
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.1349316946630024,
"learning_rate": 1.4712341425689406e-05,
"loss": 0.926,
"step": 1610
},
{
"epoch": 4.120204603580563,
"grad_norm": 0.1228754724620514,
"learning_rate": 1.4629872364156854e-05,
"loss": 0.9185,
"step": 1611
},
{
"epoch": 4.122762148337596,
"grad_norm": 0.14313419206388078,
"learning_rate": 1.4547611822416748e-05,
"loss": 0.9126,
"step": 1612
},
{
"epoch": 4.125319693094629,
"grad_norm": 0.14531581013669995,
"learning_rate": 1.446556006286648e-05,
"loss": 0.9372,
"step": 1613
},
{
"epoch": 4.127877237851663,
"grad_norm": 0.12636103579388067,
"learning_rate": 1.4383717347237425e-05,
"loss": 0.9255,
"step": 1614
},
{
"epoch": 4.130434782608695,
"grad_norm": 0.13484501378576969,
"learning_rate": 1.4302083936594247e-05,
"loss": 0.9267,
"step": 1615
},
{
"epoch": 4.132992327365729,
"grad_norm": 0.1306495047012211,
"learning_rate": 1.4220660091333875e-05,
"loss": 0.9237,
"step": 1616
},
{
"epoch": 4.135549872122763,
"grad_norm": 0.12979097348457122,
"learning_rate": 1.4139446071184737e-05,
"loss": 0.9197,
"step": 1617
},
{
"epoch": 4.138107416879795,
"grad_norm": 0.13739201337062779,
"learning_rate": 1.405844213520604e-05,
"loss": 0.9197,
"step": 1618
},
{
"epoch": 4.140664961636829,
"grad_norm": 0.1294644982423319,
"learning_rate": 1.3977648541786804e-05,
"loss": 0.896,
"step": 1619
},
{
"epoch": 4.143222506393862,
"grad_norm": 0.12588348274914363,
"learning_rate": 1.3897065548645104e-05,
"loss": 0.9453,
"step": 1620
},
{
"epoch": 4.145780051150895,
"grad_norm": 0.15398362387202247,
"learning_rate": 1.381669341282721e-05,
"loss": 0.9317,
"step": 1621
},
{
"epoch": 4.148337595907928,
"grad_norm": 0.13197721364304257,
"learning_rate": 1.3736532390706878e-05,
"loss": 0.9279,
"step": 1622
},
{
"epoch": 4.150895140664962,
"grad_norm": 0.12322044737512756,
"learning_rate": 1.3656582737984318e-05,
"loss": 0.9439,
"step": 1623
},
{
"epoch": 4.153452685421995,
"grad_norm": 0.12440470950789576,
"learning_rate": 1.3576844709685583e-05,
"loss": 0.9088,
"step": 1624
},
{
"epoch": 4.156010230179028,
"grad_norm": 0.12465116010990127,
"learning_rate": 1.3497318560161704e-05,
"loss": 0.9211,
"step": 1625
},
{
"epoch": 4.158567774936062,
"grad_norm": 0.13358086347052778,
"learning_rate": 1.3418004543087792e-05,
"loss": 0.9312,
"step": 1626
},
{
"epoch": 4.161125319693094,
"grad_norm": 0.1224560124714394,
"learning_rate": 1.3338902911462336e-05,
"loss": 0.9253,
"step": 1627
},
{
"epoch": 4.163682864450128,
"grad_norm": 0.12240140914681184,
"learning_rate": 1.3260013917606319e-05,
"loss": 0.9383,
"step": 1628
},
{
"epoch": 4.166240409207161,
"grad_norm": 0.12945740752464988,
"learning_rate": 1.318133781316247e-05,
"loss": 0.9416,
"step": 1629
},
{
"epoch": 4.168797953964194,
"grad_norm": 0.13087100044291045,
"learning_rate": 1.3102874849094414e-05,
"loss": 0.9316,
"step": 1630
},
{
"epoch": 4.171355498721228,
"grad_norm": 0.14189296661844325,
"learning_rate": 1.3024625275685891e-05,
"loss": 0.9465,
"step": 1631
},
{
"epoch": 4.173913043478261,
"grad_norm": 0.1297951759919457,
"learning_rate": 1.2946589342540023e-05,
"loss": 0.9275,
"step": 1632
},
{
"epoch": 4.176470588235294,
"grad_norm": 0.11911786087772278,
"learning_rate": 1.2868767298578395e-05,
"loss": 0.9225,
"step": 1633
},
{
"epoch": 4.179028132992327,
"grad_norm": 0.12225398214034955,
"learning_rate": 1.2791159392040275e-05,
"loss": 0.9196,
"step": 1634
},
{
"epoch": 4.181585677749361,
"grad_norm": 0.1310216078232746,
"learning_rate": 1.2713765870481995e-05,
"loss": 0.9353,
"step": 1635
},
{
"epoch": 4.1841432225063935,
"grad_norm": 0.12742055135018454,
"learning_rate": 1.2636586980775945e-05,
"loss": 0.9666,
"step": 1636
},
{
"epoch": 4.186700767263427,
"grad_norm": 0.12384487664186089,
"learning_rate": 1.2559622969109886e-05,
"loss": 0.9209,
"step": 1637
},
{
"epoch": 4.189258312020461,
"grad_norm": 0.1340544434519516,
"learning_rate": 1.2482874080986176e-05,
"loss": 0.9377,
"step": 1638
},
{
"epoch": 4.1918158567774935,
"grad_norm": 0.13746772119236356,
"learning_rate": 1.2406340561220947e-05,
"loss": 0.9207,
"step": 1639
},
{
"epoch": 4.194373401534527,
"grad_norm": 0.1280603990954687,
"learning_rate": 1.2330022653943358e-05,
"loss": 0.914,
"step": 1640
},
{
"epoch": 4.19693094629156,
"grad_norm": 0.12374468420399631,
"learning_rate": 1.2253920602594759e-05,
"loss": 0.8923,
"step": 1641
},
{
"epoch": 4.1994884910485935,
"grad_norm": 0.12384342114389504,
"learning_rate": 1.2178034649928034e-05,
"loss": 0.9396,
"step": 1642
},
{
"epoch": 4.202046035805626,
"grad_norm": 0.1230247461338335,
"learning_rate": 1.2102365038006672e-05,
"loss": 0.8981,
"step": 1643
},
{
"epoch": 4.20460358056266,
"grad_norm": 0.12441020446608941,
"learning_rate": 1.2026912008204117e-05,
"loss": 0.9395,
"step": 1644
},
{
"epoch": 4.207161125319693,
"grad_norm": 0.1207928603043833,
"learning_rate": 1.195167580120292e-05,
"loss": 0.9257,
"step": 1645
},
{
"epoch": 4.209718670076726,
"grad_norm": 0.12168214916803673,
"learning_rate": 1.1876656656994032e-05,
"loss": 0.907,
"step": 1646
},
{
"epoch": 4.21227621483376,
"grad_norm": 0.12409121363381591,
"learning_rate": 1.180185481487599e-05,
"loss": 0.9082,
"step": 1647
},
{
"epoch": 4.2148337595907925,
"grad_norm": 0.12218546237016087,
"learning_rate": 1.1727270513454161e-05,
"loss": 0.9207,
"step": 1648
},
{
"epoch": 4.217391304347826,
"grad_norm": 0.1373741099688316,
"learning_rate": 1.1652903990640075e-05,
"loss": 0.9041,
"step": 1649
},
{
"epoch": 4.21994884910486,
"grad_norm": 0.126043833861761,
"learning_rate": 1.1578755483650465e-05,
"loss": 0.9071,
"step": 1650
},
{
"epoch": 4.2225063938618925,
"grad_norm": 0.12907468546494064,
"learning_rate": 1.150482522900668e-05,
"loss": 0.9267,
"step": 1651
},
{
"epoch": 4.225063938618926,
"grad_norm": 0.11696490881508001,
"learning_rate": 1.1431113462533942e-05,
"loss": 0.9188,
"step": 1652
},
{
"epoch": 4.227621483375959,
"grad_norm": 0.1219772936698238,
"learning_rate": 1.1357620419360438e-05,
"loss": 0.93,
"step": 1653
},
{
"epoch": 4.2301790281329925,
"grad_norm": 0.12317189729882781,
"learning_rate": 1.128434633391673e-05,
"loss": 0.9248,
"step": 1654
},
{
"epoch": 4.232736572890025,
"grad_norm": 0.12135967777000363,
"learning_rate": 1.121129143993489e-05,
"loss": 0.9482,
"step": 1655
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.12569146595438008,
"learning_rate": 1.1138455970447857e-05,
"loss": 0.9237,
"step": 1656
},
{
"epoch": 4.2378516624040925,
"grad_norm": 0.12009749843054457,
"learning_rate": 1.1065840157788599e-05,
"loss": 0.9117,
"step": 1657
},
{
"epoch": 4.240409207161125,
"grad_norm": 0.12262206120182582,
"learning_rate": 1.099344423358943e-05,
"loss": 0.944,
"step": 1658
},
{
"epoch": 4.242966751918159,
"grad_norm": 0.12739673009436395,
"learning_rate": 1.0921268428781277e-05,
"loss": 0.928,
"step": 1659
},
{
"epoch": 4.245524296675192,
"grad_norm": 0.12049563257356445,
"learning_rate": 1.084931297359293e-05,
"loss": 0.9307,
"step": 1660
},
{
"epoch": 4.248081841432225,
"grad_norm": 0.1268732696430339,
"learning_rate": 1.0777578097550206e-05,
"loss": 0.938,
"step": 1661
},
{
"epoch": 4.250639386189258,
"grad_norm": 0.1302689278877736,
"learning_rate": 1.0706064029475436e-05,
"loss": 0.9339,
"step": 1662
},
{
"epoch": 4.253196930946292,
"grad_norm": 0.1207622169109695,
"learning_rate": 1.0634770997486546e-05,
"loss": 0.9153,
"step": 1663
},
{
"epoch": 4.255754475703325,
"grad_norm": 0.11706181174774555,
"learning_rate": 1.0563699228996405e-05,
"loss": 0.9129,
"step": 1664
},
{
"epoch": 4.258312020460358,
"grad_norm": 0.11849875702011481,
"learning_rate": 1.0492848950712067e-05,
"loss": 0.9183,
"step": 1665
},
{
"epoch": 4.260869565217392,
"grad_norm": 0.12286048694545573,
"learning_rate": 1.0422220388634145e-05,
"loss": 0.9194,
"step": 1666
},
{
"epoch": 4.263427109974424,
"grad_norm": 0.12106155524848677,
"learning_rate": 1.03518137680559e-05,
"loss": 0.93,
"step": 1667
},
{
"epoch": 4.265984654731458,
"grad_norm": 0.11931612070623257,
"learning_rate": 1.0281629313562704e-05,
"loss": 0.8812,
"step": 1668
},
{
"epoch": 4.268542199488491,
"grad_norm": 0.12412002218869622,
"learning_rate": 1.0211667249031278e-05,
"loss": 0.9211,
"step": 1669
},
{
"epoch": 4.271099744245524,
"grad_norm": 0.11050129272365039,
"learning_rate": 1.0141927797628913e-05,
"loss": 0.9346,
"step": 1670
},
{
"epoch": 4.273657289002558,
"grad_norm": 0.11696142916514798,
"learning_rate": 1.0072411181812805e-05,
"loss": 0.9103,
"step": 1671
},
{
"epoch": 4.276214833759591,
"grad_norm": 0.12523114611535077,
"learning_rate": 1.0003117623329373e-05,
"loss": 0.9188,
"step": 1672
},
{
"epoch": 4.278772378516624,
"grad_norm": 0.1211246626009557,
"learning_rate": 9.934047343213468e-06,
"loss": 0.8779,
"step": 1673
},
{
"epoch": 4.281329923273657,
"grad_norm": 0.11896385138151676,
"learning_rate": 9.865200561787779e-06,
"loss": 0.916,
"step": 1674
},
{
"epoch": 4.283887468030691,
"grad_norm": 0.12907351319734606,
"learning_rate": 9.796577498662017e-06,
"loss": 0.9316,
"step": 1675
},
{
"epoch": 4.286445012787723,
"grad_norm": 0.1175024733129538,
"learning_rate": 9.728178372732323e-06,
"loss": 0.9175,
"step": 1676
},
{
"epoch": 4.289002557544757,
"grad_norm": 0.11765409328640529,
"learning_rate": 9.660003402180495e-06,
"loss": 0.9322,
"step": 1677
},
{
"epoch": 4.291560102301791,
"grad_norm": 0.11606048414482627,
"learning_rate": 9.592052804473248e-06,
"loss": 0.9338,
"step": 1678
},
{
"epoch": 4.294117647058823,
"grad_norm": 0.12217997194310143,
"learning_rate": 9.524326796361704e-06,
"loss": 0.9198,
"step": 1679
},
{
"epoch": 4.296675191815857,
"grad_norm": 0.13681552209998984,
"learning_rate": 9.456825593880502e-06,
"loss": 0.9381,
"step": 1680
},
{
"epoch": 4.29923273657289,
"grad_norm": 0.11707040245774833,
"learning_rate": 9.389549412347204e-06,
"loss": 0.9114,
"step": 1681
},
{
"epoch": 4.301790281329923,
"grad_norm": 0.11739134713610266,
"learning_rate": 9.322498466361574e-06,
"loss": 0.9564,
"step": 1682
},
{
"epoch": 4.304347826086957,
"grad_norm": 0.11490889884017837,
"learning_rate": 9.25567296980499e-06,
"loss": 0.9372,
"step": 1683
},
{
"epoch": 4.30690537084399,
"grad_norm": 0.13548343430667473,
"learning_rate": 9.18907313583958e-06,
"loss": 0.9571,
"step": 1684
},
{
"epoch": 4.309462915601023,
"grad_norm": 0.1169879093609689,
"learning_rate": 9.122699176907699e-06,
"loss": 0.91,
"step": 1685
},
{
"epoch": 4.312020460358056,
"grad_norm": 0.12181883918771313,
"learning_rate": 9.056551304731216e-06,
"loss": 0.9403,
"step": 1686
},
{
"epoch": 4.31457800511509,
"grad_norm": 0.11516301601447926,
"learning_rate": 8.990629730310787e-06,
"loss": 0.9045,
"step": 1687
},
{
"epoch": 4.3171355498721224,
"grad_norm": 0.1130886469711019,
"learning_rate": 8.924934663925228e-06,
"loss": 0.9005,
"step": 1688
},
{
"epoch": 4.319693094629156,
"grad_norm": 0.12056683149234801,
"learning_rate": 8.859466315130833e-06,
"loss": 0.905,
"step": 1689
},
{
"epoch": 4.322250639386189,
"grad_norm": 0.12131053610936289,
"learning_rate": 8.794224892760694e-06,
"loss": 0.964,
"step": 1690
},
{
"epoch": 4.324808184143222,
"grad_norm": 0.11072666373506544,
"learning_rate": 8.729210604924075e-06,
"loss": 0.9168,
"step": 1691
},
{
"epoch": 4.327365728900256,
"grad_norm": 0.11419375138008123,
"learning_rate": 8.66442365900566e-06,
"loss": 0.9155,
"step": 1692
},
{
"epoch": 4.329923273657289,
"grad_norm": 0.11067325544749756,
"learning_rate": 8.599864261665032e-06,
"loss": 0.929,
"step": 1693
},
{
"epoch": 4.332480818414322,
"grad_norm": 0.13119769270640452,
"learning_rate": 8.535532618835894e-06,
"loss": 0.9196,
"step": 1694
},
{
"epoch": 4.335038363171355,
"grad_norm": 0.12122259309350006,
"learning_rate": 8.471428935725394e-06,
"loss": 0.9097,
"step": 1695
},
{
"epoch": 4.337595907928389,
"grad_norm": 0.1186567073290791,
"learning_rate": 8.407553416813621e-06,
"loss": 0.9486,
"step": 1696
},
{
"epoch": 4.340153452685422,
"grad_norm": 0.13863787273855152,
"learning_rate": 8.343906265852806e-06,
"loss": 0.9194,
"step": 1697
},
{
"epoch": 4.342710997442455,
"grad_norm": 0.11736813648606277,
"learning_rate": 8.280487685866707e-06,
"loss": 0.8964,
"step": 1698
},
{
"epoch": 4.345268542199489,
"grad_norm": 0.11874382513666652,
"learning_rate": 8.217297879150065e-06,
"loss": 0.9305,
"step": 1699
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.12096917615982158,
"learning_rate": 8.154337047267763e-06,
"loss": 0.926,
"step": 1700
},
{
"epoch": 4.350383631713555,
"grad_norm": 0.12459874607610563,
"learning_rate": 8.091605391054354e-06,
"loss": 0.8922,
"step": 1701
},
{
"epoch": 4.352941176470588,
"grad_norm": 0.12221739613538536,
"learning_rate": 8.02910311061333e-06,
"loss": 0.9401,
"step": 1702
},
{
"epoch": 4.3554987212276215,
"grad_norm": 0.12254645629749011,
"learning_rate": 7.966830405316561e-06,
"loss": 0.9547,
"step": 1703
},
{
"epoch": 4.358056265984655,
"grad_norm": 0.12001133797508247,
"learning_rate": 7.90478747380357e-06,
"loss": 0.9103,
"step": 1704
},
{
"epoch": 4.360613810741688,
"grad_norm": 0.12199519070925526,
"learning_rate": 7.842974513980946e-06,
"loss": 0.9271,
"step": 1705
},
{
"epoch": 4.3631713554987215,
"grad_norm": 0.11295241635294967,
"learning_rate": 7.781391723021711e-06,
"loss": 0.9363,
"step": 1706
},
{
"epoch": 4.365728900255754,
"grad_norm": 0.12686526411244078,
"learning_rate": 7.720039297364681e-06,
"loss": 0.9274,
"step": 1707
},
{
"epoch": 4.368286445012788,
"grad_norm": 0.1333081116381865,
"learning_rate": 7.658917432713839e-06,
"loss": 0.9172,
"step": 1708
},
{
"epoch": 4.370843989769821,
"grad_norm": 0.12577470275328256,
"learning_rate": 7.598026324037762e-06,
"loss": 0.939,
"step": 1709
},
{
"epoch": 4.373401534526854,
"grad_norm": 0.12345544691397578,
"learning_rate": 7.537366165568909e-06,
"loss": 0.9288,
"step": 1710
},
{
"epoch": 4.375959079283888,
"grad_norm": 0.11948532376497799,
"learning_rate": 7.476937150803025e-06,
"loss": 0.9497,
"step": 1711
},
{
"epoch": 4.378516624040921,
"grad_norm": 0.12876903997603817,
"learning_rate": 7.416739472498613e-06,
"loss": 0.9479,
"step": 1712
},
{
"epoch": 4.381074168797954,
"grad_norm": 0.11529385831506739,
"learning_rate": 7.356773322676205e-06,
"loss": 0.9158,
"step": 1713
},
{
"epoch": 4.383631713554987,
"grad_norm": 0.11078825541988917,
"learning_rate": 7.2970388926178045e-06,
"loss": 0.937,
"step": 1714
},
{
"epoch": 4.3861892583120206,
"grad_norm": 0.11173435690628004,
"learning_rate": 7.237536372866247e-06,
"loss": 0.9327,
"step": 1715
},
{
"epoch": 4.388746803069053,
"grad_norm": 0.1223612229123131,
"learning_rate": 7.178265953224701e-06,
"loss": 0.9227,
"step": 1716
},
{
"epoch": 4.391304347826087,
"grad_norm": 0.12507251852936713,
"learning_rate": 7.119227822755843e-06,
"loss": 0.9571,
"step": 1717
},
{
"epoch": 4.3938618925831205,
"grad_norm": 0.11397092222799754,
"learning_rate": 7.060422169781467e-06,
"loss": 0.9041,
"step": 1718
},
{
"epoch": 4.396419437340153,
"grad_norm": 0.10753667090584995,
"learning_rate": 7.001849181881808e-06,
"loss": 0.9166,
"step": 1719
},
{
"epoch": 4.398976982097187,
"grad_norm": 0.12054572854799732,
"learning_rate": 6.943509045894905e-06,
"loss": 0.9341,
"step": 1720
},
{
"epoch": 4.40153452685422,
"grad_norm": 0.11185867845020742,
"learning_rate": 6.885401947916048e-06,
"loss": 0.9514,
"step": 1721
},
{
"epoch": 4.404092071611253,
"grad_norm": 0.11085335077105966,
"learning_rate": 6.827528073297185e-06,
"loss": 0.9382,
"step": 1722
},
{
"epoch": 4.406649616368286,
"grad_norm": 0.11479224410155166,
"learning_rate": 6.769887606646306e-06,
"loss": 0.9414,
"step": 1723
},
{
"epoch": 4.40920716112532,
"grad_norm": 0.11417555802279347,
"learning_rate": 6.712480731826878e-06,
"loss": 0.912,
"step": 1724
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.11413292812828428,
"learning_rate": 6.6553076319572394e-06,
"loss": 0.9268,
"step": 1725
},
{
"epoch": 4.414322250639386,
"grad_norm": 0.10996848327532169,
"learning_rate": 6.59836848941005e-06,
"loss": 0.9253,
"step": 1726
},
{
"epoch": 4.41687979539642,
"grad_norm": 0.12150368369219573,
"learning_rate": 6.541663485811667e-06,
"loss": 0.915,
"step": 1727
},
{
"epoch": 4.419437340153452,
"grad_norm": 0.11980533715997778,
"learning_rate": 6.485192802041553e-06,
"loss": 0.9156,
"step": 1728
},
{
"epoch": 4.421994884910486,
"grad_norm": 0.11392894414591724,
"learning_rate": 6.428956618231788e-06,
"loss": 0.9197,
"step": 1729
},
{
"epoch": 4.42455242966752,
"grad_norm": 0.11760332661995491,
"learning_rate": 6.3729551137664055e-06,
"loss": 0.9545,
"step": 1730
},
{
"epoch": 4.427109974424552,
"grad_norm": 0.10904085632244291,
"learning_rate": 6.3171884672808524e-06,
"loss": 0.9103,
"step": 1731
},
{
"epoch": 4.429667519181586,
"grad_norm": 0.10863502669554059,
"learning_rate": 6.26165685666142e-06,
"loss": 0.9016,
"step": 1732
},
{
"epoch": 4.432225063938619,
"grad_norm": 0.11509438949225145,
"learning_rate": 6.206360459044671e-06,
"loss": 0.931,
"step": 1733
},
{
"epoch": 4.434782608695652,
"grad_norm": 0.11748690634314717,
"learning_rate": 6.15129945081689e-06,
"loss": 0.9151,
"step": 1734
},
{
"epoch": 4.437340153452685,
"grad_norm": 0.11639698873895774,
"learning_rate": 6.096474007613476e-06,
"loss": 0.9365,
"step": 1735
},
{
"epoch": 4.439897698209719,
"grad_norm": 0.11159987657775047,
"learning_rate": 6.0418843043184636e-06,
"loss": 0.9552,
"step": 1736
},
{
"epoch": 4.442455242966752,
"grad_norm": 0.10952923402441073,
"learning_rate": 5.987530515063889e-06,
"loss": 0.9194,
"step": 1737
},
{
"epoch": 4.445012787723785,
"grad_norm": 0.11072771958857656,
"learning_rate": 5.933412813229256e-06,
"loss": 0.9189,
"step": 1738
},
{
"epoch": 4.447570332480819,
"grad_norm": 0.11775592911375234,
"learning_rate": 5.879531371440994e-06,
"loss": 0.9388,
"step": 1739
},
{
"epoch": 4.450127877237851,
"grad_norm": 0.11460729784468633,
"learning_rate": 5.825886361571922e-06,
"loss": 0.8945,
"step": 1740
},
{
"epoch": 4.452685421994885,
"grad_norm": 0.11581761610879335,
"learning_rate": 5.772477954740652e-06,
"loss": 0.9126,
"step": 1741
},
{
"epoch": 4.455242966751918,
"grad_norm": 0.11118413455302595,
"learning_rate": 5.719306321311075e-06,
"loss": 0.9565,
"step": 1742
},
{
"epoch": 4.457800511508951,
"grad_norm": 0.10749836975161339,
"learning_rate": 5.666371630891858e-06,
"loss": 0.9127,
"step": 1743
},
{
"epoch": 4.460358056265985,
"grad_norm": 0.10944652966346073,
"learning_rate": 5.613674052335798e-06,
"loss": 0.9184,
"step": 1744
},
{
"epoch": 4.462915601023018,
"grad_norm": 0.11540805854208941,
"learning_rate": 5.561213753739356e-06,
"loss": 0.9281,
"step": 1745
},
{
"epoch": 4.465473145780051,
"grad_norm": 0.11318814770450754,
"learning_rate": 5.5089909024421685e-06,
"loss": 0.9327,
"step": 1746
},
{
"epoch": 4.468030690537084,
"grad_norm": 0.11689654113549015,
"learning_rate": 5.4570056650263784e-06,
"loss": 0.9196,
"step": 1747
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.11410697533075874,
"learning_rate": 5.405258207316228e-06,
"loss": 0.9248,
"step": 1748
},
{
"epoch": 4.4731457800511505,
"grad_norm": 0.11032997359153394,
"learning_rate": 5.3537486943774674e-06,
"loss": 0.9278,
"step": 1749
},
{
"epoch": 4.475703324808184,
"grad_norm": 0.11362254544830364,
"learning_rate": 5.302477290516832e-06,
"loss": 0.9508,
"step": 1750
},
{
"epoch": 4.478260869565218,
"grad_norm": 0.114903272001298,
"learning_rate": 5.251444159281551e-06,
"loss": 0.9177,
"step": 1751
},
{
"epoch": 4.4808184143222505,
"grad_norm": 0.11311594662750116,
"learning_rate": 5.200649463458769e-06,
"loss": 0.9315,
"step": 1752
},
{
"epoch": 4.483375959079284,
"grad_norm": 0.1080019715192445,
"learning_rate": 5.150093365075117e-06,
"loss": 0.9423,
"step": 1753
},
{
"epoch": 4.485933503836317,
"grad_norm": 0.11099521632078349,
"learning_rate": 5.0997760253961036e-06,
"loss": 0.9432,
"step": 1754
},
{
"epoch": 4.4884910485933505,
"grad_norm": 0.1115281668793938,
"learning_rate": 5.049697604925605e-06,
"loss": 0.9201,
"step": 1755
},
{
"epoch": 4.491048593350383,
"grad_norm": 0.11559474894332394,
"learning_rate": 4.999858263405468e-06,
"loss": 0.9335,
"step": 1756
},
{
"epoch": 4.493606138107417,
"grad_norm": 0.10752469888696953,
"learning_rate": 4.9502581598148425e-06,
"loss": 0.9326,
"step": 1757
},
{
"epoch": 4.4961636828644505,
"grad_norm": 0.11823364858584975,
"learning_rate": 4.900897452369782e-06,
"loss": 0.9085,
"step": 1758
},
{
"epoch": 4.498721227621483,
"grad_norm": 0.12367303838985884,
"learning_rate": 4.851776298522692e-06,
"loss": 0.8962,
"step": 1759
},
{
"epoch": 4.501278772378517,
"grad_norm": 0.11649199224229981,
"learning_rate": 4.802894854961882e-06,
"loss": 0.945,
"step": 1760
},
{
"epoch": 4.5038363171355495,
"grad_norm": 0.10951836253938066,
"learning_rate": 4.754253277610969e-06,
"loss": 0.9362,
"step": 1761
},
{
"epoch": 4.506393861892583,
"grad_norm": 0.11824940633958814,
"learning_rate": 4.705851721628465e-06,
"loss": 0.9489,
"step": 1762
},
{
"epoch": 4.508951406649617,
"grad_norm": 0.11623129349141179,
"learning_rate": 4.6576903414072576e-06,
"loss": 0.9345,
"step": 1763
},
{
"epoch": 4.5115089514066495,
"grad_norm": 0.10609179613886349,
"learning_rate": 4.6097692905741194e-06,
"loss": 0.912,
"step": 1764
},
{
"epoch": 4.514066496163683,
"grad_norm": 0.1110236313063869,
"learning_rate": 4.562088721989178e-06,
"loss": 0.9263,
"step": 1765
},
{
"epoch": 4.516624040920716,
"grad_norm": 0.10545968825146992,
"learning_rate": 4.514648787745506e-06,
"loss": 0.9132,
"step": 1766
},
{
"epoch": 4.5191815856777495,
"grad_norm": 0.11497860724139544,
"learning_rate": 4.467449639168564e-06,
"loss": 0.9435,
"step": 1767
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.11514110122345275,
"learning_rate": 4.420491426815758e-06,
"loss": 0.9405,
"step": 1768
},
{
"epoch": 4.524296675191816,
"grad_norm": 0.1123546579246865,
"learning_rate": 4.373774300475928e-06,
"loss": 0.9013,
"step": 1769
},
{
"epoch": 4.526854219948849,
"grad_norm": 0.10434900776877028,
"learning_rate": 4.327298409168928e-06,
"loss": 0.9234,
"step": 1770
},
{
"epoch": 4.529411764705882,
"grad_norm": 0.10753377323226707,
"learning_rate": 4.281063901145102e-06,
"loss": 0.9191,
"step": 1771
},
{
"epoch": 4.531969309462916,
"grad_norm": 0.10990039699899636,
"learning_rate": 4.235070923884772e-06,
"loss": 0.9218,
"step": 1772
},
{
"epoch": 4.534526854219949,
"grad_norm": 0.10914742733757979,
"learning_rate": 4.18931962409789e-06,
"loss": 0.9109,
"step": 1773
},
{
"epoch": 4.537084398976982,
"grad_norm": 0.10959258250347798,
"learning_rate": 4.143810147723448e-06,
"loss": 0.9152,
"step": 1774
},
{
"epoch": 4.539641943734015,
"grad_norm": 0.11106116826490182,
"learning_rate": 4.098542639929086e-06,
"loss": 0.9046,
"step": 1775
},
{
"epoch": 4.542199488491049,
"grad_norm": 0.10748546841476085,
"learning_rate": 4.0535172451105785e-06,
"loss": 0.9128,
"step": 1776
},
{
"epoch": 4.544757033248082,
"grad_norm": 0.11225561412585737,
"learning_rate": 4.008734106891439e-06,
"loss": 0.929,
"step": 1777
},
{
"epoch": 4.547314578005115,
"grad_norm": 0.10831404168834766,
"learning_rate": 3.964193368122384e-06,
"loss": 0.9397,
"step": 1778
},
{
"epoch": 4.549872122762149,
"grad_norm": 0.11033594472176086,
"learning_rate": 3.919895170880938e-06,
"loss": 0.9252,
"step": 1779
},
{
"epoch": 4.552429667519181,
"grad_norm": 0.10441833953450541,
"learning_rate": 3.875839656470959e-06,
"loss": 0.9182,
"step": 1780
},
{
"epoch": 4.554987212276215,
"grad_norm": 0.11080119595164395,
"learning_rate": 3.832026965422184e-06,
"loss": 0.949,
"step": 1781
},
{
"epoch": 4.557544757033249,
"grad_norm": 0.11022335632664775,
"learning_rate": 3.788457237489773e-06,
"loss": 0.9238,
"step": 1782
},
{
"epoch": 4.560102301790281,
"grad_norm": 0.11308201432747443,
"learning_rate": 3.7451306116538867e-06,
"loss": 0.9711,
"step": 1783
},
{
"epoch": 4.562659846547315,
"grad_norm": 0.1028220418076954,
"learning_rate": 3.7020472261192253e-06,
"loss": 0.9005,
"step": 1784
},
{
"epoch": 4.565217391304348,
"grad_norm": 0.10528950924867539,
"learning_rate": 3.6592072183146043e-06,
"loss": 0.9014,
"step": 1785
},
{
"epoch": 4.567774936061381,
"grad_norm": 0.10885389205625104,
"learning_rate": 3.616610724892473e-06,
"loss": 0.9105,
"step": 1786
},
{
"epoch": 4.570332480818414,
"grad_norm": 0.10574673017545647,
"learning_rate": 3.5742578817285777e-06,
"loss": 0.9193,
"step": 1787
},
{
"epoch": 4.572890025575448,
"grad_norm": 0.1117883112559058,
"learning_rate": 3.532148823921375e-06,
"loss": 0.91,
"step": 1788
},
{
"epoch": 4.57544757033248,
"grad_norm": 0.1096961353796292,
"learning_rate": 3.490283685791722e-06,
"loss": 0.9594,
"step": 1789
},
{
"epoch": 4.578005115089514,
"grad_norm": 0.11161221492802147,
"learning_rate": 3.4486626008824575e-06,
"loss": 0.9327,
"step": 1790
},
{
"epoch": 4.580562659846548,
"grad_norm": 0.10744759992585007,
"learning_rate": 3.4072857019578787e-06,
"loss": 0.9219,
"step": 1791
},
{
"epoch": 4.58312020460358,
"grad_norm": 0.10620450789029019,
"learning_rate": 3.3661531210033684e-06,
"loss": 0.9256,
"step": 1792
},
{
"epoch": 4.585677749360614,
"grad_norm": 0.11017512262461532,
"learning_rate": 3.3252649892250123e-06,
"loss": 0.9188,
"step": 1793
},
{
"epoch": 4.588235294117647,
"grad_norm": 0.10649203584062787,
"learning_rate": 3.2846214370491114e-06,
"loss": 0.9286,
"step": 1794
},
{
"epoch": 4.59079283887468,
"grad_norm": 0.10775649571843056,
"learning_rate": 3.2442225941218175e-06,
"loss": 0.91,
"step": 1795
},
{
"epoch": 4.593350383631714,
"grad_norm": 0.10474409566182012,
"learning_rate": 3.20406858930868e-06,
"loss": 0.9187,
"step": 1796
},
{
"epoch": 4.595907928388747,
"grad_norm": 0.10901379780591824,
"learning_rate": 3.164159550694299e-06,
"loss": 0.9268,
"step": 1797
},
{
"epoch": 4.59846547314578,
"grad_norm": 0.10466246579829651,
"learning_rate": 3.12449560558183e-06,
"loss": 0.9045,
"step": 1798
},
{
"epoch": 4.601023017902813,
"grad_norm": 0.10734422633494305,
"learning_rate": 3.085076880492608e-06,
"loss": 0.9131,
"step": 1799
},
{
"epoch": 4.603580562659847,
"grad_norm": 0.1102245685075459,
"learning_rate": 3.045903501165821e-06,
"loss": 0.9456,
"step": 1800
},
{
"epoch": 4.6061381074168795,
"grad_norm": 0.10268613459994491,
"learning_rate": 3.0069755925579945e-06,
"loss": 0.9068,
"step": 1801
},
{
"epoch": 4.608695652173913,
"grad_norm": 0.1041191008417218,
"learning_rate": 2.9682932788426622e-06,
"loss": 0.8961,
"step": 1802
},
{
"epoch": 4.611253196930946,
"grad_norm": 0.10864214050559602,
"learning_rate": 2.9298566834099307e-06,
"loss": 0.9196,
"step": 1803
},
{
"epoch": 4.6138107416879794,
"grad_norm": 0.10289987799334356,
"learning_rate": 2.891665928866152e-06,
"loss": 0.8891,
"step": 1804
},
{
"epoch": 4.616368286445013,
"grad_norm": 0.10627932552480018,
"learning_rate": 2.853721137033425e-06,
"loss": 0.9309,
"step": 1805
},
{
"epoch": 4.618925831202046,
"grad_norm": 0.10976448315029629,
"learning_rate": 2.816022428949303e-06,
"loss": 0.8956,
"step": 1806
},
{
"epoch": 4.621483375959079,
"grad_norm": 0.10383428088111558,
"learning_rate": 2.7785699248663946e-06,
"loss": 0.9245,
"step": 1807
},
{
"epoch": 4.624040920716112,
"grad_norm": 0.10746935820829795,
"learning_rate": 2.741363744251917e-06,
"loss": 0.9641,
"step": 1808
},
{
"epoch": 4.626598465473146,
"grad_norm": 0.1077084422715649,
"learning_rate": 2.70440400578738e-06,
"loss": 0.936,
"step": 1809
},
{
"epoch": 4.629156010230179,
"grad_norm": 0.10619050887196295,
"learning_rate": 2.6676908273681745e-06,
"loss": 0.9236,
"step": 1810
},
{
"epoch": 4.631713554987212,
"grad_norm": 0.09868786010783248,
"learning_rate": 2.63122432610321e-06,
"loss": 0.9235,
"step": 1811
},
{
"epoch": 4.634271099744246,
"grad_norm": 0.10946907000550939,
"learning_rate": 2.5950046183145315e-06,
"loss": 0.9477,
"step": 1812
},
{
"epoch": 4.6368286445012785,
"grad_norm": 0.10911271296863308,
"learning_rate": 2.559031819536966e-06,
"loss": 0.8923,
"step": 1813
},
{
"epoch": 4.639386189258312,
"grad_norm": 0.1057852003057491,
"learning_rate": 2.523306044517737e-06,
"loss": 0.9575,
"step": 1814
},
{
"epoch": 4.641943734015345,
"grad_norm": 0.10597129201414962,
"learning_rate": 2.4878274072161147e-06,
"loss": 0.9478,
"step": 1815
},
{
"epoch": 4.6445012787723785,
"grad_norm": 0.10530345780753828,
"learning_rate": 2.4525960208029843e-06,
"loss": 0.9468,
"step": 1816
},
{
"epoch": 4.647058823529412,
"grad_norm": 0.11128520568838593,
"learning_rate": 2.417611997660636e-06,
"loss": 0.9441,
"step": 1817
},
{
"epoch": 4.649616368286445,
"grad_norm": 0.10763480468498407,
"learning_rate": 2.3828754493822315e-06,
"loss": 0.9342,
"step": 1818
},
{
"epoch": 4.6521739130434785,
"grad_norm": 0.10157629367738297,
"learning_rate": 2.348386486771572e-06,
"loss": 0.9121,
"step": 1819
},
{
"epoch": 4.654731457800511,
"grad_norm": 0.10471609831813257,
"learning_rate": 2.314145219842683e-06,
"loss": 0.8991,
"step": 1820
},
{
"epoch": 4.657289002557545,
"grad_norm": 0.10785688490272143,
"learning_rate": 2.2801517578194997e-06,
"loss": 0.9023,
"step": 1821
},
{
"epoch": 4.659846547314578,
"grad_norm": 0.10437430915631776,
"learning_rate": 2.246406209135481e-06,
"loss": 0.9526,
"step": 1822
},
{
"epoch": 4.662404092071611,
"grad_norm": 0.09976754454013415,
"learning_rate": 2.212908681433286e-06,
"loss": 0.9032,
"step": 1823
},
{
"epoch": 4.664961636828645,
"grad_norm": 0.10687421431181417,
"learning_rate": 2.179659281564446e-06,
"loss": 0.9164,
"step": 1824
},
{
"epoch": 4.667519181585678,
"grad_norm": 0.10095706529924005,
"learning_rate": 2.146658115589002e-06,
"loss": 0.9191,
"step": 1825
},
{
"epoch": 4.670076726342711,
"grad_norm": 0.10132269971777201,
"learning_rate": 2.113905288775149e-06,
"loss": 0.9155,
"step": 1826
},
{
"epoch": 4.672634271099744,
"grad_norm": 0.10307251320208077,
"learning_rate": 2.0814009055989403e-06,
"loss": 0.9165,
"step": 1827
},
{
"epoch": 4.675191815856778,
"grad_norm": 0.10286096825987698,
"learning_rate": 2.0491450697439362e-06,
"loss": 0.9101,
"step": 1828
},
{
"epoch": 4.677749360613811,
"grad_norm": 0.11262366728295894,
"learning_rate": 2.017137884100855e-06,
"loss": 0.914,
"step": 1829
},
{
"epoch": 4.680306905370844,
"grad_norm": 0.11116962011162274,
"learning_rate": 1.9853794507672885e-06,
"loss": 0.9376,
"step": 1830
},
{
"epoch": 4.6828644501278776,
"grad_norm": 0.1040833044448223,
"learning_rate": 1.9538698710473404e-06,
"loss": 0.9236,
"step": 1831
},
{
"epoch": 4.68542199488491,
"grad_norm": 0.10541970140434043,
"learning_rate": 1.9226092454512945e-06,
"loss": 0.9449,
"step": 1832
},
{
"epoch": 4.687979539641944,
"grad_norm": 0.10066677117893352,
"learning_rate": 1.8915976736953157e-06,
"loss": 0.9138,
"step": 1833
},
{
"epoch": 4.690537084398977,
"grad_norm": 0.10836258727940289,
"learning_rate": 1.8608352547011722e-06,
"loss": 0.9687,
"step": 1834
},
{
"epoch": 4.69309462915601,
"grad_norm": 0.11074221672096896,
"learning_rate": 1.8303220865958194e-06,
"loss": 0.9331,
"step": 1835
},
{
"epoch": 4.695652173913043,
"grad_norm": 0.10768331106543749,
"learning_rate": 1.8000582667111777e-06,
"loss": 0.945,
"step": 1836
},
{
"epoch": 4.698209718670077,
"grad_norm": 0.11098771435258944,
"learning_rate": 1.7700438915837858e-06,
"loss": 0.9284,
"step": 1837
},
{
"epoch": 4.70076726342711,
"grad_norm": 0.10799063090442731,
"learning_rate": 1.7402790569544813e-06,
"loss": 0.9,
"step": 1838
},
{
"epoch": 4.703324808184143,
"grad_norm": 0.1063256441527157,
"learning_rate": 1.7107638577681073e-06,
"loss": 0.8962,
"step": 1839
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.1040346093959911,
"learning_rate": 1.681498388173246e-06,
"loss": 0.9516,
"step": 1840
},
{
"epoch": 4.708439897698209,
"grad_norm": 0.10335093559260676,
"learning_rate": 1.652482741521837e-06,
"loss": 0.9131,
"step": 1841
},
{
"epoch": 4.710997442455243,
"grad_norm": 0.10497266871186595,
"learning_rate": 1.6237170103689547e-06,
"loss": 0.9119,
"step": 1842
},
{
"epoch": 4.713554987212277,
"grad_norm": 0.09874397507531227,
"learning_rate": 1.5952012864724898e-06,
"loss": 0.9141,
"step": 1843
},
{
"epoch": 4.716112531969309,
"grad_norm": 0.10588059236614217,
"learning_rate": 1.5669356607928188e-06,
"loss": 0.9331,
"step": 1844
},
{
"epoch": 4.718670076726343,
"grad_norm": 0.10070088788493103,
"learning_rate": 1.5389202234925837e-06,
"loss": 0.929,
"step": 1845
},
{
"epoch": 4.721227621483376,
"grad_norm": 0.10575607673396381,
"learning_rate": 1.5111550639363447e-06,
"loss": 0.9195,
"step": 1846
},
{
"epoch": 4.723785166240409,
"grad_norm": 0.1052143724728097,
"learning_rate": 1.483640270690332e-06,
"loss": 0.9236,
"step": 1847
},
{
"epoch": 4.726342710997442,
"grad_norm": 0.10525748489261051,
"learning_rate": 1.4563759315221515e-06,
"loss": 0.9515,
"step": 1848
},
{
"epoch": 4.728900255754476,
"grad_norm": 0.10259868287875906,
"learning_rate": 1.4293621334004581e-06,
"loss": 0.9522,
"step": 1849
},
{
"epoch": 4.731457800511509,
"grad_norm": 0.10136041128342929,
"learning_rate": 1.4025989624947856e-06,
"loss": 0.9207,
"step": 1850
},
{
"epoch": 4.734015345268542,
"grad_norm": 0.09781638687367422,
"learning_rate": 1.3760865041751736e-06,
"loss": 0.9226,
"step": 1851
},
{
"epoch": 4.736572890025576,
"grad_norm": 0.10175570288516775,
"learning_rate": 1.3498248430119465e-06,
"loss": 0.9141,
"step": 1852
},
{
"epoch": 4.739130434782608,
"grad_norm": 0.10920419786681472,
"learning_rate": 1.3238140627754014e-06,
"loss": 0.9544,
"step": 1853
},
{
"epoch": 4.741687979539642,
"grad_norm": 0.10426566657693524,
"learning_rate": 1.2980542464355962e-06,
"loss": 0.9492,
"step": 1854
},
{
"epoch": 4.744245524296675,
"grad_norm": 0.10161986714655702,
"learning_rate": 1.272545476162037e-06,
"loss": 0.9253,
"step": 1855
},
{
"epoch": 4.746803069053708,
"grad_norm": 0.10568474804520346,
"learning_rate": 1.2472878333234407e-06,
"loss": 0.895,
"step": 1856
},
{
"epoch": 4.749360613810742,
"grad_norm": 0.10079844884131213,
"learning_rate": 1.2222813984874749e-06,
"loss": 0.9146,
"step": 1857
},
{
"epoch": 4.751918158567775,
"grad_norm": 0.09772653572503225,
"learning_rate": 1.197526251420502e-06,
"loss": 0.9434,
"step": 1858
},
{
"epoch": 4.754475703324808,
"grad_norm": 0.10521061309223152,
"learning_rate": 1.1730224710872862e-06,
"loss": 0.917,
"step": 1859
},
{
"epoch": 4.757033248081841,
"grad_norm": 0.10102811382690155,
"learning_rate": 1.148770135650814e-06,
"loss": 0.9402,
"step": 1860
},
{
"epoch": 4.759590792838875,
"grad_norm": 0.10184925109076563,
"learning_rate": 1.1247693224719768e-06,
"loss": 0.9341,
"step": 1861
},
{
"epoch": 4.762148337595908,
"grad_norm": 0.10416605640976224,
"learning_rate": 1.1010201081093653e-06,
"loss": 0.9258,
"step": 1862
},
{
"epoch": 4.764705882352941,
"grad_norm": 0.10242702305319981,
"learning_rate": 1.0775225683190027e-06,
"loss": 0.9401,
"step": 1863
},
{
"epoch": 4.767263427109975,
"grad_norm": 0.1054355472195325,
"learning_rate": 1.0542767780541242e-06,
"loss": 0.9452,
"step": 1864
},
{
"epoch": 4.7698209718670075,
"grad_norm": 0.09850748287302327,
"learning_rate": 1.0312828114649175e-06,
"loss": 0.9147,
"step": 1865
},
{
"epoch": 4.772378516624041,
"grad_norm": 0.10426914175715249,
"learning_rate": 1.008540741898285e-06,
"loss": 0.9364,
"step": 1866
},
{
"epoch": 4.774936061381074,
"grad_norm": 0.10421190980413071,
"learning_rate": 9.860506418976556e-07,
"loss": 0.9155,
"step": 1867
},
{
"epoch": 4.7774936061381075,
"grad_norm": 0.09974968560728949,
"learning_rate": 9.638125832026658e-07,
"loss": 0.9164,
"step": 1868
},
{
"epoch": 4.78005115089514,
"grad_norm": 0.10323506252287525,
"learning_rate": 9.418266367490347e-07,
"loss": 0.9294,
"step": 1869
},
{
"epoch": 4.782608695652174,
"grad_norm": 0.10057988567304277,
"learning_rate": 9.200928726682456e-07,
"loss": 0.9198,
"step": 1870
},
{
"epoch": 4.7851662404092075,
"grad_norm": 0.10109533674227822,
"learning_rate": 8.986113602873758e-07,
"loss": 0.9696,
"step": 1871
},
{
"epoch": 4.78772378516624,
"grad_norm": 0.10248654252247842,
"learning_rate": 8.773821681288752e-07,
"loss": 0.9059,
"step": 1872
},
{
"epoch": 4.790281329923274,
"grad_norm": 0.10623698814695832,
"learning_rate": 8.564053639103087e-07,
"loss": 0.9104,
"step": 1873
},
{
"epoch": 4.792838874680307,
"grad_norm": 0.10184589368398628,
"learning_rate": 8.356810145441874e-07,
"loss": 0.8999,
"step": 1874
},
{
"epoch": 4.79539641943734,
"grad_norm": 0.09973933906653507,
"learning_rate": 8.152091861377198e-07,
"loss": 0.9281,
"step": 1875
},
{
"epoch": 4.797953964194374,
"grad_norm": 0.0965602895068992,
"learning_rate": 7.949899439926345e-07,
"loss": 0.8972,
"step": 1876
},
{
"epoch": 4.8005115089514065,
"grad_norm": 0.09817984542309073,
"learning_rate": 7.750233526049222e-07,
"loss": 0.9374,
"step": 1877
},
{
"epoch": 4.80306905370844,
"grad_norm": 0.10767556941660049,
"learning_rate": 7.553094756646761e-07,
"loss": 0.922,
"step": 1878
},
{
"epoch": 4.805626598465473,
"grad_norm": 0.09968854723854502,
"learning_rate": 7.358483760558877e-07,
"loss": 0.9092,
"step": 1879
},
{
"epoch": 4.8081841432225065,
"grad_norm": 0.10013368895859236,
"learning_rate": 7.166401158561886e-07,
"loss": 0.9053,
"step": 1880
},
{
"epoch": 4.810741687979539,
"grad_norm": 0.10050188953527933,
"learning_rate": 6.976847563367539e-07,
"loss": 0.9342,
"step": 1881
},
{
"epoch": 4.813299232736573,
"grad_norm": 0.10572001540704473,
"learning_rate": 6.789823579619992e-07,
"loss": 0.9055,
"step": 1882
},
{
"epoch": 4.8158567774936065,
"grad_norm": 0.0958884248641111,
"learning_rate": 6.605329803894389e-07,
"loss": 0.8971,
"step": 1883
},
{
"epoch": 4.818414322250639,
"grad_norm": 0.10042711105691594,
"learning_rate": 6.423366824695265e-07,
"loss": 0.9176,
"step": 1884
},
{
"epoch": 4.820971867007673,
"grad_norm": 0.10511225981510647,
"learning_rate": 6.243935222454145e-07,
"loss": 0.9176,
"step": 1885
},
{
"epoch": 4.823529411764706,
"grad_norm": 0.09696941259664335,
"learning_rate": 6.067035569527768e-07,
"loss": 0.9336,
"step": 1886
},
{
"epoch": 4.826086956521739,
"grad_norm": 0.09743670957958701,
"learning_rate": 5.89266843019658e-07,
"loss": 0.9335,
"step": 1887
},
{
"epoch": 4.828644501278772,
"grad_norm": 0.10334868098940422,
"learning_rate": 5.720834360662597e-07,
"loss": 0.9302,
"step": 1888
},
{
"epoch": 4.831202046035806,
"grad_norm": 0.10567530011947436,
"learning_rate": 5.551533909047812e-07,
"loss": 0.9173,
"step": 1889
},
{
"epoch": 4.833759590792839,
"grad_norm": 0.10109569243664909,
"learning_rate": 5.384767615392328e-07,
"loss": 0.8973,
"step": 1890
},
{
"epoch": 4.836317135549872,
"grad_norm": 0.10107099176370515,
"learning_rate": 5.220536011652933e-07,
"loss": 0.9327,
"step": 1891
},
{
"epoch": 4.838874680306906,
"grad_norm": 0.09592817542499839,
"learning_rate": 5.058839621700973e-07,
"loss": 0.8986,
"step": 1892
},
{
"epoch": 4.841432225063938,
"grad_norm": 0.10402134439975212,
"learning_rate": 4.899678961320842e-07,
"loss": 0.8783,
"step": 1893
},
{
"epoch": 4.843989769820972,
"grad_norm": 0.09879349396951775,
"learning_rate": 4.743054538208558e-07,
"loss": 0.9265,
"step": 1894
},
{
"epoch": 4.846547314578006,
"grad_norm": 0.10801219003494308,
"learning_rate": 4.5889668519698117e-07,
"loss": 0.917,
"step": 1895
},
{
"epoch": 4.849104859335038,
"grad_norm": 0.10336628048777474,
"learning_rate": 4.437416394118721e-07,
"loss": 0.9475,
"step": 1896
},
{
"epoch": 4.851662404092072,
"grad_norm": 0.09915519846574018,
"learning_rate": 4.2884036480757896e-07,
"loss": 0.9136,
"step": 1897
},
{
"epoch": 4.854219948849105,
"grad_norm": 0.10488853611936978,
"learning_rate": 4.1419290891669293e-07,
"loss": 0.9276,
"step": 1898
},
{
"epoch": 4.856777493606138,
"grad_norm": 0.10257283710076046,
"learning_rate": 3.997993184621418e-07,
"loss": 0.9584,
"step": 1899
},
{
"epoch": 4.859335038363171,
"grad_norm": 0.10288770850501508,
"learning_rate": 3.856596393570744e-07,
"loss": 0.9128,
"step": 1900
},
{
"epoch": 4.861892583120205,
"grad_norm": 0.09729119851077626,
"learning_rate": 3.717739167047185e-07,
"loss": 0.912,
"step": 1901
},
{
"epoch": 4.864450127877237,
"grad_norm": 0.1024901619430387,
"learning_rate": 3.581421947982122e-07,
"loss": 0.9166,
"step": 1902
},
{
"epoch": 4.867007672634271,
"grad_norm": 0.10281823220549692,
"learning_rate": 3.447645171204528e-07,
"loss": 0.9308,
"step": 1903
},
{
"epoch": 4.869565217391305,
"grad_norm": 0.1014220238267167,
"learning_rate": 3.316409263440168e-07,
"loss": 0.9401,
"step": 1904
},
{
"epoch": 4.872122762148337,
"grad_norm": 0.10082233886495114,
"learning_rate": 3.1877146433095584e-07,
"loss": 0.9349,
"step": 1905
},
{
"epoch": 4.874680306905371,
"grad_norm": 0.09966232794121334,
"learning_rate": 3.0615617213271664e-07,
"loss": 0.9218,
"step": 1906
},
{
"epoch": 4.877237851662404,
"grad_norm": 0.09941244859685047,
"learning_rate": 2.937950899899633e-07,
"loss": 0.9278,
"step": 1907
},
{
"epoch": 4.879795396419437,
"grad_norm": 0.09951897237383148,
"learning_rate": 2.816882573324886e-07,
"loss": 0.949,
"step": 1908
},
{
"epoch": 4.882352941176471,
"grad_norm": 0.10401741016384587,
"learning_rate": 2.6983571277907184e-07,
"loss": 0.9563,
"step": 1909
},
{
"epoch": 4.884910485933504,
"grad_norm": 0.09725714975876674,
"learning_rate": 2.582374941373456e-07,
"loss": 0.9211,
"step": 1910
},
{
"epoch": 4.887468030690537,
"grad_norm": 0.10133318561817573,
"learning_rate": 2.468936384036891e-07,
"loss": 0.9013,
"step": 1911
},
{
"epoch": 4.89002557544757,
"grad_norm": 0.10119524228199774,
"learning_rate": 2.3580418176311293e-07,
"loss": 0.9417,
"step": 1912
},
{
"epoch": 4.892583120204604,
"grad_norm": 0.09951712783614965,
"learning_rate": 2.2496915958913458e-07,
"loss": 0.9253,
"step": 1913
},
{
"epoch": 4.8951406649616365,
"grad_norm": 0.0988058097334845,
"learning_rate": 2.143886064436629e-07,
"loss": 0.9344,
"step": 1914
},
{
"epoch": 4.89769820971867,
"grad_norm": 0.0988533205503812,
"learning_rate": 2.0406255607688274e-07,
"loss": 0.9258,
"step": 1915
},
{
"epoch": 4.900255754475703,
"grad_norm": 0.09899535759420186,
"learning_rate": 1.9399104142719283e-07,
"loss": 0.9484,
"step": 1916
},
{
"epoch": 4.9028132992327365,
"grad_norm": 0.10153569163687459,
"learning_rate": 1.8417409462102798e-07,
"loss": 0.9073,
"step": 1917
},
{
"epoch": 4.90537084398977,
"grad_norm": 0.09957601677253938,
"learning_rate": 1.746117469728148e-07,
"loss": 0.8841,
"step": 1918
},
{
"epoch": 4.907928388746803,
"grad_norm": 0.10184723073884586,
"learning_rate": 1.6530402898484733e-07,
"loss": 0.9525,
"step": 1919
},
{
"epoch": 4.910485933503836,
"grad_norm": 0.09694091907819868,
"learning_rate": 1.5625097034719815e-07,
"loss": 0.9193,
"step": 1920
},
{
"epoch": 4.913043478260869,
"grad_norm": 0.10383046531826044,
"learning_rate": 1.474525999375942e-07,
"loss": 0.9339,
"step": 1921
},
{
"epoch": 4.915601023017903,
"grad_norm": 0.09727962611523398,
"learning_rate": 1.3890894582138103e-07,
"loss": 0.9271,
"step": 1922
},
{
"epoch": 4.918158567774936,
"grad_norm": 0.10045856203495888,
"learning_rate": 1.3062003525138089e-07,
"loss": 0.9129,
"step": 1923
},
{
"epoch": 4.920716112531969,
"grad_norm": 0.09953247096750498,
"learning_rate": 1.225858946678393e-07,
"loss": 0.9149,
"step": 1924
},
{
"epoch": 4.923273657289003,
"grad_norm": 0.10381806462155738,
"learning_rate": 1.1480654969833638e-07,
"loss": 0.9473,
"step": 1925
},
{
"epoch": 4.9258312020460355,
"grad_norm": 0.09951540982333777,
"learning_rate": 1.0728202515766228e-07,
"loss": 0.9452,
"step": 1926
},
{
"epoch": 4.928388746803069,
"grad_norm": 0.09714908717583805,
"learning_rate": 1.0001234504779966e-07,
"loss": 0.9478,
"step": 1927
},
{
"epoch": 4.930946291560103,
"grad_norm": 0.10355673013634514,
"learning_rate": 9.299753255781696e-08,
"loss": 0.9113,
"step": 1928
},
{
"epoch": 4.9335038363171355,
"grad_norm": 0.1010600576834511,
"learning_rate": 8.623761006379738e-08,
"loss": 0.9322,
"step": 1929
},
{
"epoch": 4.936061381074169,
"grad_norm": 0.09937740112494577,
"learning_rate": 7.973259912875897e-08,
"loss": 0.9529,
"step": 1930
},
{
"epoch": 4.938618925831202,
"grad_norm": 0.10172138015837517,
"learning_rate": 7.348252050261018e-08,
"loss": 0.9516,
"step": 1931
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.10153203845561144,
"learning_rate": 6.748739412205218e-08,
"loss": 0.9327,
"step": 1932
},
{
"epoch": 4.943734015345268,
"grad_norm": 0.09630467933849142,
"learning_rate": 6.174723911053449e-08,
"loss": 0.9033,
"step": 1933
},
{
"epoch": 4.946291560102302,
"grad_norm": 0.09792982830145779,
"learning_rate": 5.6262073778192705e-08,
"loss": 0.9289,
"step": 1934
},
{
"epoch": 4.948849104859335,
"grad_norm": 0.10137971801200332,
"learning_rate": 5.1031915621795325e-08,
"loss": 0.9127,
"step": 1935
},
{
"epoch": 4.951406649616368,
"grad_norm": 0.09867014858433792,
"learning_rate": 4.605678132467262e-08,
"loss": 0.9195,
"step": 1936
},
{
"epoch": 4.953964194373402,
"grad_norm": 0.09945447399480298,
"learning_rate": 4.133668675666336e-08,
"loss": 0.9235,
"step": 1937
},
{
"epoch": 4.956521739130435,
"grad_norm": 0.09740241154451518,
"learning_rate": 3.687164697408818e-08,
"loss": 0.8983,
"step": 1938
},
{
"epoch": 4.959079283887468,
"grad_norm": 0.10216904139394242,
"learning_rate": 3.266167621967853e-08,
"loss": 0.9333,
"step": 1939
},
{
"epoch": 4.961636828644501,
"grad_norm": 1.7447830173428402,
"learning_rate": 2.8706787922541112e-08,
"loss": 0.9677,
"step": 1940
},
{
"epoch": 4.964194373401535,
"grad_norm": 0.10248140850999501,
"learning_rate": 2.5006994698095754e-08,
"loss": 0.9205,
"step": 1941
},
{
"epoch": 4.966751918158568,
"grad_norm": 0.10291780599089813,
"learning_rate": 2.156230834808426e-08,
"loss": 0.9314,
"step": 1942
},
{
"epoch": 4.969309462915601,
"grad_norm": 0.09792527077121264,
"learning_rate": 1.837273986046384e-08,
"loss": 0.9289,
"step": 1943
},
{
"epoch": 4.971867007672635,
"grad_norm": 0.0960164691356107,
"learning_rate": 1.5438299409433755e-08,
"loss": 0.9013,
"step": 1944
},
{
"epoch": 4.974424552429667,
"grad_norm": 0.09979959822032446,
"learning_rate": 1.2758996355373144e-08,
"loss": 0.9203,
"step": 1945
},
{
"epoch": 4.976982097186701,
"grad_norm": 0.10827315260460384,
"learning_rate": 1.0334839244805495e-08,
"loss": 0.9541,
"step": 1946
},
{
"epoch": 4.979539641943734,
"grad_norm": 0.0988359933652592,
"learning_rate": 8.165835810389766e-09,
"loss": 0.9064,
"step": 1947
},
{
"epoch": 4.982097186700767,
"grad_norm": 0.09820054319763678,
"learning_rate": 6.251992970875975e-09,
"loss": 0.9214,
"step": 1948
},
{
"epoch": 4.9846547314578,
"grad_norm": 0.10015641951197356,
"learning_rate": 4.5933168311140805e-09,
"loss": 0.9461,
"step": 1949
},
{
"epoch": 4.987212276214834,
"grad_norm": 0.10040227257081992,
"learning_rate": 3.1898126820006924e-09,
"loss": 0.9465,
"step": 1950
},
{
"epoch": 4.989769820971867,
"grad_norm": 0.09609050872126598,
"learning_rate": 2.041485000479071e-09,
"loss": 0.9108,
"step": 1951
},
{
"epoch": 4.9923273657289,
"grad_norm": 0.09913441529294063,
"learning_rate": 1.148337449521364e-09,
"loss": 0.9356,
"step": 1952
},
{
"epoch": 4.994884910485934,
"grad_norm": 0.09800757849537761,
"learning_rate": 5.103728781197248e-10,
"loss": 0.9002,
"step": 1953
},
{
"epoch": 4.997442455242966,
"grad_norm": 0.09827002033578132,
"learning_rate": 1.275933212774305e-10,
"loss": 0.9081,
"step": 1954
},
{
"epoch": 5.0,
"grad_norm": 0.10411131044626397,
"learning_rate": 0.0,
"loss": 0.9254,
"step": 1955
},
{
"epoch": 5.0,
"step": 1955,
"total_flos": 7122204608430080.0,
"train_loss": 1.0036099467436066,
"train_runtime": 36219.8634,
"train_samples_per_second": 13.805,
"train_steps_per_second": 0.054
}
],
"logging_steps": 1.0,
"max_steps": 1955,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7122204608430080.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}